unicodeobject.c revision ec233e58038b222ec4cedc07ec46bed1f40468d7
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python.  This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters.  End
17 * of string is given by the length attribute.  However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl  Created
23 * 1999-01-24 fl  Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl  Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl  Moved declarations to separate file, etc.
26 * 1999-06-13 fl  Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl  Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "unicodeobject.h"
68#include "ucnhash.h"
69
70#ifdef MS_WIN32
71#include <windows.h>
72#endif
73
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Keep-alive threshold for objects parked on the free list.

   When an object is put on the free list, its character buffer is
   released only if the object's length is at or above this limit;
   shorter objects keep their buffer so it can be reused by the next
   allocation without another malloc() round trip.

   In the worst case this leaves about MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory allocated.

   A limit of 0 disables the optimization (every length is >= 0, so
   every buffer is released).

   Note: this feature is experimental. If Unicode objects cause core
   dumps, switch it off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection: big endian when WORDS_BIGENDIAN is defined,
   little endian otherwise. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
104
105/* --- Globals ------------------------------------------------------------
106
107   The globals are initialized by the _PyUnicode_Init() API and should
108   not be used before calling that API.
109
110*/
111
112/* The empty Unicode object */
113static PyUnicodeObject *unicode_empty;
114
115/* Free list for Unicode objects */
116static PyUnicodeObject *unicode_freelist;
117static int unicode_freelist_size;
118
119/* Default encoding to use and assume when NULL is passed as encoding
120   parameter; it is initialized by _PyUnicode_Init().
121
122   Always use the PyUnicode_SetDefaultEncoding() and
123   PyUnicode_GetDefaultEncoding() APIs to access this global.
124
125*/
126
127static char unicode_default_encoding[100];
128
129/* --- Unicode Object ----------------------------------------------------- */
130
131static
132int _PyUnicode_Resize(register PyUnicodeObject *unicode,
133                      int length)
134{
135    void *oldstr;
136
137    /* Shortcut if there's nothing much to do. */
138    if (unicode->length == length)
139	goto reset;
140
141    /* Resizing unicode_empty is not allowed. */
142    if (unicode == unicode_empty) {
143        PyErr_SetString(PyExc_SystemError,
144                        "can't resize empty unicode object");
145        return -1;
146    }
147
148    /* We allocate one more byte to make sure the string is
149       Ux0000 terminated -- XXX is this needed ? */
150    oldstr = unicode->str;
151    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
152    if (!unicode->str) {
153	unicode->str = oldstr;
154        PyErr_NoMemory();
155        return -1;
156    }
157    unicode->str[length] = 0;
158    unicode->length = length;
159
160 reset:
161    /* Reset the object caches */
162    if (unicode->defenc) {
163        Py_DECREF(unicode->defenc);
164        unicode->defenc = NULL;
165    }
166    unicode->hash = -1;
167
168    return 0;
169}
170
171int PyUnicode_Resize(PyObject **unicode,
172		     int length)
173{
174    PyUnicodeObject *v;
175
176    if (unicode == NULL) {
177	PyErr_BadInternalCall();
178	return -1;
179    }
180    v = (PyUnicodeObject *)*unicode;
181    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
182	PyErr_BadInternalCall();
183	return -1;
184    }
185    return _PyUnicode_Resize(v, length);
186}
187
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is U+0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
195
196static
197PyUnicodeObject *_PyUnicode_New(int length)
198{
199    register PyUnicodeObject *unicode;
200
201    /* Optimization for empty strings */
202    if (length == 0 && unicode_empty != NULL) {
203        Py_INCREF(unicode_empty);
204        return unicode_empty;
205    }
206
207    /* Unicode freelist & memory allocation */
208    if (unicode_freelist) {
209        unicode = unicode_freelist;
210        unicode_freelist = *(PyUnicodeObject **)unicode;
211        unicode_freelist_size--;
212	if (unicode->str) {
213	    /* Keep-Alive optimization: we only upsize the buffer,
214	       never downsize it. */
215	    if ((unicode->length < length) &&
216		_PyUnicode_Resize(unicode, length)) {
217		PyMem_DEL(unicode->str);
218		goto onError;
219	    }
220	}
221      else {
222	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
223      }
224      PyObject_INIT(unicode, &PyUnicode_Type);
225    }
226    else {
227        unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
228        if (unicode == NULL)
229            return NULL;
230	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
231    }
232
233    if (!unicode->str) {
234	PyErr_NoMemory();
235	goto onError;
236    }
237    unicode->str[length] = 0;
238    unicode->length = length;
239    unicode->hash = -1;
240    unicode->defenc = NULL;
241    return unicode;
242
243 onError:
244    _Py_ForgetReference((PyObject *)unicode);
245    PyObject_DEL(unicode);
246    return NULL;
247}
248
249static
250void _PyUnicode_Free(register PyUnicodeObject *unicode)
251{
252    if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
253        /* Keep-Alive optimization */
254	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
255	    PyMem_DEL(unicode->str);
256	    unicode->str = NULL;
257	    unicode->length = 0;
258	}
259	if (unicode->defenc) {
260	    Py_DECREF(unicode->defenc);
261	    unicode->defenc = NULL;
262	}
263	/* Add to free list */
264        *(PyUnicodeObject **)unicode = unicode_freelist;
265        unicode_freelist = unicode;
266        unicode_freelist_size++;
267    }
268    else {
269	PyMem_DEL(unicode->str);
270	Py_XDECREF(unicode->defenc);
271	PyObject_DEL(unicode);
272    }
273}
274
275PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
276				int size)
277{
278    PyUnicodeObject *unicode;
279
280    unicode = _PyUnicode_New(size);
281    if (!unicode)
282        return NULL;
283
284    /* Copy the Unicode data into the new object */
285    if (u != NULL)
286	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
287
288    return (PyObject *)unicode;
289}
290
291#ifdef HAVE_WCHAR_H
292
293PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
294				 int size)
295{
296    PyUnicodeObject *unicode;
297
298    if (w == NULL) {
299	PyErr_BadInternalCall();
300	return NULL;
301    }
302
303    unicode = _PyUnicode_New(size);
304    if (!unicode)
305        return NULL;
306
307    /* Copy the wchar_t data into the new object */
308#ifdef HAVE_USABLE_WCHAR_T
309    memcpy(unicode->str, w, size * sizeof(wchar_t));
310#else
311    {
312	register Py_UNICODE *u;
313	register int i;
314	u = PyUnicode_AS_UNICODE(unicode);
315	for (i = size; i >= 0; i--)
316	    *u++ = *w++;
317    }
318#endif
319
320    return (PyObject *)unicode;
321}
322
323int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
324			 register wchar_t *w,
325			 int size)
326{
327    if (unicode == NULL) {
328	PyErr_BadInternalCall();
329	return -1;
330    }
331    if (size > PyUnicode_GET_SIZE(unicode))
332	size = PyUnicode_GET_SIZE(unicode);
333#ifdef HAVE_USABLE_WCHAR_T
334    memcpy(w, unicode->str, size * sizeof(wchar_t));
335#else
336    {
337	register Py_UNICODE *u;
338	register int i;
339	u = PyUnicode_AS_UNICODE(unicode);
340	for (i = size; i >= 0; i--)
341	    *w++ = *u++;
342    }
343#endif
344
345    return size;
346}
347
348#endif
349
350PyObject *PyUnicode_FromObject(register PyObject *obj)
351{
352    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
353}
354
355PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
356				      const char *encoding,
357				      const char *errors)
358{
359    const char *s;
360    int len;
361    int owned = 0;
362    PyObject *v;
363
364    if (obj == NULL) {
365	PyErr_BadInternalCall();
366	return NULL;
367    }
368
369    /* Coerce object */
370    if (PyInstance_Check(obj)) {
371	PyObject *func;
372	func = PyObject_GetAttrString(obj, "__str__");
373	if (func == NULL) {
374	    PyErr_SetString(PyExc_TypeError,
375		  "coercing to Unicode: instance doesn't define __str__");
376	    return NULL;
377	}
378	obj = PyEval_CallObject(func, NULL);
379	Py_DECREF(func);
380	if (obj == NULL)
381	    return NULL;
382	owned = 1;
383    }
384    if (PyUnicode_Check(obj)) {
385	Py_INCREF(obj);
386	v = obj;
387	if (encoding) {
388	    PyErr_SetString(PyExc_TypeError,
389			    "decoding Unicode is not supported");
390	    return NULL;
391	}
392	goto done;
393    }
394    else if (PyString_Check(obj)) {
395	s = PyString_AS_STRING(obj);
396	len = PyString_GET_SIZE(obj);
397    }
398    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
399	/* Overwrite the error message with something more useful in
400	   case of a TypeError. */
401	if (PyErr_ExceptionMatches(PyExc_TypeError))
402	    PyErr_Format(PyExc_TypeError,
403			 "coercing to Unicode: need string or buffer, "
404			 "%.80s found",
405			 obj->ob_type->tp_name);
406	goto onError;
407    }
408
409    /* Convert to Unicode */
410    if (len == 0) {
411	Py_INCREF(unicode_empty);
412	v = (PyObject *)unicode_empty;
413    }
414    else
415	v = PyUnicode_Decode(s, len, encoding, errors);
416 done:
417    if (owned) {
418	Py_DECREF(obj);
419    }
420    return v;
421
422 onError:
423    if (owned) {
424	Py_DECREF(obj);
425    }
426    return NULL;
427}
428
429PyObject *PyUnicode_Decode(const char *s,
430			   int size,
431			   const char *encoding,
432			   const char *errors)
433{
434    PyObject *buffer = NULL, *unicode;
435
436    if (encoding == NULL)
437	encoding = PyUnicode_GetDefaultEncoding();
438
439    /* Shortcuts for common default encodings */
440    if (strcmp(encoding, "utf-8") == 0)
441        return PyUnicode_DecodeUTF8(s, size, errors);
442    else if (strcmp(encoding, "latin-1") == 0)
443        return PyUnicode_DecodeLatin1(s, size, errors);
444    else if (strcmp(encoding, "ascii") == 0)
445        return PyUnicode_DecodeASCII(s, size, errors);
446
447    /* Decode via the codec registry */
448    buffer = PyBuffer_FromMemory((void *)s, size);
449    if (buffer == NULL)
450        goto onError;
451    unicode = PyCodec_Decode(buffer, encoding, errors);
452    if (unicode == NULL)
453        goto onError;
454    if (!PyUnicode_Check(unicode)) {
455        PyErr_Format(PyExc_TypeError,
456                     "decoder did not return an unicode object (type=%.400s)",
457                     unicode->ob_type->tp_name);
458        Py_DECREF(unicode);
459        goto onError;
460    }
461    Py_DECREF(buffer);
462    return unicode;
463
464 onError:
465    Py_XDECREF(buffer);
466    return NULL;
467}
468
469PyObject *PyUnicode_Encode(const Py_UNICODE *s,
470			   int size,
471			   const char *encoding,
472			   const char *errors)
473{
474    PyObject *v, *unicode;
475
476    unicode = PyUnicode_FromUnicode(s, size);
477    if (unicode == NULL)
478	return NULL;
479    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
480    Py_DECREF(unicode);
481    return v;
482}
483
484PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
485                                    const char *encoding,
486                                    const char *errors)
487{
488    PyObject *v;
489
490    if (!PyUnicode_Check(unicode)) {
491        PyErr_BadArgument();
492        goto onError;
493    }
494
495    if (encoding == NULL)
496	encoding = PyUnicode_GetDefaultEncoding();
497
498    /* Shortcuts for common default encodings */
499    if (errors == NULL) {
500	if (strcmp(encoding, "utf-8") == 0)
501        return PyUnicode_AsUTF8String(unicode);
502	else if (strcmp(encoding, "latin-1") == 0)
503	    return PyUnicode_AsLatin1String(unicode);
504	else if (strcmp(encoding, "ascii") == 0)
505	    return PyUnicode_AsASCIIString(unicode);
506    }
507
508    /* Encode via the codec registry */
509    v = PyCodec_Encode(unicode, encoding, errors);
510    if (v == NULL)
511        goto onError;
512    /* XXX Should we really enforce this ? */
513    if (!PyString_Check(v)) {
514        PyErr_Format(PyExc_TypeError,
515                     "encoder did not return a string object (type=%.400s)",
516                     v->ob_type->tp_name);
517        Py_DECREF(v);
518        goto onError;
519    }
520    return v;
521
522 onError:
523    return NULL;
524}
525
526/* Return a Python string holding the default encoded value of the
527   Unicode object.
528
529   The resulting string is cached in the Unicode object for subsequent
530   usage by this function. The cached version is needed to implement
531   the character buffer interface and will live (at least) as long as
532   the Unicode object itself.
533
534   The refcount of the string is *not* incremented.
535
536   *** Exported for internal use by the interpreter only !!! ***
537
538*/
539
540PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
541					    const char *errors)
542{
543    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
544
545    if (v)
546        return v;
547    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
548    if (v && errors == NULL)
549        ((PyUnicodeObject *)unicode)->defenc = v;
550    return v;
551}
552
553Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
554{
555    if (!PyUnicode_Check(unicode)) {
556        PyErr_BadArgument();
557        goto onError;
558    }
559    return PyUnicode_AS_UNICODE(unicode);
560
561 onError:
562    return NULL;
563}
564
565int PyUnicode_GetSize(PyObject *unicode)
566{
567    if (!PyUnicode_Check(unicode)) {
568        PyErr_BadArgument();
569        goto onError;
570    }
571    return PyUnicode_GET_SIZE(unicode);
572
573 onError:
574    return -1;
575}
576
577const char *PyUnicode_GetDefaultEncoding(void)
578{
579    return unicode_default_encoding;
580}
581
582int PyUnicode_SetDefaultEncoding(const char *encoding)
583{
584    PyObject *v;
585
586    /* Make sure the encoding is valid. As side effect, this also
587       loads the encoding into the codec registry cache. */
588    v = _PyCodec_Lookup(encoding);
589    if (v == NULL)
590	goto onError;
591    Py_DECREF(v);
592    strncpy(unicode_default_encoding,
593	    encoding,
594	    sizeof(unicode_default_encoding));
595    return 0;
596
597 onError:
598    return -1;
599}
600
601/* --- UTF-8 Codec -------------------------------------------------------- */
602
static
char utf8_code_length[256] = {
    /* Sequence length implied by the first byte of a UTF-8 sequence,
       indexed by that byte.  A value of zero marks a byte that is not
       a legal prefix.  See RFC 2279 for details. */

    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: continuation bytes, illegal as prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xFD: four, five and six byte sequences;
       0xFE, 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
624
625static
626int utf8_decoding_error(const char **source,
627                        Py_UNICODE **dest,
628                        const char *errors,
629                        const char *details)
630{
631    if ((errors == NULL) ||
632        (strcmp(errors,"strict") == 0)) {
633        PyErr_Format(PyExc_UnicodeError,
634                     "UTF-8 decoding error: %.400s",
635                     details);
636        return -1;
637    }
638    else if (strcmp(errors,"ignore") == 0) {
639        (*source)++;
640        return 0;
641    }
642    else if (strcmp(errors,"replace") == 0) {
643        (*source)++;
644        **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
645        (*dest)++;
646        return 0;
647    }
648    else {
649        PyErr_Format(PyExc_ValueError,
650                     "UTF-8 decoding error; unknown error handling code: %.400s",
651                     errors);
652        return -1;
653    }
654}
655
656PyObject *PyUnicode_DecodeUTF8(const char *s,
657			       int size,
658			       const char *errors)
659{
660    int n;
661    const char *e;
662    PyUnicodeObject *unicode;
663    Py_UNICODE *p;
664    const char *errmsg = "";
665
666    /* Note: size will always be longer than the resulting Unicode
667       character count */
668    unicode = _PyUnicode_New(size);
669    if (!unicode)
670        return NULL;
671    if (size == 0)
672        return (PyObject *)unicode;
673
674    /* Unpack UTF-8 encoded data */
675    p = unicode->str;
676    e = s + size;
677
678    while (s < e) {
679        Py_UCS4 ch = (unsigned char)*s;
680
681        if (ch < 0x80) {
682            *p++ = (Py_UNICODE)ch;
683            s++;
684            continue;
685        }
686
687        n = utf8_code_length[ch];
688
689        if (s + n > e) {
690	    errmsg = "unexpected end of data";
691	    goto utf8Error;
692	}
693
694        switch (n) {
695
696        case 0:
697            errmsg = "unexpected code byte";
698	    goto utf8Error;
699            break;
700
701        case 1:
702            errmsg = "internal error";
703	    goto utf8Error;
704            break;
705
706        case 2:
707            if ((s[1] & 0xc0) != 0x80) {
708                errmsg = "invalid data";
709		goto utf8Error;
710	    }
711            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
712            if (ch < 0x80) {
713                errmsg = "illegal encoding";
714		goto utf8Error;
715	    }
716	    else
717		*p++ = (Py_UNICODE)ch;
718            break;
719
720        case 3:
721            if ((s[1] & 0xc0) != 0x80 ||
722                (s[2] & 0xc0) != 0x80) {
723                errmsg = "invalid data";
724		goto utf8Error;
725	    }
726            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
727            if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
728                errmsg = "illegal encoding";
729		goto utf8Error;
730	    }
731	    else
732				*p++ = (Py_UNICODE)ch;
733            break;
734
735        case 4:
736            if ((s[1] & 0xc0) != 0x80 ||
737                (s[2] & 0xc0) != 0x80 ||
738                (s[3] & 0xc0) != 0x80) {
739                errmsg = "invalid data";
740		goto utf8Error;
741	    }
742            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
743                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
744            /* validate and convert to UTF-16 */
745            if ((ch < 0x10000) ||   /* minimum value allowed for 4
746                                       byte encoding */
747                (ch > 0x10ffff)) {  /* maximum value allowed for
748                                       UTF-16 */
749                errmsg = "illegal encoding";
750		goto utf8Error;
751	    }
752            /*  compute and append the two surrogates: */
753
754            /*  translate from 10000..10FFFF to 0..FFFF */
755            ch -= 0x10000;
756
757            /*  high surrogate = top 10 bits added to D800 */
758            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
759
760            /*  low surrogate = bottom 10 bits added to DC00 */
761            *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
762            break;
763
764        default:
765            /* Other sizes are only needed for UCS-4 */
766            errmsg = "unsupported Unicode code range";
767	    goto utf8Error;
768	    break;
769        }
770        s += n;
771	continue;
772
773    utf8Error:
774      if (utf8_decoding_error(&s, &p, errors, errmsg))
775          goto onError;
776    }
777
778    /* Adjust length */
779    if (_PyUnicode_Resize(unicode, p - unicode->str))
780        goto onError;
781
782    return (PyObject *)unicode;
783
784onError:
785    Py_DECREF(unicode);
786    return NULL;
787}
788
789/* Not used anymore, now that the encoder supports UTF-16
790   surrogates. */
791#if 0
792static
793int utf8_encoding_error(const Py_UNICODE **source,
794			char **dest,
795			const char *errors,
796			const char *details)
797{
798    if ((errors == NULL) ||
799	(strcmp(errors,"strict") == 0)) {
800	PyErr_Format(PyExc_UnicodeError,
801		     "UTF-8 encoding error: %.400s",
802		     details);
803	return -1;
804    }
805    else if (strcmp(errors,"ignore") == 0) {
806	return 0;
807    }
808    else if (strcmp(errors,"replace") == 0) {
809	**dest = '?';
810	(*dest)++;
811	return 0;
812    }
813    else {
814	PyErr_Format(PyExc_ValueError,
815		     "UTF-8 encoding error; "
816		     "unknown error handling code: %.400s",
817		     errors);
818	return -1;
819    }
820}
821#endif
822
823PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
824			       int size,
825			       const char *errors)
826{
827    PyObject *v;
828    char *p;
829    char *q;
830    Py_UCS4 ch2;
831    unsigned int cbAllocated = 3 * size;
832    unsigned int cbWritten = 0;
833    int i = 0;
834
835    v = PyString_FromStringAndSize(NULL, cbAllocated);
836    if (v == NULL)
837        return NULL;
838    if (size == 0)
839        return v;
840
841    p = q = PyString_AS_STRING(v);
842    while (i < size) {
843        Py_UCS4 ch = s[i++];
844        if (ch < 0x80) {
845            *p++ = (char) ch;
846            cbWritten++;
847        }
848        else if (ch < 0x0800) {
849            *p++ = 0xc0 | (ch >> 6);
850            *p++ = 0x80 | (ch & 0x3f);
851            cbWritten += 2;
852        }
853        else {
854            /* Check for high surrogate */
855            if (0xD800 <= ch && ch <= 0xDBFF) {
856                if (i != size) {
857                    ch2 = s[i];
858                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
859
860                        if (cbWritten >= (cbAllocated - 4)) {
861			    /* Provide enough room for some more
862			       surrogates */
863			    cbAllocated += 4*10;
864                            if (_PyString_Resize(&v, cbAllocated))
865				goto onError;
866                        }
867
868                        /* combine the two values */
869                        ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
870
871                        *p++ = (char)((ch >> 18) | 0xf0);
872                        *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
873                        i++;
874                        cbWritten += 4;
875                    }
876                }
877            }
878            else {
879                *p++ = (char)(0xe0 | (ch >> 12));
880                cbWritten += 3;
881            }
882            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
883            *p++ = (char)(0x80 | (ch & 0x3f));
884        }
885    }
886    *p = '\0';
887    if (_PyString_Resize(&v, p - q))
888	goto onError;
889    return v;
890
891 onError:
892    Py_DECREF(v);
893    return NULL;
894}
895
896PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
897{
898    if (!PyUnicode_Check(unicode)) {
899        PyErr_BadArgument();
900        return NULL;
901    }
902    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
903				PyUnicode_GET_SIZE(unicode),
904				NULL);
905}
906
907/* --- UTF-16 Codec ------------------------------------------------------- */
908
909static
910int utf16_decoding_error(const Py_UNICODE **source,
911			 Py_UNICODE **dest,
912			 const char *errors,
913			 const char *details)
914{
915    if ((errors == NULL) ||
916        (strcmp(errors,"strict") == 0)) {
917        PyErr_Format(PyExc_UnicodeError,
918                     "UTF-16 decoding error: %.400s",
919                     details);
920        return -1;
921    }
922    else if (strcmp(errors,"ignore") == 0) {
923        return 0;
924    }
925    else if (strcmp(errors,"replace") == 0) {
926	if (dest) {
927	    **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
928	    (*dest)++;
929	}
930        return 0;
931    }
932    else {
933        PyErr_Format(PyExc_ValueError,
934                     "UTF-16 decoding error; "
935		     "unknown error handling code: %.400s",
936                     errors);
937        return -1;
938    }
939}
940
941PyObject *PyUnicode_DecodeUTF16(const char *s,
942				int size,
943				const char *errors,
944				int *byteorder)
945{
946    PyUnicodeObject *unicode;
947    Py_UNICODE *p;
948    const Py_UNICODE *q, *e;
949    int bo = 0;
950    const char *errmsg = "";
951
952    /* size should be an even number */
953    if (size % sizeof(Py_UNICODE) != 0) {
954	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
955	    return NULL;
956	/* The remaining input chars are ignored if we fall through
957           here... */
958    }
959
960    /* Note: size will always be longer than the resulting Unicode
961       character count */
962    unicode = _PyUnicode_New(size);
963    if (!unicode)
964        return NULL;
965    if (size == 0)
966        return (PyObject *)unicode;
967
968    /* Unpack UTF-16 encoded data */
969    p = unicode->str;
970    q = (Py_UNICODE *)s;
971    e = q + (size / sizeof(Py_UNICODE));
972
973    if (byteorder)
974	bo = *byteorder;
975
976    while (q < e) {
977	register Py_UNICODE ch = *q++;
978
979	/* Check for BOM marks (U+FEFF) in the input and adjust
980	   current byte order setting accordingly. Swap input
981	   bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
982	   !) */
983#ifdef BYTEORDER_IS_LITTLE_ENDIAN
984	if (ch == 0xFEFF) {
985	    bo = -1;
986	    continue;
987	} else if (ch == 0xFFFE) {
988	    bo = 1;
989	    continue;
990	}
991	if (bo == 1)
992	    ch = (ch >> 8) | (ch << 8);
993#else
994	if (ch == 0xFEFF) {
995	    bo = 1;
996	    continue;
997	} else if (ch == 0xFFFE) {
998	    bo = -1;
999	    continue;
1000	}
1001	if (bo == -1)
1002	    ch = (ch >> 8) | (ch << 8);
1003#endif
1004	if (ch < 0xD800 || ch > 0xDFFF) {
1005	    *p++ = ch;
1006	    continue;
1007	}
1008
1009	/* UTF-16 code pair: */
1010	if (q >= e) {
1011	    errmsg = "unexpected end of data";
1012	    goto utf16Error;
1013	}
1014	if (0xDC00 <= *q && *q <= 0xDFFF) {
1015	    q++;
1016	    if (0xD800 <= *q && *q <= 0xDBFF) {
1017		/* This is valid data (a UTF-16 surrogate pair), but
1018		   we are not able to store this information since our
1019		   Py_UNICODE type only has 16 bits... this might
1020		   change someday, even though it's unlikely. */
1021		errmsg = "code pairs are not supported";
1022		goto utf16Error;
1023	    }
1024	    else
1025		continue;
1026	}
1027	errmsg = "illegal encoding";
1028	/* Fall through to report the error */
1029
1030    utf16Error:
1031	if (utf16_decoding_error(&q, &p, errors, errmsg))
1032	    goto onError;
1033    }
1034
1035    if (byteorder)
1036        *byteorder = bo;
1037
1038    /* Adjust length */
1039    if (_PyUnicode_Resize(unicode, p - unicode->str))
1040        goto onError;
1041
1042    return (PyObject *)unicode;
1043
1044onError:
1045    Py_DECREF(unicode);
1046    return NULL;
1047}
1048
1049#undef UTF16_ERROR
1050
1051PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1052				int size,
1053				const char *errors,
1054				int byteorder)
1055{
1056    PyObject *v;
1057    Py_UNICODE *p;
1058    char *q;
1059
1060    /* We don't create UTF-16 pairs... */
1061    v = PyString_FromStringAndSize(NULL,
1062			sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1063    if (v == NULL)
1064        return NULL;
1065
1066    q = PyString_AS_STRING(v);
1067    p = (Py_UNICODE *)q;
1068    if (byteorder == 0)
1069	*p++ = 0xFEFF;
1070    if (size == 0)
1071        return v;
1072    if (byteorder == 0 ||
1073#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1074	byteorder == -1
1075#else
1076	byteorder == 1
1077#endif
1078	)
1079	memcpy(p, s, size * sizeof(Py_UNICODE));
1080    else
1081	while (size-- > 0) {
1082	    Py_UNICODE ch = *s++;
1083	    *p++ = (ch >> 8) | (ch << 8);
1084	}
1085    return v;
1086}
1087
1088PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1089{
1090    if (!PyUnicode_Check(unicode)) {
1091        PyErr_BadArgument();
1092        return NULL;
1093    }
1094    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1095				 PyUnicode_GET_SIZE(unicode),
1096				 NULL,
1097				 0);
1098}
1099
1100/* --- Unicode Escape Codec ----------------------------------------------- */
1101
1102static
1103int unicodeescape_decoding_error(const char **source,
1104                                 Py_UNICODE *x,
1105                                 const char *errors,
1106                                 const char *details)
1107{
1108    if ((errors == NULL) ||
1109        (strcmp(errors,"strict") == 0)) {
1110        PyErr_Format(PyExc_UnicodeError,
1111                     "Unicode-Escape decoding error: %.400s",
1112                     details);
1113        return -1;
1114    }
1115    else if (strcmp(errors,"ignore") == 0) {
1116        return 0;
1117    }
1118    else if (strcmp(errors,"replace") == 0) {
1119        *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1120        return 0;
1121    }
1122    else {
1123        PyErr_Format(PyExc_ValueError,
1124                     "Unicode-Escape decoding error; "
1125                     "unknown error handling code: %.400s",
1126                     errors);
1127        return -1;
1128    }
1129}
1130
/* Cached pointer to the Unicode Character Name hash API.  It stays NULL
   until PyUnicode_DecodeUnicodeEscape() first meets a \N escape; at that
   point it is fetched from the "ucnhashAPI" attribute of the "ucnhash"
   module and kept for later calls. */
static _Py_UCNHashAPI *pucnHash = NULL;
1132
/* Case-insensitive comparison of at most `count` characters of s1 and s2.

   Returns 0 when the compared prefixes are equal ignoring case, otherwise
   the difference of the first pair of lower-cased characters that differ
   (negative if s1 sorts first, positive if s2 does).

   Characters are converted to unsigned char before being handed to
   tolower(), because passing a negative char value is undefined.  The
   comparison stops at a shared NUL terminator so that it never reads
   beyond the end of either string. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    while (count-- > 0) {
        const unsigned char b1 = (unsigned char)*s1++;
        const unsigned char b2 = (unsigned char)*s2++;
        const int c1 = tolower(b1);
        const int c2 = tolower(b2);

        if (c1 != c2)
            return c1 - c2;
        /* Equal and NUL: both strings ended here. */
        if (b1 == '\0')
            return 0;
    }
    return 0;
}
1152
1153PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1154					int size,
1155					const char *errors)
1156{
1157    PyUnicodeObject *v;
1158    Py_UNICODE *p = NULL, *buf = NULL;
1159    const char *end;
1160    Py_UCS4 chr;
1161
1162    /* Escaped strings will always be longer than the resulting
1163       Unicode string, so we start with size here and then reduce the
1164       length after conversion to the true value. */
1165    v = _PyUnicode_New(size);
1166    if (v == NULL)
1167        goto onError;
1168    if (size == 0)
1169        return (PyObject *)v;
1170    p = buf = PyUnicode_AS_UNICODE(v);
1171    end = s + size;
1172    while (s < end) {
1173        unsigned char c;
1174        Py_UNICODE x;
1175        int i;
1176
1177        /* Non-escape characters are interpreted as Unicode ordinals */
1178        if (*s != '\\') {
1179            *p++ = (unsigned char)*s++;
1180            continue;
1181        }
1182
1183        /* \ - Escapes */
1184        s++;
1185        switch (*s++) {
1186
1187        /* \x escapes */
1188        case '\n': break;
1189        case '\\': *p++ = '\\'; break;
1190        case '\'': *p++ = '\''; break;
1191        case '\"': *p++ = '\"'; break;
1192        case 'b': *p++ = '\b'; break;
1193        case 'f': *p++ = '\014'; break; /* FF */
1194        case 't': *p++ = '\t'; break;
1195        case 'n': *p++ = '\n'; break;
1196        case 'r': *p++ = '\r'; break;
1197        case 'v': *p++ = '\013'; break; /* VT */
1198        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1199
1200        /* \OOO (octal) escapes */
1201        case '0': case '1': case '2': case '3':
1202        case '4': case '5': case '6': case '7':
1203            x = s[-1] - '0';
1204            if ('0' <= *s && *s <= '7') {
1205                x = (x<<3) + *s++ - '0';
1206                if ('0' <= *s && *s <= '7')
1207                    x = (x<<3) + *s++ - '0';
1208            }
1209            *p++ = x;
1210            break;
1211
1212        /* \xXX with two hex digits */
1213        case 'x':
1214            for (x = 0, i = 0; i < 2; i++) {
1215                c = (unsigned char)s[i];
1216                if (!isxdigit(c)) {
1217                    if (unicodeescape_decoding_error(&s, &x, errors,
1218                                                     "truncated \\xXX"))
1219                        goto onError;
1220                    i++;
1221                    break;
1222                }
1223                x = (x<<4) & ~0xF;
1224                if (c >= '0' && c <= '9')
1225                    x += c - '0';
1226                else if (c >= 'a' && c <= 'f')
1227                    x += 10 + c - 'a';
1228                else
1229                    x += 10 + c - 'A';
1230            }
1231            s += i;
1232            *p++ = x;
1233            break;
1234
1235        /* \uXXXX with 4 hex digits */
1236        case 'u':
1237            for (x = 0, i = 0; i < 4; i++) {
1238                c = (unsigned char)s[i];
1239                if (!isxdigit(c)) {
1240                    if (unicodeescape_decoding_error(&s, &x, errors,
1241                                                     "truncated \\uXXXX"))
1242                        goto onError;
1243                    i++;
1244                    break;
1245                }
1246                x = (x<<4) & ~0xF;
1247                if (c >= '0' && c <= '9')
1248                    x += c - '0';
1249                else if (c >= 'a' && c <= 'f')
1250                    x += 10 + c - 'a';
1251                else
1252                    x += 10 + c - 'A';
1253            }
1254            s += i;
1255            *p++ = x;
1256            break;
1257
1258        /* \UXXXXXXXX with 8 hex digits */
1259        case 'U':
1260            for (chr = 0, i = 0; i < 8; i++) {
1261                c = (unsigned char)s[i];
1262                if (!isxdigit(c)) {
1263                    if (unicodeescape_decoding_error(&s, &x, errors,
1264                                                     "truncated \\uXXXX"))
1265                        goto onError;
1266                    i++;
1267                    break;
1268                }
1269                chr = (chr<<4) & ~0xF;
1270                if (c >= '0' && c <= '9')
1271                    chr += c - '0';
1272                else if (c >= 'a' && c <= 'f')
1273                    chr += 10 + c - 'a';
1274                else
1275                    chr += 10 + c - 'A';
1276            }
1277            s += i;
1278            goto store;
1279
1280        case 'N':
1281            /* Ok, we need to deal with Unicode Character Names now,
1282             * make sure we've imported the hash table data...
1283             */
1284            if (pucnHash == NULL) {
1285                PyObject *mod = 0, *v = 0;
1286                mod = PyImport_ImportModule("ucnhash");
1287                if (mod == NULL)
1288                    goto onError;
1289                v = PyObject_GetAttrString(mod,"ucnhashAPI");
1290                Py_DECREF(mod);
1291                if (v == NULL)
1292                    goto onError;
1293                pucnHash = PyCObject_AsVoidPtr(v);
1294                Py_DECREF(v);
1295                if (pucnHash == NULL)
1296                    goto onError;
1297            }
1298
1299            if (*s == '{') {
1300                const char *start = s + 1;
1301                const char *endBrace = start;
1302                unsigned long j;
1303
1304                /* look for either the closing brace, or we
1305                 * exceed the maximum length of the unicode character names
1306                 */
1307                while (*endBrace != '}' &&
1308                       (unsigned int)(endBrace - start) <=
1309                           pucnHash->cchMax &&
1310                       endBrace < end)
1311                {
1312                    endBrace++;
1313                }
1314                if (endBrace != end && *endBrace == '}') {
1315                    j = pucnHash->hash(start, endBrace - start);
1316                    if (j > pucnHash->cKeys ||
1317                        mystrnicmp(
1318                            start,
1319                            ((_Py_UnicodeCharacterName *)
1320                             (pucnHash->getValue(j)))->pszUCN,
1321                            (int)(endBrace - start)) != 0)
1322                    {
1323                        if (unicodeescape_decoding_error(
1324                                &s, &x, errors,
1325                                "Invalid Unicode Character Name"))
1326                        {
1327                            goto onError;
1328                        }
1329                        goto ucnFallthrough;
1330                    }
1331                    chr = ((_Py_UnicodeCharacterName *)
1332                           (pucnHash->getValue(j)))->value;
1333                    s = endBrace + 1;
1334                    goto store;
1335                } else {
1336                    if (unicodeescape_decoding_error(
1337                            &s, &x, errors,
1338                            "Unicode name missing closing brace"))
1339                        goto onError;
1340                    goto ucnFallthrough;
1341                }
1342                break;
1343            }
1344            if (unicodeescape_decoding_error(
1345                    &s, &x, errors,
1346                    "Missing opening brace for Unicode Character Name escape"))
1347                goto onError;
1348ucnFallthrough:
1349            /* fall through on purpose */
1350		default:
1351            *p++ = '\\';
1352            *p++ = (unsigned char)s[-1];
1353            break;
1354store:
1355            /* when we get here, chr is a 32-bit unicode character */
1356            if (chr <= 0xffff)
1357                /* UCS-2 character */
1358                *p++ = (Py_UNICODE) chr;
1359            else if (chr <= 0x10ffff) {
1360                /* UCS-4 character.  store as two surrogate characters */
1361                chr -= 0x10000L;
1362                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1363                *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1364            } else {
1365                if (unicodeescape_decoding_error(
1366                    &s, &x, errors,
1367                    "Illegal Unicode character")
1368                    )
1369                    goto onError;
1370            }
1371        }
1372    }
1373    if (_PyUnicode_Resize(v, (int)(p - buf)))
1374		goto onError;
1375    return (PyObject *)v;
1376
1377 onError:
1378    Py_XDECREF(v);
1379    return NULL;
1380}
1381
1382/* Return a Unicode-Escape string version of the Unicode object.
1383
1384   If quotes is true, the string is enclosed in u"" or u'' quotes as
1385   appropriate.
1386
1387*/
1388
1389static const Py_UNICODE *findchar(const Py_UNICODE *s,
1390				  int size,
1391				  Py_UNICODE ch);
1392
1393static
1394PyObject *unicodeescape_string(const Py_UNICODE *s,
1395                               int size,
1396                               int quotes)
1397{
1398    PyObject *repr;
1399    char *p;
1400    char *q;
1401
1402    static const char *hexdigit = "0123456789ABCDEF";
1403
1404    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1405    if (repr == NULL)
1406        return NULL;
1407
1408    p = q = PyString_AS_STRING(repr);
1409
1410    if (quotes) {
1411        *p++ = 'u';
1412        *p++ = (findchar(s, size, '\'') &&
1413                !findchar(s, size, '"')) ? '"' : '\'';
1414    }
1415    while (size-- > 0) {
1416        Py_UNICODE ch = *s++;
1417        /* Escape quotes */
1418        if (quotes && (ch == q[1] || ch == '\\')) {
1419            *p++ = '\\';
1420            *p++ = (char) ch;
1421        }
1422        /* Map 16-bit characters to '\uxxxx' */
1423        else if (ch >= 256) {
1424            *p++ = '\\';
1425            *p++ = 'u';
1426            *p++ = hexdigit[(ch >> 12) & 0xf];
1427            *p++ = hexdigit[(ch >> 8) & 0xf];
1428            *p++ = hexdigit[(ch >> 4) & 0xf];
1429            *p++ = hexdigit[ch & 15];
1430        }
1431        /* Map non-printable US ASCII to '\ooo' */
1432        else if (ch < ' ' || ch >= 128) {
1433            *p++ = '\\';
1434            *p++ = hexdigit[(ch >> 6) & 7];
1435            *p++ = hexdigit[(ch >> 3) & 7];
1436            *p++ = hexdigit[ch & 7];
1437        }
1438        /* Copy everything else as-is */
1439        else
1440            *p++ = (char) ch;
1441    }
1442    if (quotes)
1443        *p++ = q[1];
1444
1445    *p = '\0';
1446    if (_PyString_Resize(&repr, p - q))
1447	goto onError;
1448
1449    return repr;
1450
1451 onError:
1452    Py_DECREF(repr);
1453    return NULL;
1454}
1455
1456PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1457					int size)
1458{
1459    return unicodeescape_string(s, size, 0);
1460}
1461
1462PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1463{
1464    if (!PyUnicode_Check(unicode)) {
1465        PyErr_BadArgument();
1466        return NULL;
1467    }
1468    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1469					 PyUnicode_GET_SIZE(unicode));
1470}
1471
1472/* --- Raw Unicode Escape Codec ------------------------------------------- */
1473
1474PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1475					   int size,
1476					   const char *errors)
1477{
1478    PyUnicodeObject *v;
1479    Py_UNICODE *p, *buf;
1480    const char *end;
1481    const char *bs;
1482
1483    /* Escaped strings will always be longer than the resulting
1484       Unicode string, so we start with size here and then reduce the
1485       length after conversion to the true value. */
1486    v = _PyUnicode_New(size);
1487    if (v == NULL)
1488	goto onError;
1489    if (size == 0)
1490	return (PyObject *)v;
1491    p = buf = PyUnicode_AS_UNICODE(v);
1492    end = s + size;
1493    while (s < end) {
1494	unsigned char c;
1495	Py_UNICODE x;
1496	int i;
1497
1498	/* Non-escape characters are interpreted as Unicode ordinals */
1499	if (*s != '\\') {
1500	    *p++ = (unsigned char)*s++;
1501	    continue;
1502	}
1503
1504	/* \u-escapes are only interpreted iff the number of leading
1505	   backslashes if odd */
1506	bs = s;
1507	for (;s < end;) {
1508	    if (*s != '\\')
1509		break;
1510	    *p++ = (unsigned char)*s++;
1511	}
1512	if (((s - bs) & 1) == 0 ||
1513	    s >= end ||
1514	    *s != 'u') {
1515	    continue;
1516	}
1517	p--;
1518	s++;
1519
1520	/* \uXXXX with 4 hex digits */
1521	for (x = 0, i = 0; i < 4; i++) {
1522	    c = (unsigned char)s[i];
1523	    if (!isxdigit(c)) {
1524		if (unicodeescape_decoding_error(&s, &x, errors,
1525						 "truncated \\uXXXX"))
1526		    goto onError;
1527		i++;
1528		break;
1529	    }
1530	    x = (x<<4) & ~0xF;
1531	    if (c >= '0' && c <= '9')
1532		x += c - '0';
1533	    else if (c >= 'a' && c <= 'f')
1534		x += 10 + c - 'a';
1535	    else
1536		x += 10 + c - 'A';
1537	}
1538	s += i;
1539	*p++ = x;
1540    }
1541    if (_PyUnicode_Resize(v, (int)(p - buf)))
1542	goto onError;
1543    return (PyObject *)v;
1544
1545 onError:
1546    Py_XDECREF(v);
1547    return NULL;
1548}
1549
1550PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1551					   int size)
1552{
1553    PyObject *repr;
1554    char *p;
1555    char *q;
1556
1557    static const char *hexdigit = "0123456789ABCDEF";
1558
1559    repr = PyString_FromStringAndSize(NULL, 6 * size);
1560    if (repr == NULL)
1561        return NULL;
1562    if (size == 0)
1563	return repr;
1564
1565    p = q = PyString_AS_STRING(repr);
1566    while (size-- > 0) {
1567        Py_UNICODE ch = *s++;
1568	/* Map 16-bit characters to '\uxxxx' */
1569	if (ch >= 256) {
1570            *p++ = '\\';
1571            *p++ = 'u';
1572            *p++ = hexdigit[(ch >> 12) & 0xf];
1573            *p++ = hexdigit[(ch >> 8) & 0xf];
1574            *p++ = hexdigit[(ch >> 4) & 0xf];
1575            *p++ = hexdigit[ch & 15];
1576        }
1577	/* Copy everything else as-is */
1578	else
1579            *p++ = (char) ch;
1580    }
1581    *p = '\0';
1582    if (_PyString_Resize(&repr, p - q))
1583	goto onError;
1584
1585    return repr;
1586
1587 onError:
1588    Py_DECREF(repr);
1589    return NULL;
1590}
1591
1592PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1593{
1594    if (!PyUnicode_Check(unicode)) {
1595	PyErr_BadArgument();
1596	return NULL;
1597    }
1598    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1599					    PyUnicode_GET_SIZE(unicode));
1600}
1601
1602/* --- Latin-1 Codec ------------------------------------------------------ */
1603
1604PyObject *PyUnicode_DecodeLatin1(const char *s,
1605				 int size,
1606				 const char *errors)
1607{
1608    PyUnicodeObject *v;
1609    Py_UNICODE *p;
1610
1611    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1612    v = _PyUnicode_New(size);
1613    if (v == NULL)
1614	goto onError;
1615    if (size == 0)
1616	return (PyObject *)v;
1617    p = PyUnicode_AS_UNICODE(v);
1618    while (size-- > 0)
1619	*p++ = (unsigned char)*s++;
1620    return (PyObject *)v;
1621
1622 onError:
1623    Py_XDECREF(v);
1624    return NULL;
1625}
1626
1627static
1628int latin1_encoding_error(const Py_UNICODE **source,
1629			  char **dest,
1630			  const char *errors,
1631			  const char *details)
1632{
1633    if ((errors == NULL) ||
1634	(strcmp(errors,"strict") == 0)) {
1635	PyErr_Format(PyExc_UnicodeError,
1636		     "Latin-1 encoding error: %.400s",
1637		     details);
1638	return -1;
1639    }
1640    else if (strcmp(errors,"ignore") == 0) {
1641	return 0;
1642    }
1643    else if (strcmp(errors,"replace") == 0) {
1644	**dest = '?';
1645	(*dest)++;
1646	return 0;
1647    }
1648    else {
1649	PyErr_Format(PyExc_ValueError,
1650		     "Latin-1 encoding error; "
1651		     "unknown error handling code: %.400s",
1652		     errors);
1653	return -1;
1654    }
1655}
1656
1657PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1658				 int size,
1659				 const char *errors)
1660{
1661    PyObject *repr;
1662    char *s, *start;
1663
1664    repr = PyString_FromStringAndSize(NULL, size);
1665    if (repr == NULL)
1666        return NULL;
1667    if (size == 0)
1668	return repr;
1669
1670    s = PyString_AS_STRING(repr);
1671    start = s;
1672    while (size-- > 0) {
1673        Py_UNICODE ch = *p++;
1674	if (ch >= 256) {
1675	    if (latin1_encoding_error(&p, &s, errors,
1676				      "ordinal not in range(256)"))
1677		goto onError;
1678	}
1679	else
1680            *s++ = (char)ch;
1681    }
1682    /* Resize if error handling skipped some characters */
1683    if (s - start < PyString_GET_SIZE(repr))
1684	if (_PyString_Resize(&repr, s - start))
1685	    goto onError;
1686    return repr;
1687
1688 onError:
1689    Py_DECREF(repr);
1690    return NULL;
1691}
1692
1693PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1694{
1695    if (!PyUnicode_Check(unicode)) {
1696	PyErr_BadArgument();
1697	return NULL;
1698    }
1699    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1700				  PyUnicode_GET_SIZE(unicode),
1701				  NULL);
1702}
1703
1704/* --- 7-bit ASCII Codec -------------------------------------------------- */
1705
1706static
1707int ascii_decoding_error(const char **source,
1708			 Py_UNICODE **dest,
1709			 const char *errors,
1710			 const char *details)
1711{
1712    if ((errors == NULL) ||
1713	(strcmp(errors,"strict") == 0)) {
1714	PyErr_Format(PyExc_UnicodeError,
1715		     "ASCII decoding error: %.400s",
1716		     details);
1717	return -1;
1718    }
1719    else if (strcmp(errors,"ignore") == 0) {
1720	return 0;
1721    }
1722    else if (strcmp(errors,"replace") == 0) {
1723	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1724	(*dest)++;
1725	return 0;
1726    }
1727    else {
1728	PyErr_Format(PyExc_ValueError,
1729		     "ASCII decoding error; "
1730		     "unknown error handling code: %.400s",
1731		     errors);
1732	return -1;
1733    }
1734}
1735
1736PyObject *PyUnicode_DecodeASCII(const char *s,
1737				int size,
1738				const char *errors)
1739{
1740    PyUnicodeObject *v;
1741    Py_UNICODE *p;
1742
1743    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1744    v = _PyUnicode_New(size);
1745    if (v == NULL)
1746	goto onError;
1747    if (size == 0)
1748	return (PyObject *)v;
1749    p = PyUnicode_AS_UNICODE(v);
1750    while (size-- > 0) {
1751	register unsigned char c;
1752
1753	c = (unsigned char)*s++;
1754	if (c < 128)
1755	    *p++ = c;
1756	else if (ascii_decoding_error(&s, &p, errors,
1757				      "ordinal not in range(128)"))
1758		goto onError;
1759    }
1760    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1761	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1762	    goto onError;
1763    return (PyObject *)v;
1764
1765 onError:
1766    Py_XDECREF(v);
1767    return NULL;
1768}
1769
1770static
1771int ascii_encoding_error(const Py_UNICODE **source,
1772			 char **dest,
1773			 const char *errors,
1774			 const char *details)
1775{
1776    if ((errors == NULL) ||
1777	(strcmp(errors,"strict") == 0)) {
1778	PyErr_Format(PyExc_UnicodeError,
1779		     "ASCII encoding error: %.400s",
1780		     details);
1781	return -1;
1782    }
1783    else if (strcmp(errors,"ignore") == 0) {
1784	return 0;
1785    }
1786    else if (strcmp(errors,"replace") == 0) {
1787	**dest = '?';
1788	(*dest)++;
1789	return 0;
1790    }
1791    else {
1792	PyErr_Format(PyExc_ValueError,
1793		     "ASCII encoding error; "
1794		     "unknown error handling code: %.400s",
1795		     errors);
1796	return -1;
1797    }
1798}
1799
1800PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1801				int size,
1802				const char *errors)
1803{
1804    PyObject *repr;
1805    char *s, *start;
1806
1807    repr = PyString_FromStringAndSize(NULL, size);
1808    if (repr == NULL)
1809        return NULL;
1810    if (size == 0)
1811	return repr;
1812
1813    s = PyString_AS_STRING(repr);
1814    start = s;
1815    while (size-- > 0) {
1816        Py_UNICODE ch = *p++;
1817	if (ch >= 128) {
1818	    if (ascii_encoding_error(&p, &s, errors,
1819				      "ordinal not in range(128)"))
1820		goto onError;
1821	}
1822	else
1823            *s++ = (char)ch;
1824    }
1825    /* Resize if error handling skipped some characters */
1826    if (s - start < PyString_GET_SIZE(repr))
1827	if (_PyString_Resize(&repr, s - start))
1828	    goto onError;
1829    return repr;
1830
1831 onError:
1832    Py_DECREF(repr);
1833    return NULL;
1834}
1835
1836PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1837{
1838    if (!PyUnicode_Check(unicode)) {
1839	PyErr_BadArgument();
1840	return NULL;
1841    }
1842    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1843				 PyUnicode_GET_SIZE(unicode),
1844				 NULL);
1845}
1846
1847#ifdef MS_WIN32
1848
1849/* --- MBCS codecs for Windows -------------------------------------------- */
1850
1851PyObject *PyUnicode_DecodeMBCS(const char *s,
1852				int size,
1853				const char *errors)
1854{
1855    PyUnicodeObject *v;
1856    Py_UNICODE *p;
1857
1858    /* First get the size of the result */
1859    DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1860    if (size > 0 && usize==0)
1861        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1862
1863    v = _PyUnicode_New(usize);
1864    if (v == NULL)
1865        return NULL;
1866    if (usize == 0)
1867	return (PyObject *)v;
1868    p = PyUnicode_AS_UNICODE(v);
1869    if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1870        Py_DECREF(v);
1871        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1872    }
1873
1874    return (PyObject *)v;
1875}
1876
1877PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1878				int size,
1879				const char *errors)
1880{
1881    PyObject *repr;
1882    char *s;
1883    DWORD mbcssize;
1884
1885    /* If there are no characters, bail now! */
1886    if (size==0)
1887	    return PyString_FromString("");
1888
1889    /* First get the size of the result */
1890    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1891    if (mbcssize==0)
1892        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1893
1894    repr = PyString_FromStringAndSize(NULL, mbcssize);
1895    if (repr == NULL)
1896        return NULL;
1897    if (mbcssize == 0)
1898        return repr;
1899
1900    /* Do the conversion */
1901    s = PyString_AS_STRING(repr);
1902    if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1903        Py_DECREF(repr);
1904        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1905    }
1906    return repr;
1907}
1908
1909#endif /* MS_WIN32 */
1910
1911/* --- Character Mapping Codec -------------------------------------------- */
1912
1913static
1914int charmap_decoding_error(const char **source,
1915			 Py_UNICODE **dest,
1916			 const char *errors,
1917			 const char *details)
1918{
1919    if ((errors == NULL) ||
1920	(strcmp(errors,"strict") == 0)) {
1921	PyErr_Format(PyExc_UnicodeError,
1922		     "charmap decoding error: %.400s",
1923		     details);
1924	return -1;
1925    }
1926    else if (strcmp(errors,"ignore") == 0) {
1927	return 0;
1928    }
1929    else if (strcmp(errors,"replace") == 0) {
1930	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1931	(*dest)++;
1932	return 0;
1933    }
1934    else {
1935	PyErr_Format(PyExc_ValueError,
1936		     "charmap decoding error; "
1937		     "unknown error handling code: %.400s",
1938		     errors);
1939	return -1;
1940    }
1941}
1942
1943PyObject *PyUnicode_DecodeCharmap(const char *s,
1944				  int size,
1945				  PyObject *mapping,
1946				  const char *errors)
1947{
1948    PyUnicodeObject *v;
1949    Py_UNICODE *p;
1950    int extrachars = 0;
1951
1952    /* Default to Latin-1 */
1953    if (mapping == NULL)
1954	return PyUnicode_DecodeLatin1(s, size, errors);
1955
1956    v = _PyUnicode_New(size);
1957    if (v == NULL)
1958	goto onError;
1959    if (size == 0)
1960	return (PyObject *)v;
1961    p = PyUnicode_AS_UNICODE(v);
1962    while (size-- > 0) {
1963	unsigned char ch = *s++;
1964	PyObject *w, *x;
1965
1966	/* Get mapping (char ordinal -> integer, Unicode char or None) */
1967	w = PyInt_FromLong((long)ch);
1968	if (w == NULL)
1969	    goto onError;
1970	x = PyObject_GetItem(mapping, w);
1971	Py_DECREF(w);
1972	if (x == NULL) {
1973	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1974		/* No mapping found means: mapping is undefined. */
1975		PyErr_Clear();
1976		x = Py_None;
1977		Py_INCREF(x);
1978	    } else
1979	    goto onError;
1980	}
1981
1982	/* Apply mapping */
1983	if (PyInt_Check(x)) {
1984	    long value = PyInt_AS_LONG(x);
1985	    if (value < 0 || value > 65535) {
1986		PyErr_SetString(PyExc_TypeError,
1987				"character mapping must be in range(65536)");
1988		Py_DECREF(x);
1989		goto onError;
1990	    }
1991	    *p++ = (Py_UNICODE)value;
1992	}
1993	else if (x == Py_None) {
1994	    /* undefined mapping */
1995	    if (charmap_decoding_error(&s, &p, errors,
1996				       "character maps to <undefined>")) {
1997		Py_DECREF(x);
1998		goto onError;
1999	    }
2000	}
2001	else if (PyUnicode_Check(x)) {
2002	    int targetsize = PyUnicode_GET_SIZE(x);
2003
2004	    if (targetsize == 1)
2005		/* 1-1 mapping */
2006		*p++ = *PyUnicode_AS_UNICODE(x);
2007
2008	    else if (targetsize > 1) {
2009		/* 1-n mapping */
2010		if (targetsize > extrachars) {
2011		    /* resize first */
2012		    int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2013		    int needed = (targetsize - extrachars) + \
2014			         (targetsize << 2);
2015		    extrachars += needed;
2016		    if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
2017		Py_DECREF(x);
2018		goto onError;
2019	    }
2020		    p = PyUnicode_AS_UNICODE(v) + oldpos;
2021		}
2022		Py_UNICODE_COPY(p,
2023				PyUnicode_AS_UNICODE(x),
2024				targetsize);
2025		p += targetsize;
2026		extrachars -= targetsize;
2027	    }
2028	    /* 1-0 mapping: skip the character */
2029	}
2030	else {
2031	    /* wrong return value */
2032	    PyErr_SetString(PyExc_TypeError,
2033		  "character mapping must return integer, None or unicode");
2034	    Py_DECREF(x);
2035	    goto onError;
2036	}
2037	Py_DECREF(x);
2038    }
2039    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2040	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2041	    goto onError;
2042    return (PyObject *)v;
2043
2044 onError:
2045    Py_XDECREF(v);
2046    return NULL;
2047}
2048
2049static
2050int charmap_encoding_error(const Py_UNICODE **source,
2051			   char **dest,
2052			   const char *errors,
2053			   const char *details)
2054{
2055    if ((errors == NULL) ||
2056	(strcmp(errors,"strict") == 0)) {
2057	PyErr_Format(PyExc_UnicodeError,
2058		     "charmap encoding error: %.400s",
2059		     details);
2060	return -1;
2061    }
2062    else if (strcmp(errors,"ignore") == 0) {
2063	return 0;
2064    }
2065    else if (strcmp(errors,"replace") == 0) {
2066	**dest = '?';
2067	(*dest)++;
2068	return 0;
2069    }
2070    else {
2071	PyErr_Format(PyExc_ValueError,
2072		     "charmap encoding error; "
2073		     "unknown error handling code: %.400s",
2074		     errors);
2075	return -1;
2076    }
2077}
2078
2079PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2080				  int size,
2081				  PyObject *mapping,
2082				  const char *errors)
2083{
2084    PyObject *v;
2085    char *s;
2086    int extrachars = 0;
2087
2088    /* Default to Latin-1 */
2089    if (mapping == NULL)
2090	return PyUnicode_EncodeLatin1(p, size, errors);
2091
2092    v = PyString_FromStringAndSize(NULL, size);
2093    if (v == NULL)
2094        return NULL;
2095    if (size == 0)
2096	return v;
2097    s = PyString_AS_STRING(v);
2098    while (size-- > 0) {
2099	Py_UNICODE ch = *p++;
2100	PyObject *w, *x;
2101
2102	/* Get mapping (Unicode ordinal -> string char, integer or None) */
2103	w = PyInt_FromLong((long)ch);
2104	if (w == NULL)
2105	    goto onError;
2106	x = PyObject_GetItem(mapping, w);
2107	Py_DECREF(w);
2108	if (x == NULL) {
2109	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2110		/* No mapping found means: mapping is undefined. */
2111		PyErr_Clear();
2112		x = Py_None;
2113		Py_INCREF(x);
2114	    } else
2115	    goto onError;
2116	}
2117
2118	/* Apply mapping */
2119	if (PyInt_Check(x)) {
2120	    long value = PyInt_AS_LONG(x);
2121	    if (value < 0 || value > 255) {
2122		PyErr_SetString(PyExc_TypeError,
2123				"character mapping must be in range(256)");
2124		Py_DECREF(x);
2125		goto onError;
2126	    }
2127	    *s++ = (char)value;
2128	}
2129	else if (x == Py_None) {
2130	    /* undefined mapping */
2131	    if (charmap_encoding_error(&p, &s, errors,
2132				       "character maps to <undefined>")) {
2133		Py_DECREF(x);
2134		goto onError;
2135	    }
2136	}
2137	else if (PyString_Check(x)) {
2138	    int targetsize = PyString_GET_SIZE(x);
2139
2140	    if (targetsize == 1)
2141		/* 1-1 mapping */
2142		*s++ = *PyString_AS_STRING(x);
2143
2144	    else if (targetsize > 1) {
2145		/* 1-n mapping */
2146		if (targetsize > extrachars) {
2147		    /* resize first */
2148		    int oldpos = (int)(s - PyString_AS_STRING(v));
2149		    int needed = (targetsize - extrachars) + \
2150			         (targetsize << 2);
2151		    extrachars += needed;
2152		    if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2153		Py_DECREF(x);
2154		goto onError;
2155	    }
2156		    s = PyString_AS_STRING(v) + oldpos;
2157		}
2158		memcpy(s,
2159		       PyString_AS_STRING(x),
2160		       targetsize);
2161		s += targetsize;
2162		extrachars -= targetsize;
2163	    }
2164	    /* 1-0 mapping: skip the character */
2165	}
2166	else {
2167	    /* wrong return value */
2168	    PyErr_SetString(PyExc_TypeError,
2169		  "character mapping must return integer, None or unicode");
2170	    Py_DECREF(x);
2171	    goto onError;
2172	}
2173	Py_DECREF(x);
2174    }
2175    if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2176	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2177	    goto onError;
2178    return v;
2179
2180 onError:
2181    Py_DECREF(v);
2182    return NULL;
2183}
2184
2185PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2186				    PyObject *mapping)
2187{
2188    if (!PyUnicode_Check(unicode) || mapping == NULL) {
2189	PyErr_BadArgument();
2190	return NULL;
2191    }
2192    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2193				   PyUnicode_GET_SIZE(unicode),
2194				   mapping,
2195				   NULL);
2196}
2197
2198static
2199int translate_error(const Py_UNICODE **source,
2200		    Py_UNICODE **dest,
2201		    const char *errors,
2202		    const char *details)
2203{
2204    if ((errors == NULL) ||
2205	(strcmp(errors,"strict") == 0)) {
2206	PyErr_Format(PyExc_UnicodeError,
2207		     "translate error: %.400s",
2208		     details);
2209	return -1;
2210    }
2211    else if (strcmp(errors,"ignore") == 0) {
2212	return 0;
2213    }
2214    else if (strcmp(errors,"replace") == 0) {
2215	**dest = '?';
2216	(*dest)++;
2217	return 0;
2218    }
2219    else {
2220	PyErr_Format(PyExc_ValueError,
2221		     "translate error; "
2222		     "unknown error handling code: %.400s",
2223		     errors);
2224	return -1;
2225    }
2226}
2227
2228PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2229				     int size,
2230				     PyObject *mapping,
2231				     const char *errors)
2232{
2233    PyUnicodeObject *v;
2234    Py_UNICODE *p;
2235
2236    if (mapping == NULL) {
2237	PyErr_BadArgument();
2238	return NULL;
2239    }
2240
2241    /* Output will never be longer than input */
2242    v = _PyUnicode_New(size);
2243    if (v == NULL)
2244	goto onError;
2245    if (size == 0)
2246	goto done;
2247    p = PyUnicode_AS_UNICODE(v);
2248    while (size-- > 0) {
2249	Py_UNICODE ch = *s++;
2250	PyObject *w, *x;
2251
2252	/* Get mapping */
2253	w = PyInt_FromLong(ch);
2254	if (w == NULL)
2255	    goto onError;
2256	x = PyObject_GetItem(mapping, w);
2257	Py_DECREF(w);
2258	if (x == NULL) {
2259	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2260		/* No mapping found: default to 1-1 mapping */
2261		PyErr_Clear();
2262		*p++ = ch;
2263		continue;
2264	    }
2265	    goto onError;
2266	}
2267
2268	/* Apply mapping */
2269	if (PyInt_Check(x))
2270	    *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2271	else if (x == Py_None) {
2272	    /* undefined mapping */
2273	    if (translate_error(&s, &p, errors,
2274				"character maps to <undefined>")) {
2275		Py_DECREF(x);
2276		goto onError;
2277	    }
2278	}
2279	else if (PyUnicode_Check(x)) {
2280	    if (PyUnicode_GET_SIZE(x) != 1) {
2281		/* 1-n mapping */
2282		PyErr_SetString(PyExc_NotImplementedError,
2283				"1-n mappings are currently not implemented");
2284		Py_DECREF(x);
2285		goto onError;
2286	    }
2287	    *p++ = *PyUnicode_AS_UNICODE(x);
2288	}
2289	else {
2290	    /* wrong return value */
2291	    PyErr_SetString(PyExc_TypeError,
2292		  "translate mapping must return integer, None or unicode");
2293	    Py_DECREF(x);
2294	    goto onError;
2295	}
2296	Py_DECREF(x);
2297    }
2298    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2299	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2300	    goto onError;
2301
2302 done:
2303    return (PyObject *)v;
2304
2305 onError:
2306    Py_XDECREF(v);
2307    return NULL;
2308}
2309
2310PyObject *PyUnicode_Translate(PyObject *str,
2311			      PyObject *mapping,
2312			      const char *errors)
2313{
2314    PyObject *result;
2315
2316    str = PyUnicode_FromObject(str);
2317    if (str == NULL)
2318	goto onError;
2319    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2320					PyUnicode_GET_SIZE(str),
2321					mapping,
2322					errors);
2323    Py_DECREF(str);
2324    return result;
2325
2326 onError:
2327    Py_XDECREF(str);
2328    return NULL;
2329}
2330
2331/* --- Decimal Encoder ---------------------------------------------------- */
2332
2333int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2334			    int length,
2335			    char *output,
2336			    const char *errors)
2337{
2338    Py_UNICODE *p, *end;
2339
2340    if (output == NULL) {
2341	PyErr_BadArgument();
2342	return -1;
2343    }
2344
2345    p = s;
2346    end = s + length;
2347    while (p < end) {
2348	register Py_UNICODE ch = *p++;
2349	int decimal;
2350
2351	if (Py_UNICODE_ISSPACE(ch)) {
2352	    *output++ = ' ';
2353	    continue;
2354	}
2355	decimal = Py_UNICODE_TODECIMAL(ch);
2356	if (decimal >= 0) {
2357	    *output++ = '0' + decimal;
2358	    continue;
2359	}
2360	if (0 < ch && ch < 256) {
2361	    *output++ = (char)ch;
2362	    continue;
2363	}
2364	/* All other characters are considered invalid */
2365	if (errors == NULL || strcmp(errors, "strict") == 0) {
2366	    PyErr_SetString(PyExc_ValueError,
2367			    "invalid decimal Unicode string");
2368	    goto onError;
2369	}
2370	else if (strcmp(errors, "ignore") == 0)
2371	    continue;
2372	else if (strcmp(errors, "replace") == 0) {
2373	    *output++ = '?';
2374	    continue;
2375	}
2376    }
2377    /* 0-terminate the output string */
2378    *output++ = '\0';
2379    return 0;
2380
2381 onError:
2382    return -1;
2383}
2384
2385/* --- Helpers ------------------------------------------------------------ */
2386
2387static
2388int count(PyUnicodeObject *self,
2389	  int start,
2390	  int end,
2391	  PyUnicodeObject *substring)
2392{
2393    int count = 0;
2394
2395    if (substring->length == 0)
2396	return (end - start + 1);
2397
2398    end -= substring->length;
2399
2400    while (start <= end)
2401        if (Py_UNICODE_MATCH(self, start, substring)) {
2402            count++;
2403            start += substring->length;
2404        } else
2405            start++;
2406
2407    return count;
2408}
2409
2410int PyUnicode_Count(PyObject *str,
2411		    PyObject *substr,
2412		    int start,
2413		    int end)
2414{
2415    int result;
2416
2417    str = PyUnicode_FromObject(str);
2418    if (str == NULL)
2419	return -1;
2420    substr = PyUnicode_FromObject(substr);
2421    if (substr == NULL) {
2422	Py_DECREF(str);
2423	return -1;
2424    }
2425
2426    result = count((PyUnicodeObject *)str,
2427		   start, end,
2428		   (PyUnicodeObject *)substr);
2429
2430    Py_DECREF(str);
2431    Py_DECREF(substr);
2432    return result;
2433}
2434
2435static
2436int findstring(PyUnicodeObject *self,
2437	       PyUnicodeObject *substring,
2438	       int start,
2439	       int end,
2440	       int direction)
2441{
2442    if (start < 0)
2443        start += self->length;
2444    if (start < 0)
2445        start = 0;
2446
2447    if (substring->length == 0)
2448        return start;
2449
2450    if (end > self->length)
2451        end = self->length;
2452    if (end < 0)
2453        end += self->length;
2454    if (end < 0)
2455        end = 0;
2456
2457    end -= substring->length;
2458
2459    if (direction < 0) {
2460        for (; end >= start; end--)
2461            if (Py_UNICODE_MATCH(self, end, substring))
2462                return end;
2463    } else {
2464        for (; start <= end; start++)
2465            if (Py_UNICODE_MATCH(self, start, substring))
2466                return start;
2467    }
2468
2469    return -1;
2470}
2471
2472int PyUnicode_Find(PyObject *str,
2473		   PyObject *substr,
2474		   int start,
2475		   int end,
2476		   int direction)
2477{
2478    int result;
2479
2480    str = PyUnicode_FromObject(str);
2481    if (str == NULL)
2482	return -1;
2483    substr = PyUnicode_FromObject(substr);
2484    if (substr == NULL) {
2485	Py_DECREF(substr);
2486	return -1;
2487    }
2488
2489    result = findstring((PyUnicodeObject *)str,
2490			(PyUnicodeObject *)substr,
2491			start, end, direction);
2492    Py_DECREF(str);
2493    Py_DECREF(substr);
2494    return result;
2495}
2496
2497static
2498int tailmatch(PyUnicodeObject *self,
2499	      PyUnicodeObject *substring,
2500	      int start,
2501	      int end,
2502	      int direction)
2503{
2504    if (start < 0)
2505        start += self->length;
2506    if (start < 0)
2507        start = 0;
2508
2509    if (substring->length == 0)
2510        return 1;
2511
2512    if (end > self->length)
2513        end = self->length;
2514    if (end < 0)
2515        end += self->length;
2516    if (end < 0)
2517        end = 0;
2518
2519    end -= substring->length;
2520    if (end < start)
2521	return 0;
2522
2523    if (direction > 0) {
2524	if (Py_UNICODE_MATCH(self, end, substring))
2525	    return 1;
2526    } else {
2527        if (Py_UNICODE_MATCH(self, start, substring))
2528	    return 1;
2529    }
2530
2531    return 0;
2532}
2533
2534int PyUnicode_Tailmatch(PyObject *str,
2535			PyObject *substr,
2536			int start,
2537			int end,
2538			int direction)
2539{
2540    int result;
2541
2542    str = PyUnicode_FromObject(str);
2543    if (str == NULL)
2544	return -1;
2545    substr = PyUnicode_FromObject(substr);
2546    if (substr == NULL) {
2547	Py_DECREF(substr);
2548	return -1;
2549    }
2550
2551    result = tailmatch((PyUnicodeObject *)str,
2552		       (PyUnicodeObject *)substr,
2553		       start, end, direction);
2554    Py_DECREF(str);
2555    Py_DECREF(substr);
2556    return result;
2557}
2558
2559static
2560const Py_UNICODE *findchar(const Py_UNICODE *s,
2561		     int size,
2562		     Py_UNICODE ch)
2563{
2564    /* like wcschr, but doesn't stop at NULL characters */
2565
2566    while (size-- > 0) {
2567        if (*s == ch)
2568            return s;
2569        s++;
2570    }
2571
2572    return NULL;
2573}
2574
2575/* Apply fixfct filter to the Unicode object self and return a
2576   reference to the modified object */
2577
2578static
2579PyObject *fixup(PyUnicodeObject *self,
2580		int (*fixfct)(PyUnicodeObject *s))
2581{
2582
2583    PyUnicodeObject *u;
2584
2585    u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2586						 self->length);
2587    if (u == NULL)
2588	return NULL;
2589    if (!fixfct(u)) {
2590	/* fixfct should return TRUE if it modified the buffer. If
2591	   FALSE, return a reference to the original buffer instead
2592	   (to save space, not time) */
2593	Py_INCREF(self);
2594	Py_DECREF(u);
2595	return (PyObject*) self;
2596    }
2597    return (PyObject*) u;
2598}
2599
2600static
2601int fixupper(PyUnicodeObject *self)
2602{
2603    int len = self->length;
2604    Py_UNICODE *s = self->str;
2605    int status = 0;
2606
2607    while (len-- > 0) {
2608	register Py_UNICODE ch;
2609
2610	ch = Py_UNICODE_TOUPPER(*s);
2611	if (ch != *s) {
2612            status = 1;
2613	    *s = ch;
2614	}
2615        s++;
2616    }
2617
2618    return status;
2619}
2620
2621static
2622int fixlower(PyUnicodeObject *self)
2623{
2624    int len = self->length;
2625    Py_UNICODE *s = self->str;
2626    int status = 0;
2627
2628    while (len-- > 0) {
2629	register Py_UNICODE ch;
2630
2631	ch = Py_UNICODE_TOLOWER(*s);
2632	if (ch != *s) {
2633            status = 1;
2634	    *s = ch;
2635	}
2636        s++;
2637    }
2638
2639    return status;
2640}
2641
2642static
2643int fixswapcase(PyUnicodeObject *self)
2644{
2645    int len = self->length;
2646    Py_UNICODE *s = self->str;
2647    int status = 0;
2648
2649    while (len-- > 0) {
2650        if (Py_UNICODE_ISUPPER(*s)) {
2651            *s = Py_UNICODE_TOLOWER(*s);
2652            status = 1;
2653        } else if (Py_UNICODE_ISLOWER(*s)) {
2654            *s = Py_UNICODE_TOUPPER(*s);
2655            status = 1;
2656        }
2657        s++;
2658    }
2659
2660    return status;
2661}
2662
2663static
2664int fixcapitalize(PyUnicodeObject *self)
2665{
2666    if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2667	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2668	return 1;
2669    }
2670    return 0;
2671}
2672
2673static
2674int fixtitle(PyUnicodeObject *self)
2675{
2676    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2677    register Py_UNICODE *e;
2678    int previous_is_cased;
2679
2680    /* Shortcut for single character strings */
2681    if (PyUnicode_GET_SIZE(self) == 1) {
2682	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2683	if (*p != ch) {
2684	    *p = ch;
2685	    return 1;
2686	}
2687	else
2688	    return 0;
2689    }
2690
2691    e = p + PyUnicode_GET_SIZE(self);
2692    previous_is_cased = 0;
2693    for (; p < e; p++) {
2694	register const Py_UNICODE ch = *p;
2695
2696	if (previous_is_cased)
2697	    *p = Py_UNICODE_TOLOWER(ch);
2698	else
2699	    *p = Py_UNICODE_TOTITLE(ch);
2700
2701	if (Py_UNICODE_ISLOWER(ch) ||
2702	    Py_UNICODE_ISUPPER(ch) ||
2703	    Py_UNICODE_ISTITLE(ch))
2704	    previous_is_cased = 1;
2705	else
2706	    previous_is_cased = 0;
2707    }
2708    return 1;
2709}
2710
2711PyObject *PyUnicode_Join(PyObject *separator,
2712			 PyObject *seq)
2713{
2714    Py_UNICODE *sep;
2715    int seplen;
2716    PyUnicodeObject *res = NULL;
2717    int reslen = 0;
2718    Py_UNICODE *p;
2719    int seqlen = 0;
2720    int sz = 100;
2721    int i;
2722
2723    seqlen = PySequence_Size(seq);
2724    if (seqlen < 0 && PyErr_Occurred())
2725	return NULL;
2726
2727    if (separator == NULL) {
2728	Py_UNICODE blank = ' ';
2729	sep = &blank;
2730	seplen = 1;
2731    }
2732    else {
2733	separator = PyUnicode_FromObject(separator);
2734	if (separator == NULL)
2735	    return NULL;
2736	sep = PyUnicode_AS_UNICODE(separator);
2737	seplen = PyUnicode_GET_SIZE(separator);
2738    }
2739
2740    res = _PyUnicode_New(sz);
2741    if (res == NULL)
2742	goto onError;
2743    p = PyUnicode_AS_UNICODE(res);
2744    reslen = 0;
2745
2746    for (i = 0; i < seqlen; i++) {
2747	int itemlen;
2748	PyObject *item;
2749
2750	item = PySequence_GetItem(seq, i);
2751	if (item == NULL)
2752	    goto onError;
2753	if (!PyUnicode_Check(item)) {
2754	    PyObject *v;
2755	    v = PyUnicode_FromObject(item);
2756	    Py_DECREF(item);
2757	    item = v;
2758	    if (item == NULL)
2759		goto onError;
2760	}
2761	itemlen = PyUnicode_GET_SIZE(item);
2762	while (reslen + itemlen + seplen >= sz) {
2763	    if (_PyUnicode_Resize(res, sz*2))
2764		goto onError;
2765	    sz *= 2;
2766	    p = PyUnicode_AS_UNICODE(res) + reslen;
2767	}
2768	if (i > 0) {
2769	    memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2770	    p += seplen;
2771	    reslen += seplen;
2772	}
2773	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2774	p += itemlen;
2775	reslen += itemlen;
2776	Py_DECREF(item);
2777    }
2778    if (_PyUnicode_Resize(res, reslen))
2779	goto onError;
2780
2781    Py_XDECREF(separator);
2782    return (PyObject *)res;
2783
2784 onError:
2785    Py_XDECREF(separator);
2786    Py_DECREF(res);
2787    return NULL;
2788}
2789
2790static
2791PyUnicodeObject *pad(PyUnicodeObject *self,
2792		     int left,
2793		     int right,
2794		     Py_UNICODE fill)
2795{
2796    PyUnicodeObject *u;
2797
2798    if (left < 0)
2799        left = 0;
2800    if (right < 0)
2801        right = 0;
2802
2803    if (left == 0 && right == 0) {
2804        Py_INCREF(self);
2805        return self;
2806    }
2807
2808    u = _PyUnicode_New(left + self->length + right);
2809    if (u) {
2810        if (left)
2811            Py_UNICODE_FILL(u->str, fill, left);
2812        Py_UNICODE_COPY(u->str + left, self->str, self->length);
2813        if (right)
2814            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2815    }
2816
2817    return u;
2818}
2819
/* Append the slice data[left:right] to list as a new Unicode object.

   The macro relies on names provided by the function using it: a
   PyObject *str scratch variable, a PyObject *list, and an onError
   label that is jumped to if creating the substring or appending it
   fails.  The temporary reference held in str is released in every
   case.  The body is wrapped in do/while(0) so that the macro behaves
   like a single statement; invoke it as SPLIT_APPEND(...); */
#define SPLIT_APPEND(data, left, right)					\
	do {								\
	    str = PyUnicode_FromUnicode((data) + (left),		\
					(right) - (left));		\
	    if (!str)							\
		goto onError;						\
	    if (PyList_Append(list, str)) {				\
		Py_DECREF(str);						\
		goto onError;						\
	    }								\
	    Py_DECREF(str);						\
	} while (0)
2830
2831static
2832PyObject *split_whitespace(PyUnicodeObject *self,
2833			   PyObject *list,
2834			   int maxcount)
2835{
2836    register int i;
2837    register int j;
2838    int len = self->length;
2839    PyObject *str;
2840
2841    for (i = j = 0; i < len; ) {
2842	/* find a token */
2843	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2844	    i++;
2845	j = i;
2846	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2847	    i++;
2848	if (j < i) {
2849	    if (maxcount-- <= 0)
2850		break;
2851	    SPLIT_APPEND(self->str, j, i);
2852	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2853		i++;
2854	    j = i;
2855	}
2856    }
2857    if (j < len) {
2858	SPLIT_APPEND(self->str, j, len);
2859    }
2860    return list;
2861
2862 onError:
2863    Py_DECREF(list);
2864    return NULL;
2865}
2866
2867PyObject *PyUnicode_Splitlines(PyObject *string,
2868			       int keepends)
2869{
2870    register int i;
2871    register int j;
2872    int len;
2873    PyObject *list;
2874    PyObject *str;
2875    Py_UNICODE *data;
2876
2877    string = PyUnicode_FromObject(string);
2878    if (string == NULL)
2879	return NULL;
2880    data = PyUnicode_AS_UNICODE(string);
2881    len = PyUnicode_GET_SIZE(string);
2882
2883    list = PyList_New(0);
2884    if (!list)
2885        goto onError;
2886
2887    for (i = j = 0; i < len; ) {
2888	int eol;
2889
2890	/* Find a line and append it */
2891	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2892	    i++;
2893
2894	/* Skip the line break reading CRLF as one line break */
2895	eol = i;
2896	if (i < len) {
2897	    if (data[i] == '\r' && i + 1 < len &&
2898		data[i+1] == '\n')
2899		i += 2;
2900	    else
2901		i++;
2902	    if (keepends)
2903		eol = i;
2904	}
2905	SPLIT_APPEND(data, j, eol);
2906	j = i;
2907    }
2908    if (j < len) {
2909	SPLIT_APPEND(data, j, len);
2910    }
2911
2912    Py_DECREF(string);
2913    return list;
2914
2915 onError:
2916    Py_DECREF(list);
2917    Py_DECREF(string);
2918    return NULL;
2919}
2920
2921static
2922PyObject *split_char(PyUnicodeObject *self,
2923		     PyObject *list,
2924		     Py_UNICODE ch,
2925		     int maxcount)
2926{
2927    register int i;
2928    register int j;
2929    int len = self->length;
2930    PyObject *str;
2931
2932    for (i = j = 0; i < len; ) {
2933	if (self->str[i] == ch) {
2934	    if (maxcount-- <= 0)
2935		break;
2936	    SPLIT_APPEND(self->str, j, i);
2937	    i = j = i + 1;
2938	} else
2939	    i++;
2940    }
2941    if (j <= len) {
2942	SPLIT_APPEND(self->str, j, len);
2943    }
2944    return list;
2945
2946 onError:
2947    Py_DECREF(list);
2948    return NULL;
2949}
2950
2951static
2952PyObject *split_substring(PyUnicodeObject *self,
2953			  PyObject *list,
2954			  PyUnicodeObject *substring,
2955			  int maxcount)
2956{
2957    register int i;
2958    register int j;
2959    int len = self->length;
2960    int sublen = substring->length;
2961    PyObject *str;
2962
2963    for (i = j = 0; i <= len - sublen; ) {
2964	if (Py_UNICODE_MATCH(self, i, substring)) {
2965	    if (maxcount-- <= 0)
2966		break;
2967	    SPLIT_APPEND(self->str, j, i);
2968	    i = j = i + sublen;
2969	} else
2970	    i++;
2971    }
2972    if (j <= len) {
2973	SPLIT_APPEND(self->str, j, len);
2974    }
2975    return list;
2976
2977 onError:
2978    Py_DECREF(list);
2979    return NULL;
2980}
2981
2982#undef SPLIT_APPEND
2983
2984static
2985PyObject *split(PyUnicodeObject *self,
2986		PyUnicodeObject *substring,
2987		int maxcount)
2988{
2989    PyObject *list;
2990
2991    if (maxcount < 0)
2992        maxcount = INT_MAX;
2993
2994    list = PyList_New(0);
2995    if (!list)
2996        return NULL;
2997
2998    if (substring == NULL)
2999	return split_whitespace(self,list,maxcount);
3000
3001    else if (substring->length == 1)
3002	return split_char(self,list,substring->str[0],maxcount);
3003
3004    else if (substring->length == 0) {
3005	Py_DECREF(list);
3006	PyErr_SetString(PyExc_ValueError, "empty separator");
3007	return NULL;
3008    }
3009    else
3010	return split_substring(self,list,substring,maxcount);
3011}
3012
3013static
3014PyObject *strip(PyUnicodeObject *self,
3015		int left,
3016		int right)
3017{
3018    Py_UNICODE *p = self->str;
3019    int start = 0;
3020    int end = self->length;
3021
3022    if (left)
3023        while (start < end && Py_UNICODE_ISSPACE(p[start]))
3024            start++;
3025
3026    if (right)
3027        while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3028            end--;
3029
3030    if (start == 0 && end == self->length) {
3031        /* couldn't strip anything off, return original string */
3032        Py_INCREF(self);
3033        return (PyObject*) self;
3034    }
3035
3036    return (PyObject*) PyUnicode_FromUnicode(
3037        self->str + start,
3038        end - start
3039        );
3040}
3041
3042static
3043PyObject *replace(PyUnicodeObject *self,
3044		  PyUnicodeObject *str1,
3045		  PyUnicodeObject *str2,
3046		  int maxcount)
3047{
3048    PyUnicodeObject *u;
3049
3050    if (maxcount < 0)
3051	maxcount = INT_MAX;
3052
3053    if (str1->length == 1 && str2->length == 1) {
3054        int i;
3055
3056        /* replace characters */
3057        if (!findchar(self->str, self->length, str1->str[0])) {
3058            /* nothing to replace, return original string */
3059            Py_INCREF(self);
3060            u = self;
3061        } else {
3062	    Py_UNICODE u1 = str1->str[0];
3063	    Py_UNICODE u2 = str2->str[0];
3064
3065            u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3066                self->str,
3067                self->length
3068                );
3069            if (u)
3070                for (i = 0; i < u->length; i++)
3071                    if (u->str[i] == u1) {
3072                        if (--maxcount < 0)
3073                            break;
3074                        u->str[i] = u2;
3075                    }
3076        }
3077
3078    } else {
3079        int n, i;
3080        Py_UNICODE *p;
3081
3082        /* replace strings */
3083        n = count(self, 0, self->length, str1);
3084        if (n > maxcount)
3085            n = maxcount;
3086        if (n == 0) {
3087            /* nothing to replace, return original string */
3088            Py_INCREF(self);
3089            u = self;
3090        } else {
3091            u = _PyUnicode_New(
3092                self->length + n * (str2->length - str1->length));
3093            if (u) {
3094                i = 0;
3095                p = u->str;
3096                while (i <= self->length - str1->length)
3097                    if (Py_UNICODE_MATCH(self, i, str1)) {
3098                        /* replace string segment */
3099                        Py_UNICODE_COPY(p, str2->str, str2->length);
3100                        p += str2->length;
3101                        i += str1->length;
3102                        if (--n <= 0) {
3103                            /* copy remaining part */
3104                            Py_UNICODE_COPY(p, self->str+i, self->length-i);
3105                            break;
3106                        }
3107                    } else
3108                        *p++ = self->str[i++];
3109            }
3110        }
3111    }
3112
3113    return (PyObject *) u;
3114}
3115
3116/* --- Unicode Object Methods --------------------------------------------- */
3117
3118static char title__doc__[] =
3119"S.title() -> unicode\n\
3120\n\
3121Return a titlecased version of S, i.e. words start with title case\n\
3122characters, all remaining cased characters have lower case.";
3123
3124static PyObject*
3125unicode_title(PyUnicodeObject *self, PyObject *args)
3126{
3127    if (!PyArg_NoArgs(args))
3128        return NULL;
3129    return fixup(self, fixtitle);
3130}
3131
3132static char capitalize__doc__[] =
3133"S.capitalize() -> unicode\n\
3134\n\
3135Return a capitalized version of S, i.e. make the first character\n\
3136have upper case.";
3137
3138static PyObject*
3139unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3140{
3141    if (!PyArg_NoArgs(args))
3142        return NULL;
3143    return fixup(self, fixcapitalize);
3144}
3145
#if 0
/* Disabled: this method is compiled out. */
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords(): split self at whitespace, capitalize each word
   and join the words again with the default separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *item;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Break the string into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Replace every word by its capitalized form */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                     fixcapitalize);
        if (item == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, item);
    }

    /* Glue the words back together */
    item = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
3186
3187static char center__doc__[] =
3188"S.center(width) -> unicode\n\
3189\n\
3190Return S centered in a Unicode string of length width. Padding is done\n\
3191using spaces.";
3192
3193static PyObject *
3194unicode_center(PyUnicodeObject *self, PyObject *args)
3195{
3196    int marg, left;
3197    int width;
3198
3199    if (!PyArg_ParseTuple(args, "i:center", &width))
3200        return NULL;
3201
3202    if (self->length >= width) {
3203        Py_INCREF(self);
3204        return (PyObject*) self;
3205    }
3206
3207    marg = width - self->length;
3208    left = marg / 2 + (marg & width & 1);
3209
3210    return (PyObject*) pad(self, left, marg - left, ' ');
3211}
3212
3213#if 0
3214
3215/* This code should go into some future Unicode collation support
3216   module. The basic comparison should compare ordinals on a naive
3217   basis (this is what Java does and thus JPython too). */
3218
3219/* speedy UTF-16 code point order comparison */
3220/* gleaned from: */
3221/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3222
3223static short utf16Fixup[32] =
3224{
3225    0, 0, 0, 0, 0, 0, 0, 0,
3226    0, 0, 0, 0, 0, 0, 0, 0,
3227    0, 0, 0, 0, 0, 0, 0, 0,
3228    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3229};
3230
3231static int
3232unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3233{
3234    int len1, len2;
3235
3236    Py_UNICODE *s1 = str1->str;
3237    Py_UNICODE *s2 = str2->str;
3238
3239    len1 = str1->length;
3240    len2 = str2->length;
3241
3242    while (len1 > 0 && len2 > 0) {
3243        Py_UNICODE c1, c2;
3244	long diff;
3245
3246        c1 = *s1++;
3247        c2 = *s2++;
3248	if (c1 > (1<<11) * 26)
3249	    c1 += utf16Fixup[c1>>11];
3250	if (c2 > (1<<11) * 26)
3251            c2 += utf16Fixup[c2>>11];
3252
3253        /* now c1 and c2 are in UTF-32-compatible order */
3254        diff = (long)c1 - (long)c2;
3255        if (diff)
3256            return (diff < 0) ? -1 : (diff != 0);
3257        len1--; len2--;
3258    }
3259
3260    return (len1 < len2) ? -1 : (len1 != len2);
3261}
3262
3263#else
3264
3265static int
3266unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3267{
3268    register int len1, len2;
3269
3270    Py_UNICODE *s1 = str1->str;
3271    Py_UNICODE *s2 = str2->str;
3272
3273    len1 = str1->length;
3274    len2 = str2->length;
3275
3276    while (len1 > 0 && len2 > 0) {
3277	register long diff;
3278
3279        diff = (long)*s1++ - (long)*s2++;
3280        if (diff)
3281            return (diff < 0) ? -1 : (diff != 0);
3282        len1--; len2--;
3283    }
3284
3285    return (len1 < len2) ? -1 : (len1 != len2);
3286}
3287
3288#endif
3289
3290int PyUnicode_Compare(PyObject *left,
3291		      PyObject *right)
3292{
3293    PyUnicodeObject *u = NULL, *v = NULL;
3294    int result;
3295
3296    /* Coerce the two arguments */
3297    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3298    if (u == NULL)
3299	goto onError;
3300    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3301    if (v == NULL)
3302	goto onError;
3303
3304    /* Shortcut for empty or interned objects */
3305    if (v == u) {
3306	Py_DECREF(u);
3307	Py_DECREF(v);
3308	return 0;
3309    }
3310
3311    result = unicode_compare(u, v);
3312
3313    Py_DECREF(u);
3314    Py_DECREF(v);
3315    return result;
3316
3317onError:
3318    Py_XDECREF(u);
3319    Py_XDECREF(v);
3320    return -1;
3321}
3322
3323int PyUnicode_Contains(PyObject *container,
3324		       PyObject *element)
3325{
3326    PyUnicodeObject *u = NULL, *v = NULL;
3327    int result;
3328    register const Py_UNICODE *p, *e;
3329    register Py_UNICODE ch;
3330
3331    /* Coerce the two arguments */
3332    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3333    if (v == NULL) {
3334	PyErr_SetString(PyExc_TypeError,
3335	    "'in <string>' requires character as left operand");
3336	goto onError;
3337    }
3338    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3339    if (u == NULL) {
3340	Py_DECREF(v);
3341	goto onError;
3342    }
3343
3344    /* Check v in u */
3345    if (PyUnicode_GET_SIZE(v) != 1) {
3346	PyErr_SetString(PyExc_TypeError,
3347	    "'in <string>' requires character as left operand");
3348	goto onError;
3349    }
3350    ch = *PyUnicode_AS_UNICODE(v);
3351    p = PyUnicode_AS_UNICODE(u);
3352    e = p + PyUnicode_GET_SIZE(u);
3353    result = 0;
3354    while (p < e) {
3355	if (*p++ == ch) {
3356	    result = 1;
3357	    break;
3358	}
3359    }
3360
3361    Py_DECREF(u);
3362    Py_DECREF(v);
3363    return result;
3364
3365onError:
3366    Py_XDECREF(u);
3367    Py_XDECREF(v);
3368    return -1;
3369}
3370
3371/* Concat to string or Unicode object giving a new Unicode object. */
3372
3373PyObject *PyUnicode_Concat(PyObject *left,
3374			   PyObject *right)
3375{
3376    PyUnicodeObject *u = NULL, *v = NULL, *w;
3377
3378    /* Coerce the two arguments */
3379    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3380    if (u == NULL)
3381	goto onError;
3382    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3383    if (v == NULL)
3384	goto onError;
3385
3386    /* Shortcuts */
3387    if (v == unicode_empty) {
3388	Py_DECREF(v);
3389	return (PyObject *)u;
3390    }
3391    if (u == unicode_empty) {
3392	Py_DECREF(u);
3393	return (PyObject *)v;
3394    }
3395
3396    /* Concat the two Unicode strings */
3397    w = _PyUnicode_New(u->length + v->length);
3398    if (w == NULL)
3399	goto onError;
3400    Py_UNICODE_COPY(w->str, u->str, u->length);
3401    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3402
3403    Py_DECREF(u);
3404    Py_DECREF(v);
3405    return (PyObject *)w;
3406
3407onError:
3408    Py_XDECREF(u);
3409    Py_XDECREF(v);
3410    return NULL;
3411}
3412
3413static char count__doc__[] =
3414"S.count(sub[, start[, end]]) -> int\n\
3415\n\
3416Return the number of occurrences of substring sub in Unicode string\n\
3417S[start:end].  Optional arguments start and end are\n\
3418interpreted as in slice notation.";
3419
3420static PyObject *
3421unicode_count(PyUnicodeObject *self, PyObject *args)
3422{
3423    PyUnicodeObject *substring;
3424    int start = 0;
3425    int end = INT_MAX;
3426    PyObject *result;
3427
3428    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3429		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3430        return NULL;
3431
3432    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3433						(PyObject *)substring);
3434    if (substring == NULL)
3435	return NULL;
3436
3437    if (start < 0)
3438        start += self->length;
3439    if (start < 0)
3440        start = 0;
3441    if (end > self->length)
3442        end = self->length;
3443    if (end < 0)
3444        end += self->length;
3445    if (end < 0)
3446        end = 0;
3447
3448    result = PyInt_FromLong((long) count(self, start, end, substring));
3449
3450    Py_DECREF(substring);
3451    return result;
3452}
3453
3454static char encode__doc__[] =
3455"S.encode([encoding[,errors]]) -> string\n\
3456\n\
3457Return an encoded string version of S. Default encoding is the current\n\
3458default string encoding. errors may be given to set a different error\n\
3459handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3460a ValueError. Other possible values are 'ignore' and 'replace'.";
3461
3462static PyObject *
3463unicode_encode(PyUnicodeObject *self, PyObject *args)
3464{
3465    char *encoding = NULL;
3466    char *errors = NULL;
3467    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3468        return NULL;
3469    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3470}
3471
3472static char expandtabs__doc__[] =
3473"S.expandtabs([tabsize]) -> unicode\n\
3474\n\
3475Return a copy of S where all tab characters are expanded using spaces.\n\
3476If tabsize is not given, a tab size of 8 characters is assumed.";
3477
3478static PyObject*
3479unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3480{
3481    Py_UNICODE *e;
3482    Py_UNICODE *p;
3483    Py_UNICODE *q;
3484    int i, j;
3485    PyUnicodeObject *u;
3486    int tabsize = 8;
3487
3488    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3489	return NULL;
3490
3491    /* First pass: determine size of output string */
3492    i = j = 0;
3493    e = self->str + self->length;
3494    for (p = self->str; p < e; p++)
3495        if (*p == '\t') {
3496	    if (tabsize > 0)
3497		j += tabsize - (j % tabsize);
3498	}
3499        else {
3500            j++;
3501            if (*p == '\n' || *p == '\r') {
3502                i += j;
3503                j = 0;
3504            }
3505        }
3506
3507    /* Second pass: create output string and fill it */
3508    u = _PyUnicode_New(i + j);
3509    if (!u)
3510        return NULL;
3511
3512    j = 0;
3513    q = u->str;
3514
3515    for (p = self->str; p < e; p++)
3516        if (*p == '\t') {
3517	    if (tabsize > 0) {
3518		i = tabsize - (j % tabsize);
3519		j += i;
3520		while (i--)
3521		    *q++ = ' ';
3522	    }
3523	}
3524	else {
3525            j++;
3526	    *q++ = *p;
3527            if (*p == '\n' || *p == '\r')
3528                j = 0;
3529        }
3530
3531    return (PyObject*) u;
3532}
3533
3534static char find__doc__[] =
3535"S.find(sub [,start [,end]]) -> int\n\
3536\n\
3537Return the lowest index in S where substring sub is found,\n\
3538such that sub is contained within s[start,end].  Optional\n\
3539arguments start and end are interpreted as in slice notation.\n\
3540\n\
3541Return -1 on failure.";
3542
3543static PyObject *
3544unicode_find(PyUnicodeObject *self, PyObject *args)
3545{
3546    PyUnicodeObject *substring;
3547    int start = 0;
3548    int end = INT_MAX;
3549    PyObject *result;
3550
3551    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3552		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3553        return NULL;
3554    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3555						(PyObject *)substring);
3556    if (substring == NULL)
3557	return NULL;
3558
3559    result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3560
3561    Py_DECREF(substring);
3562    return result;
3563}
3564
3565static PyObject *
3566unicode_getitem(PyUnicodeObject *self, int index)
3567{
3568    if (index < 0 || index >= self->length) {
3569        PyErr_SetString(PyExc_IndexError, "string index out of range");
3570        return NULL;
3571    }
3572
3573    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3574}
3575
3576static long
3577unicode_hash(PyUnicodeObject *self)
3578{
3579    /* Since Unicode objects compare equal to their ASCII string
3580       counterparts, they should use the individual character values
3581       as basis for their hash value.  This is needed to assure that
3582       strings and Unicode objects behave in the same way as
3583       dictionary keys. */
3584
3585    register int len;
3586    register Py_UNICODE *p;
3587    register long x;
3588
3589    if (self->hash != -1)
3590	return self->hash;
3591    len = PyUnicode_GET_SIZE(self);
3592    p = PyUnicode_AS_UNICODE(self);
3593    x = *p << 7;
3594    while (--len >= 0)
3595	x = (1000003*x) ^ *p++;
3596    x ^= PyUnicode_GET_SIZE(self);
3597    if (x == -1)
3598	x = -2;
3599    self->hash = x;
3600    return x;
3601}
3602
3603static char index__doc__[] =
3604"S.index(sub [,start [,end]]) -> int\n\
3605\n\
3606Like S.find() but raise ValueError when the substring is not found.";
3607
3608static PyObject *
3609unicode_index(PyUnicodeObject *self, PyObject *args)
3610{
3611    int result;
3612    PyUnicodeObject *substring;
3613    int start = 0;
3614    int end = INT_MAX;
3615
3616    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3617		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3618        return NULL;
3619
3620    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3621						(PyObject *)substring);
3622    if (substring == NULL)
3623	return NULL;
3624
3625    result = findstring(self, substring, start, end, 1);
3626
3627    Py_DECREF(substring);
3628    if (result < 0) {
3629        PyErr_SetString(PyExc_ValueError, "substring not found");
3630        return NULL;
3631    }
3632    return PyInt_FromLong(result);
3633}
3634
3635static char islower__doc__[] =
3636"S.islower() -> int\n\
3637\n\
3638Return 1 if  all cased characters in S are lowercase and there is\n\
3639at least one cased character in S, 0 otherwise.";
3640
3641static PyObject*
3642unicode_islower(PyUnicodeObject *self, PyObject *args)
3643{
3644    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3645    register const Py_UNICODE *e;
3646    int cased;
3647
3648    if (!PyArg_NoArgs(args))
3649        return NULL;
3650
3651    /* Shortcut for single character strings */
3652    if (PyUnicode_GET_SIZE(self) == 1)
3653	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3654
3655    /* Special case for empty strings */
3656    if (PyString_GET_SIZE(self) == 0)
3657	return PyInt_FromLong(0);
3658
3659    e = p + PyUnicode_GET_SIZE(self);
3660    cased = 0;
3661    for (; p < e; p++) {
3662	register const Py_UNICODE ch = *p;
3663
3664	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3665	    return PyInt_FromLong(0);
3666	else if (!cased && Py_UNICODE_ISLOWER(ch))
3667	    cased = 1;
3668    }
3669    return PyInt_FromLong(cased);
3670}
3671
3672static char isupper__doc__[] =
3673"S.isupper() -> int\n\
3674\n\
3675Return 1 if  all cased characters in S are uppercase and there is\n\
3676at least one cased character in S, 0 otherwise.";
3677
3678static PyObject*
3679unicode_isupper(PyUnicodeObject *self, PyObject *args)
3680{
3681    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3682    register const Py_UNICODE *e;
3683    int cased;
3684
3685    if (!PyArg_NoArgs(args))
3686        return NULL;
3687
3688    /* Shortcut for single character strings */
3689    if (PyUnicode_GET_SIZE(self) == 1)
3690	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3691
3692    /* Special case for empty strings */
3693    if (PyString_GET_SIZE(self) == 0)
3694	return PyInt_FromLong(0);
3695
3696    e = p + PyUnicode_GET_SIZE(self);
3697    cased = 0;
3698    for (; p < e; p++) {
3699	register const Py_UNICODE ch = *p;
3700
3701	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3702	    return PyInt_FromLong(0);
3703	else if (!cased && Py_UNICODE_ISUPPER(ch))
3704	    cased = 1;
3705    }
3706    return PyInt_FromLong(cased);
3707}
3708
3709static char istitle__doc__[] =
3710"S.istitle() -> int\n\
3711\n\
3712Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3713may only follow uncased characters and lowercase characters only cased\n\
3714ones. Return 0 otherwise.";
3715
3716static PyObject*
3717unicode_istitle(PyUnicodeObject *self, PyObject *args)
3718{
3719    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3720    register const Py_UNICODE *e;
3721    int cased, previous_is_cased;
3722
3723    if (!PyArg_NoArgs(args))
3724        return NULL;
3725
3726    /* Shortcut for single character strings */
3727    if (PyUnicode_GET_SIZE(self) == 1)
3728	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3729			      (Py_UNICODE_ISUPPER(*p) != 0));
3730
3731    /* Special case for empty strings */
3732    if (PyString_GET_SIZE(self) == 0)
3733	return PyInt_FromLong(0);
3734
3735    e = p + PyUnicode_GET_SIZE(self);
3736    cased = 0;
3737    previous_is_cased = 0;
3738    for (; p < e; p++) {
3739	register const Py_UNICODE ch = *p;
3740
3741	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3742	    if (previous_is_cased)
3743		return PyInt_FromLong(0);
3744	    previous_is_cased = 1;
3745	    cased = 1;
3746	}
3747	else if (Py_UNICODE_ISLOWER(ch)) {
3748	    if (!previous_is_cased)
3749		return PyInt_FromLong(0);
3750	    previous_is_cased = 1;
3751	    cased = 1;
3752	}
3753	else
3754	    previous_is_cased = 0;
3755    }
3756    return PyInt_FromLong(cased);
3757}
3758
3759static char isspace__doc__[] =
3760"S.isspace() -> int\n\
3761\n\
3762Return 1 if there are only whitespace characters in S,\n\
37630 otherwise.";
3764
3765static PyObject*
3766unicode_isspace(PyUnicodeObject *self, PyObject *args)
3767{
3768    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3769    register const Py_UNICODE *e;
3770
3771    if (!PyArg_NoArgs(args))
3772        return NULL;
3773
3774    /* Shortcut for single character strings */
3775    if (PyUnicode_GET_SIZE(self) == 1 &&
3776	Py_UNICODE_ISSPACE(*p))
3777	return PyInt_FromLong(1);
3778
3779    /* Special case for empty strings */
3780    if (PyString_GET_SIZE(self) == 0)
3781	return PyInt_FromLong(0);
3782
3783    e = p + PyUnicode_GET_SIZE(self);
3784    for (; p < e; p++) {
3785	if (!Py_UNICODE_ISSPACE(*p))
3786	    return PyInt_FromLong(0);
3787    }
3788    return PyInt_FromLong(1);
3789}
3790
3791static char isalpha__doc__[] =
3792"S.isalpha() -> int\n\
3793\n\
3794Return 1 if  all characters in S are alphabetic\n\
3795and there is at least one character in S, 0 otherwise.";
3796
3797static PyObject*
3798unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3799{
3800    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3801    register const Py_UNICODE *e;
3802
3803    if (!PyArg_NoArgs(args))
3804        return NULL;
3805
3806    /* Shortcut for single character strings */
3807    if (PyUnicode_GET_SIZE(self) == 1 &&
3808	Py_UNICODE_ISALPHA(*p))
3809	return PyInt_FromLong(1);
3810
3811    /* Special case for empty strings */
3812    if (PyString_GET_SIZE(self) == 0)
3813	return PyInt_FromLong(0);
3814
3815    e = p + PyUnicode_GET_SIZE(self);
3816    for (; p < e; p++) {
3817	if (!Py_UNICODE_ISALPHA(*p))
3818	    return PyInt_FromLong(0);
3819    }
3820    return PyInt_FromLong(1);
3821}
3822
3823static char isalnum__doc__[] =
3824"S.isalnum() -> int\n\
3825\n\
3826Return 1 if  all characters in S are alphanumeric\n\
3827and there is at least one character in S, 0 otherwise.";
3828
3829static PyObject*
3830unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3831{
3832    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3833    register const Py_UNICODE *e;
3834
3835    if (!PyArg_NoArgs(args))
3836        return NULL;
3837
3838    /* Shortcut for single character strings */
3839    if (PyUnicode_GET_SIZE(self) == 1 &&
3840	Py_UNICODE_ISALNUM(*p))
3841	return PyInt_FromLong(1);
3842
3843    /* Special case for empty strings */
3844    if (PyString_GET_SIZE(self) == 0)
3845	return PyInt_FromLong(0);
3846
3847    e = p + PyUnicode_GET_SIZE(self);
3848    for (; p < e; p++) {
3849	if (!Py_UNICODE_ISALNUM(*p))
3850	    return PyInt_FromLong(0);
3851    }
3852    return PyInt_FromLong(1);
3853}
3854
3855static char isdecimal__doc__[] =
3856"S.isdecimal() -> int\n\
3857\n\
3858Return 1 if there are only decimal characters in S,\n\
38590 otherwise.";
3860
3861static PyObject*
3862unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3863{
3864    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3865    register const Py_UNICODE *e;
3866
3867    if (!PyArg_NoArgs(args))
3868        return NULL;
3869
3870    /* Shortcut for single character strings */
3871    if (PyUnicode_GET_SIZE(self) == 1 &&
3872	Py_UNICODE_ISDECIMAL(*p))
3873	return PyInt_FromLong(1);
3874
3875    /* Special case for empty strings */
3876    if (PyString_GET_SIZE(self) == 0)
3877	return PyInt_FromLong(0);
3878
3879    e = p + PyUnicode_GET_SIZE(self);
3880    for (; p < e; p++) {
3881	if (!Py_UNICODE_ISDECIMAL(*p))
3882	    return PyInt_FromLong(0);
3883    }
3884    return PyInt_FromLong(1);
3885}
3886
3887static char isdigit__doc__[] =
3888"S.isdigit() -> int\n\
3889\n\
3890Return 1 if there are only digit characters in S,\n\
38910 otherwise.";
3892
3893static PyObject*
3894unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3895{
3896    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3897    register const Py_UNICODE *e;
3898
3899    if (!PyArg_NoArgs(args))
3900        return NULL;
3901
3902    /* Shortcut for single character strings */
3903    if (PyUnicode_GET_SIZE(self) == 1 &&
3904	Py_UNICODE_ISDIGIT(*p))
3905	return PyInt_FromLong(1);
3906
3907    /* Special case for empty strings */
3908    if (PyString_GET_SIZE(self) == 0)
3909	return PyInt_FromLong(0);
3910
3911    e = p + PyUnicode_GET_SIZE(self);
3912    for (; p < e; p++) {
3913	if (!Py_UNICODE_ISDIGIT(*p))
3914	    return PyInt_FromLong(0);
3915    }
3916    return PyInt_FromLong(1);
3917}
3918
3919static char isnumeric__doc__[] =
3920"S.isnumeric() -> int\n\
3921\n\
3922Return 1 if there are only numeric characters in S,\n\
39230 otherwise.";
3924
3925static PyObject*
3926unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3927{
3928    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3929    register const Py_UNICODE *e;
3930
3931    if (!PyArg_NoArgs(args))
3932        return NULL;
3933
3934    /* Shortcut for single character strings */
3935    if (PyUnicode_GET_SIZE(self) == 1 &&
3936	Py_UNICODE_ISNUMERIC(*p))
3937	return PyInt_FromLong(1);
3938
3939    /* Special case for empty strings */
3940    if (PyString_GET_SIZE(self) == 0)
3941	return PyInt_FromLong(0);
3942
3943    e = p + PyUnicode_GET_SIZE(self);
3944    for (; p < e; p++) {
3945	if (!Py_UNICODE_ISNUMERIC(*p))
3946	    return PyInt_FromLong(0);
3947    }
3948    return PyInt_FromLong(1);
3949}
3950
3951static char join__doc__[] =
3952"S.join(sequence) -> unicode\n\
3953\n\
3954Return a string which is the concatenation of the strings in the\n\
3955sequence.  The separator between elements is S.";
3956
3957static PyObject*
3958unicode_join(PyUnicodeObject *self, PyObject *args)
3959{
3960    PyObject *data;
3961    if (!PyArg_ParseTuple(args, "O:join", &data))
3962        return NULL;
3963
3964    return PyUnicode_Join((PyObject *)self, data);
3965}
3966
3967static int
3968unicode_length(PyUnicodeObject *self)
3969{
3970    return self->length;
3971}
3972
3973static char ljust__doc__[] =
3974"S.ljust(width) -> unicode\n\
3975\n\
3976Return S left justified in a Unicode string of length width. Padding is\n\
3977done using spaces.";
3978
3979static PyObject *
3980unicode_ljust(PyUnicodeObject *self, PyObject *args)
3981{
3982    int width;
3983    if (!PyArg_ParseTuple(args, "i:ljust", &width))
3984        return NULL;
3985
3986    if (self->length >= width) {
3987        Py_INCREF(self);
3988        return (PyObject*) self;
3989    }
3990
3991    return (PyObject*) pad(self, 0, width - self->length, ' ');
3992}
3993
3994static char lower__doc__[] =
3995"S.lower() -> unicode\n\
3996\n\
3997Return a copy of the string S converted to lowercase.";
3998
3999static PyObject*
4000unicode_lower(PyUnicodeObject *self, PyObject *args)
4001{
4002    if (!PyArg_NoArgs(args))
4003        return NULL;
4004    return fixup(self, fixlower);
4005}
4006
4007static char lstrip__doc__[] =
4008"S.lstrip() -> unicode\n\
4009\n\
4010Return a copy of the string S with leading whitespace removed.";
4011
4012static PyObject *
4013unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4014{
4015    if (!PyArg_NoArgs(args))
4016        return NULL;
4017    return strip(self, 1, 0);
4018}
4019
4020static PyObject*
4021unicode_repeat(PyUnicodeObject *str, int len)
4022{
4023    PyUnicodeObject *u;
4024    Py_UNICODE *p;
4025    int nchars;
4026    size_t nbytes;
4027
4028    if (len < 0)
4029        len = 0;
4030
4031    if (len == 1) {
4032        /* no repeat, return original string */
4033        Py_INCREF(str);
4034        return (PyObject*) str;
4035    }
4036
4037    /* ensure # of chars needed doesn't overflow int and # of bytes
4038     * needed doesn't overflow size_t
4039     */
4040    nchars = len * str->length;
4041    if (len && nchars / len != str->length) {
4042        PyErr_SetString(PyExc_OverflowError,
4043                        "repeated string is too long");
4044        return NULL;
4045    }
4046    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4047    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4048        PyErr_SetString(PyExc_OverflowError,
4049                        "repeated string is too long");
4050        return NULL;
4051    }
4052    u = _PyUnicode_New(nchars);
4053    if (!u)
4054        return NULL;
4055
4056    p = u->str;
4057
4058    while (len-- > 0) {
4059        Py_UNICODE_COPY(p, str->str, str->length);
4060        p += str->length;
4061    }
4062
4063    return (PyObject*) u;
4064}
4065
4066PyObject *PyUnicode_Replace(PyObject *obj,
4067			    PyObject *subobj,
4068			    PyObject *replobj,
4069			    int maxcount)
4070{
4071    PyObject *self;
4072    PyObject *str1;
4073    PyObject *str2;
4074    PyObject *result;
4075
4076    self = PyUnicode_FromObject(obj);
4077    if (self == NULL)
4078	return NULL;
4079    str1 = PyUnicode_FromObject(subobj);
4080    if (str1 == NULL) {
4081	Py_DECREF(self);
4082	return NULL;
4083    }
4084    str2 = PyUnicode_FromObject(replobj);
4085    if (str2 == NULL) {
4086	Py_DECREF(self);
4087	Py_DECREF(str1);
4088	return NULL;
4089    }
4090    result = replace((PyUnicodeObject *)self,
4091		     (PyUnicodeObject *)str1,
4092		     (PyUnicodeObject *)str2,
4093		     maxcount);
4094    Py_DECREF(self);
4095    Py_DECREF(str1);
4096    Py_DECREF(str2);
4097    return result;
4098}
4099
4100static char replace__doc__[] =
4101"S.replace (old, new[, maxsplit]) -> unicode\n\
4102\n\
4103Return a copy of S with all occurrences of substring\n\
4104old replaced by new.  If the optional argument maxsplit is\n\
4105given, only the first maxsplit occurrences are replaced.";
4106
4107static PyObject*
4108unicode_replace(PyUnicodeObject *self, PyObject *args)
4109{
4110    PyUnicodeObject *str1;
4111    PyUnicodeObject *str2;
4112    int maxcount = -1;
4113    PyObject *result;
4114
4115    if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4116        return NULL;
4117    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4118    if (str1 == NULL)
4119	return NULL;
4120    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4121    if (str2 == NULL)
4122	return NULL;
4123
4124    result = replace(self, str1, str2, maxcount);
4125
4126    Py_DECREF(str1);
4127    Py_DECREF(str2);
4128    return result;
4129}
4130
4131static
4132PyObject *unicode_repr(PyObject *unicode)
4133{
4134    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4135				PyUnicode_GET_SIZE(unicode),
4136				1);
4137}
4138
4139static char rfind__doc__[] =
4140"S.rfind(sub [,start [,end]]) -> int\n\
4141\n\
4142Return the highest index in S where substring sub is found,\n\
4143such that sub is contained within s[start,end].  Optional\n\
4144arguments start and end are interpreted as in slice notation.\n\
4145\n\
4146Return -1 on failure.";
4147
4148static PyObject *
4149unicode_rfind(PyUnicodeObject *self, PyObject *args)
4150{
4151    PyUnicodeObject *substring;
4152    int start = 0;
4153    int end = INT_MAX;
4154    PyObject *result;
4155
4156    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4157		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4158        return NULL;
4159    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4160						(PyObject *)substring);
4161    if (substring == NULL)
4162	return NULL;
4163
4164    result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4165
4166    Py_DECREF(substring);
4167    return result;
4168}
4169
4170static char rindex__doc__[] =
4171"S.rindex(sub [,start [,end]]) -> int\n\
4172\n\
4173Like S.rfind() but raise ValueError when the substring is not found.";
4174
4175static PyObject *
4176unicode_rindex(PyUnicodeObject *self, PyObject *args)
4177{
4178    int result;
4179    PyUnicodeObject *substring;
4180    int start = 0;
4181    int end = INT_MAX;
4182
4183    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4184		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4185        return NULL;
4186    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4187						(PyObject *)substring);
4188    if (substring == NULL)
4189	return NULL;
4190
4191    result = findstring(self, substring, start, end, -1);
4192
4193    Py_DECREF(substring);
4194    if (result < 0) {
4195        PyErr_SetString(PyExc_ValueError, "substring not found");
4196        return NULL;
4197    }
4198    return PyInt_FromLong(result);
4199}
4200
4201static char rjust__doc__[] =
4202"S.rjust(width) -> unicode\n\
4203\n\
4204Return S right justified in a Unicode string of length width. Padding is\n\
4205done using spaces.";
4206
4207static PyObject *
4208unicode_rjust(PyUnicodeObject *self, PyObject *args)
4209{
4210    int width;
4211    if (!PyArg_ParseTuple(args, "i:rjust", &width))
4212        return NULL;
4213
4214    if (self->length >= width) {
4215        Py_INCREF(self);
4216        return (PyObject*) self;
4217    }
4218
4219    return (PyObject*) pad(self, width - self->length, 0, ' ');
4220}
4221
4222static char rstrip__doc__[] =
4223"S.rstrip() -> unicode\n\
4224\n\
4225Return a copy of the string S with trailing whitespace removed.";
4226
4227static PyObject *
4228unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4229{
4230    if (!PyArg_NoArgs(args))
4231        return NULL;
4232    return strip(self, 0, 1);
4233}
4234
4235static PyObject*
4236unicode_slice(PyUnicodeObject *self, int start, int end)
4237{
4238    /* standard clamping */
4239    if (start < 0)
4240        start = 0;
4241    if (end < 0)
4242        end = 0;
4243    if (end > self->length)
4244        end = self->length;
4245    if (start == 0 && end == self->length) {
4246        /* full slice, return original string */
4247        Py_INCREF(self);
4248        return (PyObject*) self;
4249    }
4250    if (start > end)
4251        start = end;
4252    /* copy slice */
4253    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4254					     end - start);
4255}
4256
4257PyObject *PyUnicode_Split(PyObject *s,
4258			  PyObject *sep,
4259			  int maxsplit)
4260{
4261    PyObject *result;
4262
4263    s = PyUnicode_FromObject(s);
4264    if (s == NULL)
4265	return NULL;
4266    if (sep != NULL) {
4267	sep = PyUnicode_FromObject(sep);
4268	if (sep == NULL) {
4269	    Py_DECREF(s);
4270	    return NULL;
4271	}
4272    }
4273
4274    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4275
4276    Py_DECREF(s);
4277    Py_XDECREF(sep);
4278    return result;
4279}
4280
4281static char split__doc__[] =
4282"S.split([sep [,maxsplit]]) -> list of strings\n\
4283\n\
4284Return a list of the words in S, using sep as the\n\
4285delimiter string.  If maxsplit is given, at most maxsplit\n\
4286splits are done. If sep is not specified, any whitespace string\n\
4287is a separator.";
4288
4289static PyObject*
4290unicode_split(PyUnicodeObject *self, PyObject *args)
4291{
4292    PyObject *substring = Py_None;
4293    int maxcount = -1;
4294
4295    if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4296        return NULL;
4297
4298    if (substring == Py_None)
4299	return split(self, NULL, maxcount);
4300    else if (PyUnicode_Check(substring))
4301	return split(self, (PyUnicodeObject *)substring, maxcount);
4302    else
4303	return PyUnicode_Split((PyObject *)self, substring, maxcount);
4304}
4305
4306static char splitlines__doc__[] =
4307"S.splitlines([keepends]]) -> list of strings\n\
4308\n\
4309Return a list of the lines in S, breaking at line boundaries.\n\
4310Line breaks are not included in the resulting list unless keepends\n\
4311is given and true.";
4312
4313static PyObject*
4314unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4315{
4316    int keepends = 0;
4317
4318    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4319        return NULL;
4320
4321    return PyUnicode_Splitlines((PyObject *)self, keepends);
4322}
4323
4324static
4325PyObject *unicode_str(PyUnicodeObject *self)
4326{
4327    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4328}
4329
4330static char strip__doc__[] =
4331"S.strip() -> unicode\n\
4332\n\
4333Return a copy of S with leading and trailing whitespace removed.";
4334
4335static PyObject *
4336unicode_strip(PyUnicodeObject *self, PyObject *args)
4337{
4338    if (!PyArg_NoArgs(args))
4339        return NULL;
4340    return strip(self, 1, 1);
4341}
4342
4343static char swapcase__doc__[] =
4344"S.swapcase() -> unicode\n\
4345\n\
4346Return a copy of S with uppercase characters converted to lowercase\n\
4347and vice versa.";
4348
4349static PyObject*
4350unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4351{
4352    if (!PyArg_NoArgs(args))
4353        return NULL;
4354    return fixup(self, fixswapcase);
4355}
4356
4357static char translate__doc__[] =
4358"S.translate(table) -> unicode\n\
4359\n\
4360Return a copy of the string S, where all characters have been mapped\n\
4361through the given translation table, which must be a mapping of\n\
4362Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4363are left untouched. Characters mapped to None are deleted.";
4364
4365static PyObject*
4366unicode_translate(PyUnicodeObject *self, PyObject *args)
4367{
4368    PyObject *table;
4369
4370    if (!PyArg_ParseTuple(args, "O:translate", &table))
4371	return NULL;
4372    return PyUnicode_TranslateCharmap(self->str,
4373				      self->length,
4374				      table,
4375				      "ignore");
4376}
4377
4378static char upper__doc__[] =
4379"S.upper() -> unicode\n\
4380\n\
4381Return a copy of S converted to uppercase.";
4382
4383static PyObject*
4384unicode_upper(PyUnicodeObject *self, PyObject *args)
4385{
4386    if (!PyArg_NoArgs(args))
4387        return NULL;
4388    return fixup(self, fixupper);
4389}
4390
#if 0
/* Docstring paired with the (likewise disabled) "zfill" entry of
   unicode_methods. */
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width) -- currently compiled out.

   Returns self (with an extra reference) when it is already at least
   width long.  Otherwise left-pads with '0' via pad() and, if the
   original text started with '+' or '-', moves that sign to the front
   of the padded result.  Returns NULL when argument parsing fails or
   pad() returns NULL. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');

    /* Do not touch u->str if pad() could not produce a result. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4426
#if 0
/* Debug helper (compiled out): return unicode_freelist_size as a Python
   int.  Returns NULL if PyArg_NoArgs() rejects args. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4436
4437static char startswith__doc__[] =
4438"S.startswith(prefix[, start[, end]]) -> int\n\
4439\n\
4440Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4441optional start, test S beginning at that position.  With optional end, stop\n\
4442comparing S at that position.";
4443
4444static PyObject *
4445unicode_startswith(PyUnicodeObject *self,
4446		   PyObject *args)
4447{
4448    PyUnicodeObject *substring;
4449    int start = 0;
4450    int end = INT_MAX;
4451    PyObject *result;
4452
4453    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4454		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4455	return NULL;
4456    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4457						(PyObject *)substring);
4458    if (substring == NULL)
4459	return NULL;
4460
4461    result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4462
4463    Py_DECREF(substring);
4464    return result;
4465}
4466
4467
4468static char endswith__doc__[] =
4469"S.endswith(suffix[, start[, end]]) -> int\n\
4470\n\
4471Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4472optional start, test S beginning at that position.  With optional end, stop\n\
4473comparing S at that position.";
4474
4475static PyObject *
4476unicode_endswith(PyUnicodeObject *self,
4477		 PyObject *args)
4478{
4479    PyUnicodeObject *substring;
4480    int start = 0;
4481    int end = INT_MAX;
4482    PyObject *result;
4483
4484    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4485		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4486	return NULL;
4487    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4488						(PyObject *)substring);
4489    if (substring == NULL)
4490	return NULL;
4491
4492    result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4493
4494    Py_DECREF(substring);
4495    return result;
4496}
4497
4498
/* Method table for Unicode objects, searched by unicode_getattr().

   Each entry is {method name, C implementation, flag, docstring}.  For
   the handlers visible in this file, entries with flag 1 parse their
   arguments with PyArg_ParseTuple() and entries with flag 0 check for
   an empty argument list with PyArg_NoArgs().  The table ends with a
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
#if 0
    /* Disabled entries; unicode_zfill itself is also compiled out. */
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    /* Sentinel marking the end of the table. */
    {NULL, NULL}
};
4551
4552static PyObject *
4553unicode_getattr(PyUnicodeObject *self, char *name)
4554{
4555    return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4556}
4557
/* Sequence protocol slots for Unicode objects.  Each entry is cast to
   the function type of its slot; the two assignment slots are 0, so no
   item or slice assignment handler is installed. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, 		/* sq_length */
    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
    (intargfunc) unicode_repeat, 	/* sq_repeat */
    (intargfunc) unicode_getitem, 	/* sq_item */
    (intintargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, 	/* sq_contains */
};
4568
4569static int
4570unicode_buffer_getreadbuf(PyUnicodeObject *self,
4571			  int index,
4572			  const void **ptr)
4573{
4574    if (index != 0) {
4575        PyErr_SetString(PyExc_SystemError,
4576			"accessing non-existent unicode segment");
4577        return -1;
4578    }
4579    *ptr = (void *) self->str;
4580    return PyUnicode_GET_DATA_SIZE(self);
4581}
4582
4583static int
4584unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4585			   const void **ptr)
4586{
4587    PyErr_SetString(PyExc_TypeError,
4588		    "cannot use unicode as modifyable buffer");
4589    return -1;
4590}
4591
4592static int
4593unicode_buffer_getsegcount(PyUnicodeObject *self,
4594			   int *lenp)
4595{
4596    if (lenp)
4597        *lenp = PyUnicode_GET_DATA_SIZE(self);
4598    return 1;
4599}
4600
4601static int
4602unicode_buffer_getcharbuf(PyUnicodeObject *self,
4603			  int index,
4604			  const void **ptr)
4605{
4606    PyObject *str;
4607
4608    if (index != 0) {
4609        PyErr_SetString(PyExc_SystemError,
4610			"accessing non-existent unicode segment");
4611        return -1;
4612    }
4613    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4614    if (str == NULL)
4615	return -1;
4616    *ptr = (void *) PyString_AS_STRING(str);
4617    return PyString_GET_SIZE(str);
4618}
4619
4620/* Helpers for PyUnicode_Format() */
4621
4622static PyObject *
4623getnextarg(PyObject *args, int arglen, int *p_argidx)
4624{
4625    int argidx = *p_argidx;
4626    if (argidx < arglen) {
4627	(*p_argidx)++;
4628	if (arglen < 0)
4629	    return args;
4630	else
4631	    return PyTuple_GetItem(args, argidx);
4632    }
4633    PyErr_SetString(PyExc_TypeError,
4634		    "not enough arguments for format string");
4635    return NULL;
4636}
4637
/* Bit flags collected while parsing a format specifier; each one is set
   by the flag character noted next to it. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4643
4644static
4645int usprintf(register Py_UNICODE *buffer, char *format, ...)
4646{
4647    register int i;
4648    int len;
4649    va_list va;
4650    char *charbuffer;
4651    va_start(va, format);
4652
4653    /* First, format the string as char array, then expand to Py_UNICODE
4654       array. */
4655    charbuffer = (char *)buffer;
4656    len = vsprintf(charbuffer, format, va);
4657    for (i = len - 1; i >= 0; i--)
4658	buffer[i] = (Py_UNICODE) charbuffer[i];
4659
4660    va_end(va);
4661    return len;
4662}
4663
4664static int
4665formatfloat(Py_UNICODE *buf,
4666	    size_t buflen,
4667	    int flags,
4668	    int prec,
4669	    int type,
4670	    PyObject *v)
4671{
4672    /* fmt = '%#.' + `prec` + `type`
4673       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4674    char fmt[20];
4675    double x;
4676
4677    x = PyFloat_AsDouble(v);
4678    if (x == -1.0 && PyErr_Occurred())
4679	return -1;
4680    if (prec < 0)
4681	prec = 6;
4682    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4683	type = 'g';
4684    sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4685    /* worst case length calc to ensure no buffer overrun:
4686         fmt = %#.<prec>g
4687         buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4688            for any double rep.)
4689         len = 1 + prec + 1 + 2 + 5 = 9 + prec
4690       If prec=0 the effective precision is 1 (the leading digit is
4691       always given), therefore increase by one to 10+prec. */
4692    if (buflen <= (size_t)10 + (size_t)prec) {
4693	PyErr_SetString(PyExc_OverflowError,
4694	    "formatted float is too long (precision too long?)");
4695	return -1;
4696    }
4697    return usprintf(buf, fmt, x);
4698}
4699
4700static PyObject*
4701formatlong(PyObject *val, int flags, int prec, int type)
4702{
4703	char *buf;
4704	int i, len;
4705	PyObject *str; /* temporary string object. */
4706	PyUnicodeObject *result;
4707
4708	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4709	if (!str)
4710		return NULL;
4711	result = _PyUnicode_New(len);
4712	for (i = 0; i < len; i++)
4713		result->str[i] = buf[i];
4714	result->str[len] = 0;
4715	Py_DECREF(str);
4716	return (PyObject*)result;
4717}
4718
4719static int
4720formatint(Py_UNICODE *buf,
4721	  size_t buflen,
4722	  int flags,
4723	  int prec,
4724	  int type,
4725	  PyObject *v)
4726{
4727    /* fmt = '%#.' + `prec` + 'l' + `type`
4728       worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4729       + 1 + 1 = 24*/
4730    char fmt[64]; /* plenty big enough! */
4731    long x;
4732
4733    x = PyInt_AsLong(v);
4734    if (x == -1 && PyErr_Occurred())
4735	return -1;
4736    if (prec < 0)
4737	prec = 1;
4738    /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4739       worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4740    if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4741        PyErr_SetString(PyExc_OverflowError,
4742            "formatted integer is too long (precision too long?)");
4743        return -1;
4744    }
4745    sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4746    return usprintf(buf, fmt, x);
4747}
4748
4749static int
4750formatchar(Py_UNICODE *buf,
4751           size_t buflen,
4752           PyObject *v)
4753{
4754    /* presume that the buffer is at least 2 characters long */
4755    if (PyUnicode_Check(v)) {
4756	if (PyUnicode_GET_SIZE(v) != 1)
4757	    goto onError;
4758	buf[0] = PyUnicode_AS_UNICODE(v)[0];
4759    }
4760
4761    else if (PyString_Check(v)) {
4762	if (PyString_GET_SIZE(v) != 1)
4763	    goto onError;
4764	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4765    }
4766
4767    else {
4768	/* Integer input truncated to a character */
4769        long x;
4770	x = PyInt_AsLong(v);
4771	if (x == -1 && PyErr_Occurred())
4772	    goto onError;
4773	buf[0] = (char) x;
4774    }
4775    buf[1] = '\0';
4776    return 1;
4777
4778 onError:
4779    PyErr_SetString(PyExc_TypeError,
4780		    "%c requires int or char");
4781    return -1;
4782}
4783
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer used to
   format floats, ints and chars.  XXX The value is a magic number.  The
   formatting routines do their own bounds checking against it; a better
   solution may be to malloc a buffer of the appropriate size for each
   format, but for now a fixed buffer is used.
*/
#define FORMATBUFLEN ((size_t)120)
4793
4794PyObject *PyUnicode_Format(PyObject *format,
4795			   PyObject *args)
4796{
4797    Py_UNICODE *fmt, *res;
4798    int fmtcnt, rescnt, reslen, arglen, argidx;
4799    int args_owned = 0;
4800    PyUnicodeObject *result = NULL;
4801    PyObject *dict = NULL;
4802    PyObject *uformat;
4803
4804    if (format == NULL || args == NULL) {
4805	PyErr_BadInternalCall();
4806	return NULL;
4807    }
4808    uformat = PyUnicode_FromObject(format);
4809    if (uformat == NULL)
4810	return NULL;
4811    fmt = PyUnicode_AS_UNICODE(uformat);
4812    fmtcnt = PyUnicode_GET_SIZE(uformat);
4813
4814    reslen = rescnt = fmtcnt + 100;
4815    result = _PyUnicode_New(reslen);
4816    if (result == NULL)
4817	goto onError;
4818    res = PyUnicode_AS_UNICODE(result);
4819
4820    if (PyTuple_Check(args)) {
4821	arglen = PyTuple_Size(args);
4822	argidx = 0;
4823    }
4824    else {
4825	arglen = -1;
4826	argidx = -2;
4827    }
4828    if (args->ob_type->tp_as_mapping)
4829	dict = args;
4830
4831    while (--fmtcnt >= 0) {
4832	if (*fmt != '%') {
4833	    if (--rescnt < 0) {
4834		rescnt = fmtcnt + 100;
4835		reslen += rescnt;
4836		if (_PyUnicode_Resize(result, reslen) < 0)
4837		    return NULL;
4838		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4839		--rescnt;
4840	    }
4841	    *res++ = *fmt++;
4842	}
4843	else {
4844	    /* Got a format specifier */
4845	    int flags = 0;
4846	    int width = -1;
4847	    int prec = -1;
4848	    int size = 0;
4849	    Py_UNICODE c = '\0';
4850	    Py_UNICODE fill;
4851	    PyObject *v = NULL;
4852	    PyObject *temp = NULL;
4853	    Py_UNICODE *pbuf;
4854	    Py_UNICODE sign;
4855	    int len;
4856	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
4857
4858	    fmt++;
4859	    if (*fmt == '(') {
4860		Py_UNICODE *keystart;
4861		int keylen;
4862		PyObject *key;
4863		int pcount = 1;
4864
4865		if (dict == NULL) {
4866		    PyErr_SetString(PyExc_TypeError,
4867				    "format requires a mapping");
4868		    goto onError;
4869		}
4870		++fmt;
4871		--fmtcnt;
4872		keystart = fmt;
4873		/* Skip over balanced parentheses */
4874		while (pcount > 0 && --fmtcnt >= 0) {
4875		    if (*fmt == ')')
4876			--pcount;
4877		    else if (*fmt == '(')
4878			++pcount;
4879		    fmt++;
4880		}
4881		keylen = fmt - keystart - 1;
4882		if (fmtcnt < 0 || pcount > 0) {
4883		    PyErr_SetString(PyExc_ValueError,
4884				    "incomplete format key");
4885		    goto onError;
4886		}
4887		/* keys are converted to strings using UTF-8 and
4888		   then looked up since Python uses strings to hold
4889		   variables names etc. in its namespaces and we
4890		   wouldn't want to break common idioms. */
4891		key = PyUnicode_EncodeUTF8(keystart,
4892					   keylen,
4893					   NULL);
4894		if (key == NULL)
4895		    goto onError;
4896		if (args_owned) {
4897		    Py_DECREF(args);
4898		    args_owned = 0;
4899		}
4900		args = PyObject_GetItem(dict, key);
4901		Py_DECREF(key);
4902		if (args == NULL) {
4903		    goto onError;
4904		}
4905		args_owned = 1;
4906		arglen = -1;
4907		argidx = -2;
4908	    }
4909	    while (--fmtcnt >= 0) {
4910		switch (c = *fmt++) {
4911		case '-': flags |= F_LJUST; continue;
4912		case '+': flags |= F_SIGN; continue;
4913		case ' ': flags |= F_BLANK; continue;
4914		case '#': flags |= F_ALT; continue;
4915		case '0': flags |= F_ZERO; continue;
4916		}
4917		break;
4918	    }
4919	    if (c == '*') {
4920		v = getnextarg(args, arglen, &argidx);
4921		if (v == NULL)
4922		    goto onError;
4923		if (!PyInt_Check(v)) {
4924		    PyErr_SetString(PyExc_TypeError,
4925				    "* wants int");
4926		    goto onError;
4927		}
4928		width = PyInt_AsLong(v);
4929		if (width < 0) {
4930		    flags |= F_LJUST;
4931		    width = -width;
4932		}
4933		if (--fmtcnt >= 0)
4934		    c = *fmt++;
4935	    }
4936	    else if (c >= '0' && c <= '9') {
4937		width = c - '0';
4938		while (--fmtcnt >= 0) {
4939		    c = *fmt++;
4940		    if (c < '0' || c > '9')
4941			break;
4942		    if ((width*10) / 10 != width) {
4943			PyErr_SetString(PyExc_ValueError,
4944					"width too big");
4945			goto onError;
4946		    }
4947		    width = width*10 + (c - '0');
4948		}
4949	    }
4950	    if (c == '.') {
4951		prec = 0;
4952		if (--fmtcnt >= 0)
4953		    c = *fmt++;
4954		if (c == '*') {
4955		    v = getnextarg(args, arglen, &argidx);
4956		    if (v == NULL)
4957			goto onError;
4958		    if (!PyInt_Check(v)) {
4959			PyErr_SetString(PyExc_TypeError,
4960					"* wants int");
4961			goto onError;
4962		    }
4963		    prec = PyInt_AsLong(v);
4964		    if (prec < 0)
4965			prec = 0;
4966		    if (--fmtcnt >= 0)
4967			c = *fmt++;
4968		}
4969		else if (c >= '0' && c <= '9') {
4970		    prec = c - '0';
4971		    while (--fmtcnt >= 0) {
4972			c = Py_CHARMASK(*fmt++);
4973			if (c < '0' || c > '9')
4974			    break;
4975			if ((prec*10) / 10 != prec) {
4976			    PyErr_SetString(PyExc_ValueError,
4977					    "prec too big");
4978			    goto onError;
4979			}
4980			prec = prec*10 + (c - '0');
4981		    }
4982		}
4983	    } /* prec */
4984	    if (fmtcnt >= 0) {
4985		if (c == 'h' || c == 'l' || c == 'L') {
4986		    size = c;
4987		    if (--fmtcnt >= 0)
4988			c = *fmt++;
4989		}
4990	    }
4991	    if (fmtcnt < 0) {
4992		PyErr_SetString(PyExc_ValueError,
4993				"incomplete format");
4994		goto onError;
4995	    }
4996	    if (c != '%') {
4997		v = getnextarg(args, arglen, &argidx);
4998		if (v == NULL)
4999		    goto onError;
5000	    }
5001	    sign = 0;
5002	    fill = ' ';
5003	    switch (c) {
5004
5005	    case '%':
5006		pbuf = formatbuf;
5007		/* presume that buffer length is at least 1 */
5008		pbuf[0] = '%';
5009		len = 1;
5010		break;
5011
5012	    case 's':
5013	    case 'r':
5014		if (PyUnicode_Check(v) && c == 's') {
5015		    temp = v;
5016		    Py_INCREF(temp);
5017		}
5018		else {
5019		    PyObject *unicode;
5020		    if (c == 's')
5021			temp = PyObject_Str(v);
5022		    else
5023			temp = PyObject_Repr(v);
5024		    if (temp == NULL)
5025			goto onError;
5026		    if (!PyString_Check(temp)) {
5027			/* XXX Note: this should never happen, since
5028   			       PyObject_Repr() and PyObject_Str() assure
5029			       this */
5030			Py_DECREF(temp);
5031			PyErr_SetString(PyExc_TypeError,
5032					"%s argument has non-string str()");
5033			goto onError;
5034		    }
5035		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5036						   PyString_GET_SIZE(temp),
5037					       NULL,
5038						   "strict");
5039		    Py_DECREF(temp);
5040		    temp = unicode;
5041		    if (temp == NULL)
5042			goto onError;
5043		}
5044		pbuf = PyUnicode_AS_UNICODE(temp);
5045		len = PyUnicode_GET_SIZE(temp);
5046		if (prec >= 0 && len > prec)
5047		    len = prec;
5048		break;
5049
5050	    case 'i':
5051	    case 'd':
5052	    case 'u':
5053	    case 'o':
5054	    case 'x':
5055	    case 'X':
5056		if (c == 'i')
5057		    c = 'd';
5058		if (PyLong_Check(v)) {
5059		    temp = formatlong(v, flags, prec, c);
5060		    if (!temp)
5061			goto onError;
5062		    pbuf = PyUnicode_AS_UNICODE(temp);
5063		    len = PyUnicode_GET_SIZE(temp);
5064		    /* unbounded ints can always produce
5065		       a sign character! */
5066		    sign = 1;
5067		}
5068		else {
5069		    pbuf = formatbuf;
5070		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5071				    flags, prec, c, v);
5072		    if (len < 0)
5073			goto onError;
5074		    /* only d conversion is signed */
5075		    sign = c == 'd';
5076		}
5077		if (flags & F_ZERO)
5078		    fill = '0';
5079		break;
5080
5081	    case 'e':
5082	    case 'E':
5083	    case 'f':
5084	    case 'g':
5085	    case 'G':
5086		pbuf = formatbuf;
5087		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5088			flags, prec, c, v);
5089		if (len < 0)
5090		    goto onError;
5091		sign = 1;
5092		if (flags & F_ZERO)
5093		    fill = '0';
5094		break;
5095
5096	    case 'c':
5097		pbuf = formatbuf;
5098		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5099		if (len < 0)
5100		    goto onError;
5101		break;
5102
5103	    default:
5104		PyErr_Format(PyExc_ValueError,
5105			     "unsupported format character '%c' (0x%x) "
5106			     "at index %i",
5107			     (31<=c && c<=126) ? c : '?',
5108                             c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5109		goto onError;
5110	    }
5111	    if (sign) {
5112		if (*pbuf == '-' || *pbuf == '+') {
5113		    sign = *pbuf++;
5114		    len--;
5115		}
5116		else if (flags & F_SIGN)
5117		    sign = '+';
5118		else if (flags & F_BLANK)
5119		    sign = ' ';
5120		else
5121		    sign = 0;
5122	    }
5123	    if (width < len)
5124		width = len;
5125	    if (rescnt < width + (sign != 0)) {
5126		reslen -= rescnt;
5127		rescnt = width + fmtcnt + 100;
5128		reslen += rescnt;
5129		if (_PyUnicode_Resize(result, reslen) < 0)
5130		    return NULL;
5131		res = PyUnicode_AS_UNICODE(result)
5132		    + reslen - rescnt;
5133	    }
5134	    if (sign) {
5135		if (fill != ' ')
5136		    *res++ = sign;
5137		rescnt--;
5138		if (width > len)
5139		    width--;
5140	    }
5141	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5142		assert(pbuf[0] == '0');
5143		assert(pbuf[1] == c);
5144		if (fill != ' ') {
5145		    *res++ = *pbuf++;
5146		    *res++ = *pbuf++;
5147		}
5148		rescnt -= 2;
5149		width -= 2;
5150		if (width < 0)
5151		    width = 0;
5152		len -= 2;
5153	    }
5154	    if (width > len && !(flags & F_LJUST)) {
5155		do {
5156		    --rescnt;
5157		    *res++ = fill;
5158		} while (--width > len);
5159	    }
5160	    if (fill == ' ') {
5161		if (sign)
5162		    *res++ = sign;
5163		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5164		    assert(pbuf[0] == '0');
5165		    assert(pbuf[1] == c);
5166		    *res++ = *pbuf++;
5167		    *res++ = *pbuf++;
5168		}
5169	    }
5170	    memcpy(res, pbuf, len * sizeof(Py_UNICODE));
5171	    res += len;
5172	    rescnt -= len;
5173	    while (--width >= len) {
5174		--rescnt;
5175		*res++ = ' ';
5176	    }
5177	    if (dict && (argidx < arglen) && c != '%') {
5178		PyErr_SetString(PyExc_TypeError,
5179				"not all arguments converted");
5180		goto onError;
5181	    }
5182	    Py_XDECREF(temp);
5183	} /* '%' */
5184    } /* until end */
5185    if (argidx < arglen && !dict) {
5186	PyErr_SetString(PyExc_TypeError,
5187			"not all arguments converted");
5188	goto onError;
5189    }
5190
5191    if (args_owned) {
5192	Py_DECREF(args);
5193    }
5194    Py_DECREF(uformat);
5195    if (_PyUnicode_Resize(result, reslen - rescnt))
5196	goto onError;
5197    return (PyObject *)result;
5198
5199 onError:
5200    Py_XDECREF(result);
5201    Py_DECREF(uformat);
5202    if (args_owned) {
5203	Py_DECREF(args);
5204    }
5205    return NULL;
5206}
5207
5208static PyBufferProcs unicode_as_buffer = {
5209    (getreadbufferproc) unicode_buffer_getreadbuf,
5210    (getwritebufferproc) unicode_buffer_getwritebuf,
5211    (getsegcountproc) unicode_buffer_getsegcount,
5212    (getcharbufferproc) unicode_buffer_getcharbuf,
5213};
5214
5215PyTypeObject PyUnicode_Type = {
5216    PyObject_HEAD_INIT(&PyType_Type)
5217    0, 					/* ob_size */
5218    "unicode", 				/* tp_name */
5219    sizeof(PyUnicodeObject), 		/* tp_size */
5220    0, 					/* tp_itemsize */
5221    /* Slots */
5222    (destructor)_PyUnicode_Free, 	/* tp_dealloc */
5223    0, 					/* tp_print */
5224    (getattrfunc)unicode_getattr, 	/* tp_getattr */
5225    0, 					/* tp_setattr */
5226    (cmpfunc) unicode_compare, 		/* tp_compare */
5227    (reprfunc) unicode_repr, 		/* tp_repr */
5228    0, 					/* tp_as_number */
5229    &unicode_as_sequence, 		/* tp_as_sequence */
5230    0, 					/* tp_as_mapping */
5231    (hashfunc) unicode_hash, 		/* tp_hash*/
5232    0, 					/* tp_call*/
5233    (reprfunc) unicode_str,	 	/* tp_str */
5234    (getattrofunc) NULL, 		/* tp_getattro */
5235    (setattrofunc) NULL, 		/* tp_setattro */
5236    &unicode_as_buffer,			/* tp_as_buffer */
5237    Py_TPFLAGS_DEFAULT,			/* tp_flags */
5238};
5239
5240/* Initialize the Unicode implementation */
5241
5242void _PyUnicode_Init(void)
5243{
5244    /* Doublecheck the configuration... */
5245    if (sizeof(Py_UNICODE) != 2)
5246        Py_FatalError("Unicode configuration error: "
5247		      "sizeof(Py_UNICODE) != 2 bytes");
5248
5249    /* Init the implementation */
5250    unicode_freelist = NULL;
5251    unicode_freelist_size = 0;
5252    unicode_empty = _PyUnicode_New(0);
5253    strcpy(unicode_default_encoding, "ascii");
5254}
5255
5256/* Finalize the Unicode implementation */
5257
5258void
5259_PyUnicode_Fini(void)
5260{
5261    PyUnicodeObject *u;
5262
5263    Py_XDECREF(unicode_empty);
5264    unicode_empty = NULL;
5265
5266    for (u = unicode_freelist; u != NULL;) {
5267	PyUnicodeObject *v = u;
5268	u = *(PyUnicodeObject **)u;
5269	if (v->str)
5270	    PyMem_DEL(v->str);
5271	Py_XDECREF(v->defenc);
5272	PyObject_DEL(v);
5273    }
5274    unicode_freelist = NULL;
5275    unicode_freelist_size = 0;
5276}
5277