unicodeobject.c revision dc724d6e35aaf76a291f4b17b59516adcf3c9e98
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9--------------------------------------------------------------------
10The original string type implementation is:
11
12    Copyright (c) 1999 by Secret Labs AB
13    Copyright (c) 1999 by Fredrik Lundh
14
15By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
38
39#include "Python.h"
40
41#include "unicodeobject.h"
42#include "ucnhash.h"
43
44#ifdef MS_WIN32
45#include <windows.h>
46#endif
47
48/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE       1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54   The implementation will keep allocated Unicode memory intact for
55   all objects on the free list having a size less than this
56   limit. This reduces malloc() overhead for small Unicode objects.
57
58   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60   malloc()-overhead) bytes of unused garbage.
61
62   Setting the limit to 0 effectively turns the feature off.
63
64   Note: This is an experimental feature ! If you get core dumps when
65   using Unicode objects, turn this feature off.
66
67*/
68
69#define KEEPALIVE_SIZE_LIMIT       9
70
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
79/* --- Globals ------------------------------------------------------------
80
81   The globals are initialized by the _PyUnicode_Init() API and should
82   not be used before calling that API.
83
84*/
85
86/* Free list for Unicode objects */
87static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
89
90/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94   shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
97/* Default encoding to use and assume when NULL is passed as encoding
98   parameter; it is initialized by _PyUnicode_Init().
99
100   Always use the PyUnicode_SetDefaultEncoding() and
101   PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
104static char unicode_default_encoding[100];
105
106Py_UNICODE
107PyUnicode_GetMax(void)
108{
109#ifdef Py_UNICODE_WIDE
110	return 0x10FFFF;
111#else
112	/* This is actually an illegal character, so it should
113	   not be passed to unichr. */
114	return 0xFFFF;
115#endif
116}
117
118/* --- Unicode Object ----------------------------------------------------- */
119
120static
121int unicode_resize(register PyUnicodeObject *unicode,
122                      int length)
123{
124    void *oldstr;
125
126    /* Shortcut if there's nothing much to do. */
127    if (unicode->length == length)
128	goto reset;
129
130    /* Resizing shared object (unicode_empty or single character
131       objects) in-place is not allowed. Use PyUnicode_Resize()
132       instead ! */
133    if (unicode == unicode_empty ||
134	(unicode->length == 1 &&
135	 unicode->str[0] < 256 &&
136	 unicode_latin1[unicode->str[0]] == unicode)) {
137        PyErr_SetString(PyExc_SystemError,
138                        "can't resize shared unicode objects");
139        return -1;
140    }
141
142    /* We allocate one more byte to make sure the string is
143       Ux0000 terminated -- XXX is this needed ? */
144    oldstr = unicode->str;
145    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146    if (!unicode->str) {
147	unicode->str = oldstr;
148        PyErr_NoMemory();
149        return -1;
150    }
151    unicode->str[length] = 0;
152    unicode->length = length;
153
154 reset:
155    /* Reset the object caches */
156    if (unicode->defenc) {
157        Py_DECREF(unicode->defenc);
158        unicode->defenc = NULL;
159    }
160    unicode->hash = -1;
161
162    return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166   Ux0000 terminated -- XXX is this needed ?
167
168   XXX This allocator could further be enhanced by assuring that the
169       free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176    register PyUnicodeObject *unicode;
177
178    /* Optimization for empty strings */
179    if (length == 0 && unicode_empty != NULL) {
180        Py_INCREF(unicode_empty);
181        return unicode_empty;
182    }
183
184    /* Unicode freelist & memory allocation */
185    if (unicode_freelist) {
186        unicode = unicode_freelist;
187        unicode_freelist = *(PyUnicodeObject **)unicode;
188        unicode_freelist_size--;
189	if (unicode->str) {
190	    /* Keep-Alive optimization: we only upsize the buffer,
191	       never downsize it. */
192	    if ((unicode->length < length) &&
193		unicode_resize(unicode, length)) {
194		PyMem_DEL(unicode->str);
195		goto onError;
196	    }
197	}
198        else {
199	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
200        }
201        PyObject_INIT(unicode, &PyUnicode_Type);
202    }
203    else {
204        unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205        if (unicode == NULL)
206            return NULL;
207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208    }
209
210    if (!unicode->str) {
211	PyErr_NoMemory();
212	goto onError;
213    }
214    unicode->str[length] = 0;
215    unicode->length = length;
216    unicode->hash = -1;
217    unicode->defenc = NULL;
218    return unicode;
219
220 onError:
221    _Py_ForgetReference((PyObject *)unicode);
222    PyObject_DEL(unicode);
223    return NULL;
224}
225
226static
227void unicode_dealloc(register PyUnicodeObject *unicode)
228{
229    if (PyUnicode_CheckExact(unicode) &&
230	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
231        /* Keep-Alive optimization */
232	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
233	    PyMem_DEL(unicode->str);
234	    unicode->str = NULL;
235	    unicode->length = 0;
236	}
237	if (unicode->defenc) {
238	    Py_DECREF(unicode->defenc);
239	    unicode->defenc = NULL;
240	}
241	/* Add to free list */
242        *(PyUnicodeObject **)unicode = unicode_freelist;
243        unicode_freelist = unicode;
244        unicode_freelist_size++;
245    }
246    else {
247	PyMem_DEL(unicode->str);
248	Py_XDECREF(unicode->defenc);
249	unicode->ob_type->tp_free((PyObject *)unicode);
250    }
251}
252
253int PyUnicode_Resize(PyObject **unicode,
254		     int length)
255{
256    register PyUnicodeObject *v;
257
258    /* Argument checks */
259    if (unicode == NULL) {
260	PyErr_BadInternalCall();
261	return -1;
262    }
263    v = (PyUnicodeObject *)*unicode;
264    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265	PyErr_BadInternalCall();
266	return -1;
267    }
268
269    /* Resizing unicode_empty and single character objects is not
270       possible since these are being shared. We simply return a fresh
271       copy with the same Unicode content. */
272    if (v->length != length &&
273	(v == unicode_empty || v->length == 1)) {
274	PyUnicodeObject *w = _PyUnicode_New(length);
275	if (w == NULL)
276	    return -1;
277	Py_UNICODE_COPY(w->str, v->str,
278			length < v->length ? length : v->length);
279	*unicode = (PyObject *)w;
280	return 0;
281    }
282
283    /* Note that we don't have to modify *unicode for unshared Unicode
284       objects, since we can modify them in-place. */
285    return unicode_resize(v, length);
286}
287
288/* Internal API for use in unicodeobject.c only ! */
289#define _PyUnicode_Resize(unicodevar, length) \
290        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293				int size)
294{
295    PyUnicodeObject *unicode;
296
297    /* If the Unicode data is known at construction time, we can apply
298       some optimizations which share commonly used objects. */
299    if (u != NULL) {
300
301	/* Optimization for empty strings */
302	if (size == 0 && unicode_empty != NULL) {
303	    Py_INCREF(unicode_empty);
304	    return (PyObject *)unicode_empty;
305	}
306
307	/* Single character Unicode objects in the Latin-1 range are
308	   shared when using this constructor */
309	if (size == 1 && *u < 256) {
310	    unicode = unicode_latin1[*u];
311	    if (!unicode) {
312		unicode = _PyUnicode_New(1);
313		if (!unicode)
314		    return NULL;
315		unicode->str[0] = *u;
316		unicode_latin1[*u] = unicode;
317	    }
318	    Py_INCREF(unicode);
319	    return (PyObject *)unicode;
320	}
321    }
322
323    unicode = _PyUnicode_New(size);
324    if (!unicode)
325        return NULL;
326
327    /* Copy the Unicode data into the new object */
328    if (u != NULL)
329	Py_UNICODE_COPY(unicode->str, u, size);
330
331    return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337				 int size)
338{
339    PyUnicodeObject *unicode;
340
341    if (w == NULL) {
342	PyErr_BadInternalCall();
343	return NULL;
344    }
345
346    unicode = _PyUnicode_New(size);
347    if (!unicode)
348        return NULL;
349
350    /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352    memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354    {
355	register Py_UNICODE *u;
356	register int i;
357	u = PyUnicode_AS_UNICODE(unicode);
358	for (i = size; i >= 0; i--)
359	    *u++ = *w++;
360    }
361#endif
362
363    return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367			 register wchar_t *w,
368			 int size)
369{
370    if (unicode == NULL) {
371	PyErr_BadInternalCall();
372	return -1;
373    }
374    if (size > PyUnicode_GET_SIZE(unicode))
375	size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377    memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379    {
380	register Py_UNICODE *u;
381	register int i;
382	u = PyUnicode_AS_UNICODE(unicode);
383	for (i = size; i >= 0; i--)
384	    *w++ = *u++;
385    }
386#endif
387
388    return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
395    /* XXX Perhaps we should make this API an alias of
396           PyObject_Unicode() instead ?! */
397    if (PyUnicode_CheckExact(obj)) {
398	Py_INCREF(obj);
399	return obj;
400    }
401    if (PyUnicode_Check(obj)) {
402	/* For a Unicode subtype that's not a Unicode object,
403	   return a true Unicode object with the same data. */
404	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405				     PyUnicode_GET_SIZE(obj));
406    }
407    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411				      const char *encoding,
412				      const char *errors)
413{
414    const char *s = NULL;
415    int len;
416    int owned = 0;
417    PyObject *v;
418
419    if (obj == NULL) {
420	PyErr_BadInternalCall();
421	return NULL;
422    }
423
424#if 0
425    /* For b/w compatibility we also accept Unicode objects provided
426       that no encodings is given and then redirect to
427       PyObject_Unicode() which then applies the additional logic for
428       Unicode subclasses.
429
430       NOTE: This API should really only be used for object which
431             represent *encoded* Unicode !
432
433    */
434	if (PyUnicode_Check(obj)) {
435	    if (encoding) {
436		PyErr_SetString(PyExc_TypeError,
437				"decoding Unicode is not supported");
438	    return NULL;
439	    }
440	return PyObject_Unicode(obj);
441	    }
442#else
443    if (PyUnicode_Check(obj)) {
444	PyErr_SetString(PyExc_TypeError,
445			"decoding Unicode is not supported");
446	return NULL;
447	}
448#endif
449
450    /* Coerce object */
451    if (PyString_Check(obj)) {
452	    s = PyString_AS_STRING(obj);
453	    len = PyString_GET_SIZE(obj);
454	    }
455    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456	/* Overwrite the error message with something more useful in
457	   case of a TypeError. */
458	if (PyErr_ExceptionMatches(PyExc_TypeError))
459	PyErr_Format(PyExc_TypeError,
460			 "coercing to Unicode: need string or buffer, "
461			 "%.80s found",
462		     obj->ob_type->tp_name);
463	goto onError;
464    }
465
466    /* Convert to Unicode */
467    if (len == 0) {
468	Py_INCREF(unicode_empty);
469	v = (PyObject *)unicode_empty;
470    }
471    else
472	v = PyUnicode_Decode(s, len, encoding, errors);
473
474    if (owned) {
475	Py_DECREF(obj);
476    }
477    return v;
478
479 onError:
480    if (owned) {
481	Py_DECREF(obj);
482    }
483    return NULL;
484}
485
486PyObject *PyUnicode_Decode(const char *s,
487			   int size,
488			   const char *encoding,
489			   const char *errors)
490{
491    PyObject *buffer = NULL, *unicode;
492
493    if (encoding == NULL)
494	encoding = PyUnicode_GetDefaultEncoding();
495
496    /* Shortcuts for common default encodings */
497    if (strcmp(encoding, "utf-8") == 0)
498        return PyUnicode_DecodeUTF8(s, size, errors);
499    else if (strcmp(encoding, "latin-1") == 0)
500        return PyUnicode_DecodeLatin1(s, size, errors);
501    else if (strcmp(encoding, "ascii") == 0)
502        return PyUnicode_DecodeASCII(s, size, errors);
503
504    /* Decode via the codec registry */
505    buffer = PyBuffer_FromMemory((void *)s, size);
506    if (buffer == NULL)
507        goto onError;
508    unicode = PyCodec_Decode(buffer, encoding, errors);
509    if (unicode == NULL)
510        goto onError;
511    if (!PyUnicode_Check(unicode)) {
512        PyErr_Format(PyExc_TypeError,
513                     "decoder did not return an unicode object (type=%.400s)",
514                     unicode->ob_type->tp_name);
515        Py_DECREF(unicode);
516        goto onError;
517    }
518    Py_DECREF(buffer);
519    return unicode;
520
521 onError:
522    Py_XDECREF(buffer);
523    return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527			   int size,
528			   const char *encoding,
529			   const char *errors)
530{
531    PyObject *v, *unicode;
532
533    unicode = PyUnicode_FromUnicode(s, size);
534    if (unicode == NULL)
535	return NULL;
536    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537    Py_DECREF(unicode);
538    return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542                                    const char *encoding,
543                                    const char *errors)
544{
545    PyObject *v;
546
547    if (!PyUnicode_Check(unicode)) {
548        PyErr_BadArgument();
549        goto onError;
550    }
551
552    if (encoding == NULL)
553	encoding = PyUnicode_GetDefaultEncoding();
554
555    /* Shortcuts for common default encodings */
556    if (errors == NULL) {
557	if (strcmp(encoding, "utf-8") == 0)
558	    return PyUnicode_AsUTF8String(unicode);
559	else if (strcmp(encoding, "latin-1") == 0)
560	    return PyUnicode_AsLatin1String(unicode);
561	else if (strcmp(encoding, "ascii") == 0)
562	    return PyUnicode_AsASCIIString(unicode);
563    }
564
565    /* Encode via the codec registry */
566    v = PyCodec_Encode(unicode, encoding, errors);
567    if (v == NULL)
568        goto onError;
569    /* XXX Should we really enforce this ? */
570    if (!PyString_Check(v)) {
571        PyErr_Format(PyExc_TypeError,
572                     "encoder did not return a string object (type=%.400s)",
573                     v->ob_type->tp_name);
574        Py_DECREF(v);
575        goto onError;
576    }
577    return v;
578
579 onError:
580    return NULL;
581}
582
583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584					    const char *errors)
585{
586    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588    if (v)
589        return v;
590    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591    if (v && errors == NULL)
592        ((PyUnicodeObject *)unicode)->defenc = v;
593    return v;
594}
595
596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598    if (!PyUnicode_Check(unicode)) {
599        PyErr_BadArgument();
600        goto onError;
601    }
602    return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605    return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610    if (!PyUnicode_Check(unicode)) {
611        PyErr_BadArgument();
612        goto onError;
613    }
614    return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617    return -1;
618}
619
620const char *PyUnicode_GetDefaultEncoding(void)
621{
622    return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627    PyObject *v;
628
629    /* Make sure the encoding is valid. As side effect, this also
630       loads the encoding into the codec registry cache. */
631    v = _PyCodec_Lookup(encoding);
632    if (v == NULL)
633	goto onError;
634    Py_DECREF(v);
635    strncpy(unicode_default_encoding,
636	    encoding,
637	    sizeof(unicode_default_encoding));
638    return 0;
639
640 onError:
641    return -1;
642}
643
644/* --- UTF-7 Codec -------------------------------------------------------- */
645
646/* see RFC2152 for details */
647
/* Classification of the 128 ASCII characters for the UTF-7 codec.
   Each entry tells whether the character can be written directly:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   The table is only ever read, hence const. */
static
const char utf7_special[128] = {
    /* 0x00 - 0x0F: control characters; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F: space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
666
/* SPECIAL(c, encodeO, encodeWS) -- true when character c has to be
   base64 encoded: it is above 127, it is marked 1 in utf7_special[],
   it is whitespace (2) and encodeWS is set, or it belongs to Set O (3)
   and encodeO is set.  c is evaluated several times. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c) > 127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n) -- base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c) -- true when c is one of the 64 base64 digits.  Written
   with explicit ranges instead of isalnum() so that the result is
   defined for every value of c (isalnum() is only defined for values
   representable as unsigned char) and does not depend on the locale.
   c is evaluated several times. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c) -- value (0..63) of the base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits) -- while at least six bits are pending in ch,
   write the topmost six as a base64 digit through out (advancing it)
   and reduce bits accordingly.  out and bits must be lvalues. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* DECODE(out, ch, bits, surrogate) -- while at least 16 bits are
   pending in ch, take the topmost 16 as one code unit.  The enclosing
   function must provide a variable `errmsg` and a label `utf7Error`.

   A unit in the range 0xDC00..0xDFFF is reported through utf7Error
   and sets `surrogate`; while `surrogate` is set the next unit is
   dropped unchecked and the flag cleared.  All other units are stored
   through out.

   NOTE(review): the comments speak of high surrogates, but the range
   tested is the low surrogate range -- confirm the intent. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high \
                   surrogate so let's not bother seeing if the low \
                   surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't \
                   represent it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)

702static
703int utf7_decoding_error(Py_UNICODE **dest,
704                        const char *errors,
705                        const char *details)
706{
707    if ((errors == NULL) ||
708        (strcmp(errors,"strict") == 0)) {
709        PyErr_Format(PyExc_UnicodeError,
710                     "UTF-7 decoding error: %.400s",
711                     details);
712        return -1;
713    }
714    else if (strcmp(errors,"ignore") == 0) {
715        return 0;
716    }
717    else if (strcmp(errors,"replace") == 0) {
718        if (dest != NULL) {
719            **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720            (*dest)++;
721        }
722        return 0;
723    }
724    else {
725        PyErr_Format(PyExc_ValueError,
726                     "UTF-7 decoding error; unknown error handling code: %.400s",
727                     errors);
728        return -1;
729    }
730}
731
732PyObject *PyUnicode_DecodeUTF7(const char *s,
733			       int size,
734			       const char *errors)
735{
736    const char *e;
737    PyUnicodeObject *unicode;
738    Py_UNICODE *p;
739    const char *errmsg = "";
740    int inShift = 0;
741    unsigned int bitsleft = 0;
742    unsigned long charsleft = 0;
743	int surrogate = 0;
744
745    unicode = _PyUnicode_New(size);
746    if (!unicode)
747        return NULL;
748    if (size == 0)
749        return (PyObject *)unicode;
750
751    p = unicode->str;
752    e = s + size;
753
754    while (s < e) {
755        Py_UNICODE ch = *s;
756
757        if (inShift) {
758            if ((ch == '-') || !B64CHAR(ch)) {
759                inShift = 0;
760                s++;
761
762                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763                if (bitsleft >= 6) {
764                    /* The shift sequence has a partial character in it. If
765                       bitsleft < 6 then we could just classify it as padding
766                       but that is not the case here */
767
768                    errmsg = "partial character in shift sequence";
769                    goto utf7Error;
770                }
771                /* According to RFC2152 the remaining bits should be zero. We
772                   choose to signal an error/insert a replacement character
773                   here so indicate the potential of a misencoded character. */
774
775                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777                    errmsg = "non-zero padding bits in shift sequence";
778                    goto utf7Error;
779                }
780
781                if (ch == '-') {
782                    if ((s < e) && (*(s) == '-')) {
783                        *p++ = '-';
784                        inShift = 1;
785                    }
786                } else if (SPECIAL(ch,0,0)) {
787                    errmsg = "unexpected special character";
788	                goto utf7Error;
789                } else  {
790                    *p++ = ch;
791                }
792            } else {
793                charsleft = (charsleft << 6) | UB64(ch);
794                bitsleft += 6;
795                s++;
796                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
797            }
798        }
799        else if ( ch == '+' ) {
800            s++;
801            if (s < e && *s == '-') {
802                s++;
803                *p++ = '+';
804            } else
805            {
806                inShift = 1;
807                bitsleft = 0;
808            }
809        }
810        else if (SPECIAL(ch,0,0)) {
811            errmsg = "unexpected special character";
812            s++;
813	        goto utf7Error;
814        }
815        else {
816            *p++ = ch;
817            s++;
818        }
819        continue;
820    utf7Error:
821      if (utf7_decoding_error(&p, errors, errmsg))
822          goto onError;
823    }
824
825    if (inShift) {
826        if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827            goto onError;
828    }
829
830    if (_PyUnicode_Resize(&unicode, p - unicode->str))
831        goto onError;
832
833    return (PyObject *)unicode;
834
835onError:
836    Py_DECREF(unicode);
837    return NULL;
838}
839
840
841PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842                   int size,
843                   int encodeSetO,
844                   int encodeWhiteSpace,
845                   const char *errors)
846{
847    PyObject *v;
848    /* It might be possible to tighten this worst case */
849    unsigned int cbAllocated = 5 * size;
850    int inShift = 0;
851    int i = 0;
852    unsigned int bitsleft = 0;
853    unsigned long charsleft = 0;
854    char * out;
855    char * start;
856
857    if (size == 0)
858		return PyString_FromStringAndSize(NULL, 0);
859
860    v = PyString_FromStringAndSize(NULL, cbAllocated);
861    if (v == NULL)
862        return NULL;
863
864    start = out = PyString_AS_STRING(v);
865    for (;i < size; ++i) {
866        Py_UNICODE ch = s[i];
867
868        if (!inShift) {
869			if (ch == '+') {
870				*out++ = '+';
871                *out++ = '-';
872            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873                charsleft = ch;
874                bitsleft = 16;
875                *out++ = '+';
876				/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877                inShift = bitsleft > 0;
878			} else {
879				*out++ = (char) ch;
880			}
881		} else {
882            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883                *out++ = B64(charsleft << (6-bitsleft));
884                charsleft = 0;
885                bitsleft = 0;
886                /* Characters not in the BASE64 set implicitly unshift the sequence
887                   so no '-' is required, except if the character is itself a '-' */
888                if (B64CHAR(ch) || ch == '-') {
889                    *out++ = '-';
890                }
891                inShift = 0;
892                *out++ = (char) ch;
893            } else {
894                bitsleft += 16;
895                charsleft = (charsleft << 16) | ch;
896                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
897
898                /* If the next character is special then we dont' need to terminate
899                   the shift sequence. If the next character is not a BASE64 character
900                   or '-' then the shift sequence will be terminated implicitly and we
901                   don't have to insert a '-'. */
902
903                if (bitsleft == 0) {
904                    if (i + 1 < size) {
905                        Py_UNICODE ch2 = s[i+1];
906
907                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
908
909                        } else if (B64CHAR(ch2) || ch2 == '-') {
910                            *out++ = '-';
911                            inShift = 0;
912                        } else {
913                            inShift = 0;
914                        }
915
916                    }
917                    else {
918                        *out++ = '-';
919                        inShift = 0;
920                    }
921                }
922            }
923        }
924	}
925    if (bitsleft) {
926        *out++= B64(charsleft << (6-bitsleft) );
927        *out++ = '-';
928    }
929
930    if (_PyString_Resize(&v, out - start)) {
931        Py_DECREF(v);
932        return NULL;
933    }
934    return v;
935}
936
937#undef SPECIAL
938#undef B64
939#undef B64CHAR
940#undef UB64
941#undef ENCODE
942#undef DECODE
943
944/* --- UTF-8 Codec -------------------------------------------------------- */
945
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details.

   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE and 0xFF
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
967
968static
969int utf8_decoding_error(const char **source,
970                        Py_UNICODE **dest,
971                        const char *errors,
972                        const char *details)
973{
974    if ((errors == NULL) ||
975        (strcmp(errors,"strict") == 0)) {
976        PyErr_Format(PyExc_UnicodeError,
977                     "UTF-8 decoding error: %.400s",
978                     details);
979        return -1;
980    }
981    else if (strcmp(errors,"ignore") == 0) {
982        (*source)++;
983        return 0;
984    }
985    else if (strcmp(errors,"replace") == 0) {
986        (*source)++;
987        **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
988        (*dest)++;
989        return 0;
990    }
991    else {
992        PyErr_Format(PyExc_ValueError,
993                     "UTF-8 decoding error; unknown error handling code: %.400s",
994                     errors);
995        return -1;
996    }
997}
998
999PyObject *PyUnicode_DecodeUTF8(const char *s,
1000			       int size,
1001			       const char *errors)
1002{
1003    int n;
1004    const char *e;
1005    PyUnicodeObject *unicode;
1006    Py_UNICODE *p;
1007    const char *errmsg = "";
1008
1009    /* Note: size will always be longer than the resulting Unicode
1010       character count */
1011    unicode = _PyUnicode_New(size);
1012    if (!unicode)
1013        return NULL;
1014    if (size == 0)
1015        return (PyObject *)unicode;
1016
1017    /* Unpack UTF-8 encoded data */
1018    p = unicode->str;
1019    e = s + size;
1020
1021    while (s < e) {
1022        Py_UCS4 ch = (unsigned char)*s;
1023
1024        if (ch < 0x80) {
1025            *p++ = (Py_UNICODE)ch;
1026            s++;
1027            continue;
1028        }
1029
1030        n = utf8_code_length[ch];
1031
1032        if (s + n > e) {
1033	    errmsg = "unexpected end of data";
1034	    goto utf8Error;
1035	}
1036
1037        switch (n) {
1038
1039        case 0:
1040            errmsg = "unexpected code byte";
1041	    goto utf8Error;
1042
1043        case 1:
1044            errmsg = "internal error";
1045	    goto utf8Error;
1046
1047        case 2:
1048            if ((s[1] & 0xc0) != 0x80) {
1049                errmsg = "invalid data";
1050		goto utf8Error;
1051	    }
1052            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1053            if (ch < 0x80) {
1054                errmsg = "illegal encoding";
1055		goto utf8Error;
1056	    }
1057	    else
1058		*p++ = (Py_UNICODE)ch;
1059            break;
1060
1061        case 3:
1062            if ((s[1] & 0xc0) != 0x80 ||
1063                (s[2] & 0xc0) != 0x80) {
1064                errmsg = "invalid data";
1065		goto utf8Error;
1066	    }
1067            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1068            if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1069                errmsg = "illegal encoding";
1070		goto utf8Error;
1071	    }
1072	    else
1073				*p++ = (Py_UNICODE)ch;
1074            break;
1075
1076        case 4:
1077            if ((s[1] & 0xc0) != 0x80 ||
1078                (s[2] & 0xc0) != 0x80 ||
1079                (s[3] & 0xc0) != 0x80) {
1080                errmsg = "invalid data";
1081		goto utf8Error;
1082	    }
1083            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1084                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1085            /* validate and convert to UTF-16 */
1086            if ((ch < 0x10000)        /* minimum value allowed for 4
1087                                       byte encoding */
1088                || (ch > 0x10ffff))   /* maximum value allowed for
1089                                       UTF-16 */
1090	    {
1091                errmsg = "illegal encoding";
1092		goto utf8Error;
1093	    }
1094#ifdef Py_UNICODE_WIDE
1095	    *p++ = (Py_UNICODE)ch;
1096#else
1097            /*  compute and append the two surrogates: */
1098
1099            /*  translate from 10000..10FFFF to 0..FFFF */
1100            ch -= 0x10000;
1101
1102            /*  high surrogate = top 10 bits added to D800 */
1103            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1104
1105            /*  low surrogate = bottom 10 bits added to DC00 */
1106            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1107#endif
1108            break;
1109
1110        default:
1111            /* Other sizes are only needed for UCS-4 */
1112            errmsg = "unsupported Unicode code range";
1113	    goto utf8Error;
1114        }
1115        s += n;
1116	continue;
1117
1118    utf8Error:
1119      if (utf8_decoding_error(&s, &p, errors, errmsg))
1120          goto onError;
1121    }
1122
1123    /* Adjust length */
1124    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1125        goto onError;
1126
1127    return (PyObject *)unicode;
1128
1129onError:
1130    Py_DECREF(unicode);
1131    return NULL;
1132}
1133
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the `errors` policy to a UTF-8 encoding problem.

   Returns -1 with an exception set for "strict" (or a NULL policy) and
   for unknown policy names; returns 0 for "ignore" and for "replace",
   the latter after writing '?' at *dest and advancing it.  `source` is
   not used. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
1167
1168PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1169			       int size,
1170			       const char *errors)
1171{
1172    PyObject *v;
1173    char *p;
1174    unsigned int cbAllocated = 2 * size;
1175    unsigned int cbWritten = 0;
1176    int i = 0;
1177
1178    v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
1179    if (v == NULL)
1180        return NULL;
1181    if (size == 0)
1182        return v;
1183
1184    p = PyString_AS_STRING(v);
1185    while (i < size) {
1186        Py_UCS4 ch = s[i++];
1187
1188        if (ch < 0x80) {
1189            *p++ = (char) ch;
1190            cbWritten++;
1191        }
1192
1193        else if (ch < 0x0800) {
1194            *p++ = (char)(0xc0 | (ch >> 6));
1195            *p++ = (char)(0x80 | (ch & 0x3f));
1196            cbWritten += 2;
1197        }
1198
1199        else {
1200
1201	    /* Assure that we have enough room for high order Unicode
1202	       ordinals */
1203	    if (cbWritten >= cbAllocated) {
1204		cbAllocated += 4 * 10;
1205		if (_PyString_Resize(&v, cbAllocated + 4))
1206		    goto onError;
1207		p = PyString_AS_STRING(v) + cbWritten;
1208	    }
1209
1210	    if (ch < 0x10000) {
1211		/* Check for high surrogate */
1212		if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1213		    Py_UCS4 ch2 = s[i];
1214		    /* Check for low surrogate */
1215		    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1216                        ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1217                        *p++ = (char)((ch >> 18) | 0xf0);
1218                        *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1219			*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1220			*p++ = (char)(0x80 | (ch & 0x3f));
1221                        i++;
1222                        cbWritten += 4;
1223			continue;
1224                    }
1225		    /* Fall through: handles isolated high surrogates */
1226                }
1227                *p++ = (char)(0xe0 | (ch >> 12));
1228		*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1229		*p++ = (char)(0x80 | (ch & 0x3f));
1230		cbWritten += 3;
1231
1232	    } else {
1233		*p++ = (char)(0xf0 | (ch>>18));
1234		*p++ = (char)(0x80 | ((ch>>12) & 0x3f));
1235		*p++ = (char)(0x80 | ((ch>>6) & 0x3f));
1236		*p++ = (char)(0x80 | (ch & 0x3f));
1237		cbWritten += 4;
1238	    }
1239	}
1240    }
1241    *p = '\0';
1242    if (_PyString_Resize(&v, cbWritten))
1243	goto onError;
1244    return v;
1245
1246 onError:
1247    Py_DECREF(v);
1248    return NULL;
1249}
1250
1251PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1252{
1253    if (!PyUnicode_Check(unicode)) {
1254        PyErr_BadArgument();
1255        return NULL;
1256    }
1257    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1258				PyUnicode_GET_SIZE(unicode),
1259				NULL);
1260}
1261
1262/* --- UTF-16 Codec ------------------------------------------------------- */
1263
1264static
1265int utf16_decoding_error(Py_UNICODE **dest,
1266			 const char *errors,
1267			 const char *details)
1268{
1269    if ((errors == NULL) ||
1270        (strcmp(errors,"strict") == 0)) {
1271        PyErr_Format(PyExc_UnicodeError,
1272                     "UTF-16 decoding error: %.400s",
1273                     details);
1274        return -1;
1275    }
1276    else if (strcmp(errors,"ignore") == 0) {
1277        return 0;
1278    }
1279    else if (strcmp(errors,"replace") == 0) {
1280	if (dest) {
1281	    **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1282	    (*dest)++;
1283	}
1284        return 0;
1285    }
1286    else {
1287        PyErr_Format(PyExc_ValueError,
1288                     "UTF-16 decoding error; "
1289		     "unknown error handling code: %.400s",
1290                     errors);
1291        return -1;
1292    }
1293}
1294
1295PyObject *
1296PyUnicode_DecodeUTF16(const char *s,
1297		      int size,
1298		      const char *errors,
1299		      int *byteorder)
1300{
1301    PyUnicodeObject *unicode;
1302    Py_UNICODE *p;
1303    const unsigned char *q, *e;
1304    int bo = 0;       /* assume native ordering by default */
1305    const char *errmsg = "";
1306    /* Offsets from q for retrieving byte pairs in the right order. */
1307#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1308    int ihi = 1, ilo = 0;
1309#else
1310    int ihi = 0, ilo = 1;
1311#endif
1312
1313    /* size should be an even number */
1314    if (size & 1) {
1315        if (utf16_decoding_error(NULL, errors, "truncated data"))
1316            return NULL;
1317        --size;  /* else ignore the oddball byte */
1318    }
1319
1320    /* Note: size will always be longer than the resulting Unicode
1321       character count */
1322    unicode = _PyUnicode_New(size);
1323    if (!unicode)
1324        return NULL;
1325    if (size == 0)
1326        return (PyObject *)unicode;
1327
1328    /* Unpack UTF-16 encoded data */
1329    p = unicode->str;
1330    q = (unsigned char *)s;
1331    e = q + size;
1332
1333    if (byteorder)
1334        bo = *byteorder;
1335
1336    /* Check for BOM marks (U+FEFF) in the input and adjust current
1337       byte order setting accordingly. In native mode, the leading BOM
1338       mark is skipped, in all other modes, it is copied to the output
1339       stream as-is (giving a ZWNBSP character). */
1340    if (bo == 0) {
1341        const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1342#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1343	if (bom == 0xFEFF) {
1344	    q += 2;
1345	    bo = -1;
1346	}
1347        else if (bom == 0xFFFE) {
1348	    q += 2;
1349	    bo = 1;
1350	}
1351#else
1352	if (bom == 0xFEFF) {
1353	    q += 2;
1354	    bo = 1;
1355	}
1356        else if (bom == 0xFFFE) {
1357	    q += 2;
1358	    bo = -1;
1359	}
1360#endif
1361    }
1362
1363    if (bo == -1) {
1364        /* force LE */
1365        ihi = 1;
1366        ilo = 0;
1367    }
1368    else if (bo == 1) {
1369        /* force BE */
1370        ihi = 0;
1371        ilo = 1;
1372    }
1373
1374    while (q < e) {
1375	Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1376	q += 2;
1377
1378	if (ch < 0xD800 || ch > 0xDFFF) {
1379	    *p++ = ch;
1380	    continue;
1381	}
1382
1383	/* UTF-16 code pair: */
1384	if (q >= e) {
1385	    errmsg = "unexpected end of data";
1386	    goto utf16Error;
1387	}
1388	if (0xD800 <= ch && ch <= 0xDBFF) {
1389	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1390	    q += 2;
1391	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1392#ifndef Py_UNICODE_WIDE
1393		*p++ = ch;
1394		*p++ = ch2;
1395#else
1396		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1397#endif
1398		continue;
1399	    }
1400	    else {
1401                errmsg = "illegal UTF-16 surrogate";
1402		goto utf16Error;
1403	    }
1404
1405	}
1406	errmsg = "illegal encoding";
1407	/* Fall through to report the error */
1408
1409    utf16Error:
1410	if (utf16_decoding_error(&p, errors, errmsg))
1411	    goto onError;
1412    }
1413
1414    if (byteorder)
1415        *byteorder = bo;
1416
1417    /* Adjust length */
1418    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1419        goto onError;
1420
1421    return (PyObject *)unicode;
1422
1423onError:
1424    Py_DECREF(unicode);
1425    return NULL;
1426}
1427
1428PyObject *
1429PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1430		      int size,
1431		      const char *errors,
1432		      int byteorder)
1433{
1434    PyObject *v;
1435    unsigned char *p;
1436    int i, pairs;
1437    /* Offsets from p for storing byte pairs in the right order. */
1438#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1439    int ihi = 1, ilo = 0;
1440#else
1441    int ihi = 0, ilo = 1;
1442#endif
1443
1444#define STORECHAR(CH)                   \
1445    do {                                \
1446        p[ihi] = ((CH) >> 8) & 0xff;    \
1447        p[ilo] = (CH) & 0xff;           \
1448        p += 2;                         \
1449    } while(0)
1450
1451    for (i = pairs = 0; i < size; i++)
1452	if (s[i] >= 0x10000)
1453	    pairs++;
1454    v = PyString_FromStringAndSize(NULL,
1455		  2 * (size + pairs + (byteorder == 0)));
1456    if (v == NULL)
1457        return NULL;
1458
1459    p = (unsigned char *)PyString_AS_STRING(v);
1460    if (byteorder == 0)
1461	STORECHAR(0xFEFF);
1462    if (size == 0)
1463        return v;
1464
1465    if (byteorder == -1) {
1466        /* force LE */
1467        ihi = 1;
1468        ilo = 0;
1469    }
1470    else if (byteorder == 1) {
1471        /* force BE */
1472        ihi = 0;
1473        ilo = 1;
1474    }
1475
1476    while (size-- > 0) {
1477	Py_UNICODE ch = *s++;
1478	Py_UNICODE ch2 = 0;
1479	if (ch >= 0x10000) {
1480	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1481	    ch  = 0xD800 | ((ch-0x10000) >> 10);
1482	}
1483        STORECHAR(ch);
1484        if (ch2)
1485            STORECHAR(ch2);
1486    }
1487    return v;
1488#undef STORECHAR
1489}
1490
1491PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1492{
1493    if (!PyUnicode_Check(unicode)) {
1494        PyErr_BadArgument();
1495        return NULL;
1496    }
1497    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1498				 PyUnicode_GET_SIZE(unicode),
1499				 NULL,
1500				 0);
1501}
1502
1503/* --- Unicode Escape Codec ----------------------------------------------- */
1504
1505static
1506int unicodeescape_decoding_error(const char **source,
1507                                 Py_UNICODE *x,
1508                                 const char *errors,
1509                                 const char *details)
1510{
1511    if ((errors == NULL) ||
1512        (strcmp(errors,"strict") == 0)) {
1513        PyErr_Format(PyExc_UnicodeError,
1514                     "Unicode-Escape decoding error: %.400s",
1515                     details);
1516        return -1;
1517    }
1518    else if (strcmp(errors,"ignore") == 0) {
1519        return 0;
1520    }
1521    else if (strcmp(errors,"replace") == 0) {
1522        *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1523        return 0;
1524    }
1525    else {
1526        PyErr_Format(PyExc_ValueError,
1527                     "Unicode-Escape decoding error; "
1528                     "unknown error handling code: %.400s",
1529                     errors);
1530        return -1;
1531    }
1532}
1533
/* Cached pointer to the character-name lookup API exported by the
   unicodedata module as its "ucnhash_CAPI" attribute.  It starts out
   NULL and is filled in by PyUnicode_DecodeUnicodeEscape() the first
   time a \N escape is decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1535
1536PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1537					int size,
1538					const char *errors)
1539{
1540    PyUnicodeObject *v;
1541    Py_UNICODE *p, *buf;
1542    const char *end;
1543    char* message;
1544    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1545
1546    /* Escaped strings will always be longer than the resulting
1547       Unicode string, so we start with size here and then reduce the
1548       length after conversion to the true value. */
1549    v = _PyUnicode_New(size);
1550    if (v == NULL)
1551        goto onError;
1552    if (size == 0)
1553        return (PyObject *)v;
1554
1555    p = buf = PyUnicode_AS_UNICODE(v);
1556    end = s + size;
1557
1558    while (s < end) {
1559        unsigned char c;
1560        Py_UNICODE x;
1561        int i, digits;
1562
1563        /* Non-escape characters are interpreted as Unicode ordinals */
1564        if (*s != '\\') {
1565            *p++ = (unsigned char) *s++;
1566            continue;
1567        }
1568
1569        /* \ - Escapes */
1570        s++;
1571        switch (*s++) {
1572
1573        /* \x escapes */
1574        case '\n': break;
1575        case '\\': *p++ = '\\'; break;
1576        case '\'': *p++ = '\''; break;
1577        case '\"': *p++ = '\"'; break;
1578        case 'b': *p++ = '\b'; break;
1579        case 'f': *p++ = '\014'; break; /* FF */
1580        case 't': *p++ = '\t'; break;
1581        case 'n': *p++ = '\n'; break;
1582        case 'r': *p++ = '\r'; break;
1583        case 'v': *p++ = '\013'; break; /* VT */
1584        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1585
1586        /* \OOO (octal) escapes */
1587        case '0': case '1': case '2': case '3':
1588        case '4': case '5': case '6': case '7':
1589            x = s[-1] - '0';
1590            if ('0' <= *s && *s <= '7') {
1591                x = (x<<3) + *s++ - '0';
1592                if ('0' <= *s && *s <= '7')
1593                    x = (x<<3) + *s++ - '0';
1594            }
1595            *p++ = x;
1596            break;
1597
1598        /* hex escapes */
1599        /* \xXX */
1600        case 'x':
1601            digits = 2;
1602            message = "truncated \\xXX escape";
1603            goto hexescape;
1604
1605        /* \uXXXX */
1606        case 'u':
1607            digits = 4;
1608            message = "truncated \\uXXXX escape";
1609            goto hexescape;
1610
1611        /* \UXXXXXXXX */
1612        case 'U':
1613            digits = 8;
1614            message = "truncated \\UXXXXXXXX escape";
1615        hexescape:
1616            chr = 0;
1617            for (i = 0; i < digits; i++) {
1618                c = (unsigned char) s[i];
1619                if (!isxdigit(c)) {
1620                    if (unicodeescape_decoding_error(&s, &x, errors, message))
1621                        goto onError;
1622                    chr = x;
1623                    i++;
1624                    break;
1625                }
1626                chr = (chr<<4) & ~0xF;
1627                if (c >= '0' && c <= '9')
1628                    chr += c - '0';
1629                else if (c >= 'a' && c <= 'f')
1630                    chr += 10 + c - 'a';
1631                else
1632                    chr += 10 + c - 'A';
1633            }
1634            s += i;
1635        store:
1636            /* when we get here, chr is a 32-bit unicode character */
1637            if (chr <= 0xffff)
1638                /* UCS-2 character */
1639                *p++ = (Py_UNICODE) chr;
1640            else if (chr <= 0x10ffff) {
1641                /* UCS-4 character. Either store directly, or as
1642		   surrogate pair. */
1643#ifdef Py_UNICODE_WIDE
1644                *p++ = chr;
1645#else
1646                chr -= 0x10000L;
1647                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1648                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1649#endif
1650            } else {
1651                if (unicodeescape_decoding_error(
1652                    &s, &x, errors,
1653                    "illegal Unicode character")
1654                    )
1655                    goto onError;
1656                *p++ = x; /* store replacement character */
1657            }
1658            break;
1659
1660        /* \N{name} */
1661        case 'N':
1662            message = "malformed \\N character escape";
1663            if (ucnhash_CAPI == NULL) {
1664                /* load the unicode data module */
1665                PyObject *m, *v;
1666                m = PyImport_ImportModule("unicodedata");
1667                if (m == NULL)
1668                    goto ucnhashError;
1669                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1670                Py_DECREF(m);
1671                if (v == NULL)
1672                    goto ucnhashError;
1673                ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1674                Py_DECREF(v);
1675                if (ucnhash_CAPI == NULL)
1676                    goto ucnhashError;
1677            }
1678            if (*s == '{') {
1679                const char *start = s+1;
1680                /* look for the closing brace */
1681                while (*s != '}' && s < end)
1682                    s++;
1683                if (s > start && s < end && *s == '}') {
1684                    /* found a name.  look it up in the unicode database */
1685                    message = "unknown Unicode character name";
1686                    s++;
1687                    if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1688                        goto store;
1689                }
1690            }
1691            if (unicodeescape_decoding_error(&s, &x, errors, message))
1692                goto onError;
1693            *p++ = x;
1694            break;
1695
1696        default:
1697            *p++ = '\\';
1698            *p++ = (unsigned char)s[-1];
1699            break;
1700        }
1701    }
1702    if (_PyUnicode_Resize(&v, (int)(p - buf)))
1703		goto onError;
1704    return (PyObject *)v;
1705
1706ucnhashError:
1707    PyErr_SetString(
1708        PyExc_UnicodeError,
1709        "\\N escapes not supported (can't load unicodedata module)"
1710        );
1711    return NULL;
1712
1713onError:
1714    Py_XDECREF(v);
1715    return NULL;
1716}
1717
1718/* Return a Unicode-Escape string version of the Unicode object.
1719
1720   If quotes is true, the string is enclosed in u"" or u'' quotes as
1721   appropriate.
1722
1723*/
1724
/* Forward declaration; the definition is not in this part of the file.
   unicodeescape_string() treats a non-NULL result as "ch occurs in the
   first `size` units of s" -- presumably the result points at the match;
   confirm against the definition. */
static const Py_UNICODE *findchar(const Py_UNICODE *s,
				  int size,
				  Py_UNICODE ch);
1728
1729static
1730PyObject *unicodeescape_string(const Py_UNICODE *s,
1731                               int size,
1732                               int quotes)
1733{
1734    PyObject *repr;
1735    char *p;
1736
1737    static const char *hexdigit = "0123456789abcdef";
1738
1739    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1740    if (repr == NULL)
1741        return NULL;
1742
1743    p = PyString_AS_STRING(repr);
1744
1745    if (quotes) {
1746        *p++ = 'u';
1747        *p++ = (findchar(s, size, '\'') &&
1748                !findchar(s, size, '"')) ? '"' : '\'';
1749    }
1750    while (size-- > 0) {
1751        Py_UNICODE ch = *s++;
1752
1753        /* Escape quotes */
1754        if (quotes &&
1755	    (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1756            *p++ = '\\';
1757            *p++ = (char) ch;
1758	    continue;
1759        }
1760
1761#ifdef Py_UNICODE_WIDE
1762        /* Map 21-bit characters to '\U00xxxxxx' */
1763        else if (ch >= 0x10000) {
1764	    int offset = p - PyString_AS_STRING(repr);
1765
1766	    /* Resize the string if necessary */
1767	    if (offset + 12 > PyString_GET_SIZE(repr)) {
1768		if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1769		    goto onError;
1770		p = PyString_AS_STRING(repr) + offset;
1771	    }
1772
1773            *p++ = '\\';
1774            *p++ = 'U';
1775            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1776            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1777            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1778            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1779            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1780            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1781            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1782            *p++ = hexdigit[ch & 0x0000000F];
1783	    continue;
1784        }
1785#endif
1786	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1787	else if (ch >= 0xD800 && ch < 0xDC00) {
1788	    Py_UNICODE ch2;
1789	    Py_UCS4 ucs;
1790
1791	    ch2 = *s++;
1792	    size--;
1793	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1794		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1795		*p++ = '\\';
1796		*p++ = 'U';
1797		*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1798		*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1799		*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1800		*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1801		*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1802		*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1803		*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1804		*p++ = hexdigit[ucs & 0x0000000F];
1805		continue;
1806	    }
1807	    /* Fall through: isolated surrogates are copied as-is */
1808	    s--;
1809	    size++;
1810	}
1811
1812        /* Map 16-bit characters to '\uxxxx' */
1813        if (ch >= 256) {
1814            *p++ = '\\';
1815            *p++ = 'u';
1816            *p++ = hexdigit[(ch >> 12) & 0x000F];
1817            *p++ = hexdigit[(ch >> 8) & 0x000F];
1818            *p++ = hexdigit[(ch >> 4) & 0x000F];
1819            *p++ = hexdigit[ch & 0x000F];
1820        }
1821
1822        /* Map special whitespace to '\t', \n', '\r' */
1823        else if (ch == '\t') {
1824            *p++ = '\\';
1825            *p++ = 't';
1826        }
1827        else if (ch == '\n') {
1828            *p++ = '\\';
1829            *p++ = 'n';
1830        }
1831        else if (ch == '\r') {
1832            *p++ = '\\';
1833            *p++ = 'r';
1834        }
1835
1836        /* Map non-printable US ASCII to '\xhh' */
1837        else if (ch < ' ' || ch >= 0x7F) {
1838            *p++ = '\\';
1839            *p++ = 'x';
1840            *p++ = hexdigit[(ch >> 4) & 0x000F];
1841            *p++ = hexdigit[ch & 0x000F];
1842        }
1843
1844        /* Copy everything else as-is */
1845        else
1846            *p++ = (char) ch;
1847    }
1848    if (quotes)
1849        *p++ = PyString_AS_STRING(repr)[1];
1850
1851    *p = '\0';
1852    if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
1853	goto onError;
1854
1855    return repr;
1856
1857 onError:
1858    Py_DECREF(repr);
1859    return NULL;
1860}
1861
1862PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1863					int size)
1864{
1865    return unicodeescape_string(s, size, 0);
1866}
1867
1868PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1869{
1870    if (!PyUnicode_Check(unicode)) {
1871        PyErr_BadArgument();
1872        return NULL;
1873    }
1874    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1875					 PyUnicode_GET_SIZE(unicode));
1876}
1877
1878/* --- Raw Unicode Escape Codec ------------------------------------------- */
1879
1880PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1881					   int size,
1882					   const char *errors)
1883{
1884    PyUnicodeObject *v;
1885    Py_UNICODE *p, *buf;
1886    const char *end;
1887    const char *bs;
1888
1889    /* Escaped strings will always be longer than the resulting
1890       Unicode string, so we start with size here and then reduce the
1891       length after conversion to the true value. */
1892    v = _PyUnicode_New(size);
1893    if (v == NULL)
1894	goto onError;
1895    if (size == 0)
1896	return (PyObject *)v;
1897    p = buf = PyUnicode_AS_UNICODE(v);
1898    end = s + size;
1899    while (s < end) {
1900	unsigned char c;
1901	Py_UNICODE x;
1902	int i;
1903
1904	/* Non-escape characters are interpreted as Unicode ordinals */
1905	if (*s != '\\') {
1906	    *p++ = (unsigned char)*s++;
1907	    continue;
1908	}
1909
1910	/* \u-escapes are only interpreted iff the number of leading
1911	   backslashes if odd */
1912	bs = s;
1913	for (;s < end;) {
1914	    if (*s != '\\')
1915		break;
1916	    *p++ = (unsigned char)*s++;
1917	}
1918	if (((s - bs) & 1) == 0 ||
1919	    s >= end ||
1920	    *s != 'u') {
1921	    continue;
1922	}
1923	p--;
1924	s++;
1925
1926	/* \uXXXX with 4 hex digits */
1927	for (x = 0, i = 0; i < 4; i++) {
1928	    c = (unsigned char)s[i];
1929	    if (!isxdigit(c)) {
1930		if (unicodeescape_decoding_error(&s, &x, errors,
1931						 "truncated \\uXXXX"))
1932		    goto onError;
1933		i++;
1934		break;
1935	    }
1936	    x = (x<<4) & ~0xF;
1937	    if (c >= '0' && c <= '9')
1938		x += c - '0';
1939	    else if (c >= 'a' && c <= 'f')
1940		x += 10 + c - 'a';
1941	    else
1942		x += 10 + c - 'A';
1943	}
1944	s += i;
1945	*p++ = x;
1946    }
1947    if (_PyUnicode_Resize(&v, (int)(p - buf)))
1948	goto onError;
1949    return (PyObject *)v;
1950
1951 onError:
1952    Py_XDECREF(v);
1953    return NULL;
1954}
1955
1956PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1957					   int size)
1958{
1959    PyObject *repr;
1960    char *p;
1961    char *q;
1962
1963    static const char *hexdigit = "0123456789abcdef";
1964
1965    repr = PyString_FromStringAndSize(NULL, 6 * size);
1966    if (repr == NULL)
1967        return NULL;
1968    if (size == 0)
1969	return repr;
1970
1971    p = q = PyString_AS_STRING(repr);
1972    while (size-- > 0) {
1973        Py_UNICODE ch = *s++;
1974	/* Map 16-bit characters to '\uxxxx' */
1975	if (ch >= 256) {
1976            *p++ = '\\';
1977            *p++ = 'u';
1978            *p++ = hexdigit[(ch >> 12) & 0xf];
1979            *p++ = hexdigit[(ch >> 8) & 0xf];
1980            *p++ = hexdigit[(ch >> 4) & 0xf];
1981            *p++ = hexdigit[ch & 15];
1982        }
1983	/* Copy everything else as-is */
1984	else
1985            *p++ = (char) ch;
1986    }
1987    *p = '\0';
1988    if (_PyString_Resize(&repr, p - q))
1989	goto onError;
1990
1991    return repr;
1992
1993 onError:
1994    Py_DECREF(repr);
1995    return NULL;
1996}
1997
1998PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1999{
2000    if (!PyUnicode_Check(unicode)) {
2001	PyErr_BadArgument();
2002	return NULL;
2003    }
2004    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2005					    PyUnicode_GET_SIZE(unicode));
2006}
2007
2008/* --- Latin-1 Codec ------------------------------------------------------ */
2009
2010PyObject *PyUnicode_DecodeLatin1(const char *s,
2011				 int size,
2012				 const char *errors)
2013{
2014    PyUnicodeObject *v;
2015    Py_UNICODE *p;
2016
2017    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2018    if (size == 1 && *(unsigned char*)s < 256) {
2019	Py_UNICODE r = *(unsigned char*)s;
2020	return PyUnicode_FromUnicode(&r, 1);
2021    }
2022
2023    v = _PyUnicode_New(size);
2024    if (v == NULL)
2025	goto onError;
2026    if (size == 0)
2027	return (PyObject *)v;
2028    p = PyUnicode_AS_UNICODE(v);
2029    while (size-- > 0)
2030	*p++ = (unsigned char)*s++;
2031    return (PyObject *)v;
2032
2033 onError:
2034    Py_XDECREF(v);
2035    return NULL;
2036}
2037
2038static
2039int latin1_encoding_error(const Py_UNICODE **source,
2040			  char **dest,
2041			  const char *errors,
2042			  const char *details)
2043{
2044    if ((errors == NULL) ||
2045	(strcmp(errors,"strict") == 0)) {
2046	PyErr_Format(PyExc_UnicodeError,
2047		     "Latin-1 encoding error: %.400s",
2048		     details);
2049	return -1;
2050    }
2051    else if (strcmp(errors,"ignore") == 0) {
2052	return 0;
2053    }
2054    else if (strcmp(errors,"replace") == 0) {
2055	**dest = '?';
2056	(*dest)++;
2057	return 0;
2058    }
2059    else {
2060	PyErr_Format(PyExc_ValueError,
2061		     "Latin-1 encoding error; "
2062		     "unknown error handling code: %.400s",
2063		     errors);
2064	return -1;
2065    }
2066}
2067
2068PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2069				 int size,
2070				 const char *errors)
2071{
2072    PyObject *repr;
2073    char *s, *start;
2074
2075    repr = PyString_FromStringAndSize(NULL, size);
2076    if (repr == NULL)
2077        return NULL;
2078    if (size == 0)
2079	return repr;
2080
2081    s = PyString_AS_STRING(repr);
2082    start = s;
2083    while (size-- > 0) {
2084        Py_UNICODE ch = *p++;
2085	if (ch >= 256) {
2086	    if (latin1_encoding_error(&p, &s, errors,
2087				      "ordinal not in range(256)"))
2088		goto onError;
2089	}
2090	else
2091            *s++ = (char)ch;
2092    }
2093    /* Resize if error handling skipped some characters */
2094    if (s - start < PyString_GET_SIZE(repr))
2095	if (_PyString_Resize(&repr, s - start))
2096	    goto onError;
2097    return repr;
2098
2099 onError:
2100    Py_DECREF(repr);
2101    return NULL;
2102}
2103
2104PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2105{
2106    if (!PyUnicode_Check(unicode)) {
2107	PyErr_BadArgument();
2108	return NULL;
2109    }
2110    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2111				  PyUnicode_GET_SIZE(unicode),
2112				  NULL);
2113}
2114
2115/* --- 7-bit ASCII Codec -------------------------------------------------- */
2116
2117static
2118int ascii_decoding_error(const char **source,
2119			 Py_UNICODE **dest,
2120			 const char *errors,
2121			 const char *details)
2122{
2123    if ((errors == NULL) ||
2124	(strcmp(errors,"strict") == 0)) {
2125	PyErr_Format(PyExc_UnicodeError,
2126		     "ASCII decoding error: %.400s",
2127		     details);
2128	return -1;
2129    }
2130    else if (strcmp(errors,"ignore") == 0) {
2131	return 0;
2132    }
2133    else if (strcmp(errors,"replace") == 0) {
2134	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2135	(*dest)++;
2136	return 0;
2137    }
2138    else {
2139	PyErr_Format(PyExc_ValueError,
2140		     "ASCII decoding error; "
2141		     "unknown error handling code: %.400s",
2142		     errors);
2143	return -1;
2144    }
2145}
2146
2147PyObject *PyUnicode_DecodeASCII(const char *s,
2148				int size,
2149				const char *errors)
2150{
2151    PyUnicodeObject *v;
2152    Py_UNICODE *p;
2153
2154    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2155    if (size == 1 && *(unsigned char*)s < 128) {
2156	Py_UNICODE r = *(unsigned char*)s;
2157	return PyUnicode_FromUnicode(&r, 1);
2158    }
2159
2160    v = _PyUnicode_New(size);
2161    if (v == NULL)
2162	goto onError;
2163    if (size == 0)
2164	return (PyObject *)v;
2165    p = PyUnicode_AS_UNICODE(v);
2166    while (size-- > 0) {
2167	register unsigned char c;
2168
2169	c = (unsigned char)*s++;
2170	if (c < 128)
2171	    *p++ = c;
2172	else if (ascii_decoding_error(&s, &p, errors,
2173				      "ordinal not in range(128)"))
2174		goto onError;
2175    }
2176    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2177	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2178	    goto onError;
2179    return (PyObject *)v;
2180
2181 onError:
2182    Py_XDECREF(v);
2183    return NULL;
2184}
2185
2186static
2187int ascii_encoding_error(const Py_UNICODE **source,
2188			 char **dest,
2189			 const char *errors,
2190			 const char *details)
2191{
2192    if ((errors == NULL) ||
2193	(strcmp(errors,"strict") == 0)) {
2194	PyErr_Format(PyExc_UnicodeError,
2195		     "ASCII encoding error: %.400s",
2196		     details);
2197	return -1;
2198    }
2199    else if (strcmp(errors,"ignore") == 0) {
2200	return 0;
2201    }
2202    else if (strcmp(errors,"replace") == 0) {
2203	**dest = '?';
2204	(*dest)++;
2205	return 0;
2206    }
2207    else {
2208	PyErr_Format(PyExc_ValueError,
2209		     "ASCII encoding error; "
2210		     "unknown error handling code: %.400s",
2211		     errors);
2212	return -1;
2213    }
2214}
2215
2216PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2217				int size,
2218				const char *errors)
2219{
2220    PyObject *repr;
2221    char *s, *start;
2222
2223    repr = PyString_FromStringAndSize(NULL, size);
2224    if (repr == NULL)
2225        return NULL;
2226    if (size == 0)
2227	return repr;
2228
2229    s = PyString_AS_STRING(repr);
2230    start = s;
2231    while (size-- > 0) {
2232        Py_UNICODE ch = *p++;
2233	if (ch >= 128) {
2234	    if (ascii_encoding_error(&p, &s, errors,
2235				      "ordinal not in range(128)"))
2236		goto onError;
2237	}
2238	else
2239            *s++ = (char)ch;
2240    }
2241    /* Resize if error handling skipped some characters */
2242    if (s - start < PyString_GET_SIZE(repr))
2243	if (_PyString_Resize(&repr, s - start))
2244	    goto onError;
2245    return repr;
2246
2247 onError:
2248    Py_DECREF(repr);
2249    return NULL;
2250}
2251
2252PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2253{
2254    if (!PyUnicode_Check(unicode)) {
2255	PyErr_BadArgument();
2256	return NULL;
2257    }
2258    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2259				 PyUnicode_GET_SIZE(unicode),
2260				 NULL);
2261}
2262
2263#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
2264
2265/* --- MBCS codecs for Windows -------------------------------------------- */
2266
2267PyObject *PyUnicode_DecodeMBCS(const char *s,
2268				int size,
2269				const char *errors)
2270{
2271    PyUnicodeObject *v;
2272    Py_UNICODE *p;
2273
2274    /* First get the size of the result */
2275    DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2276    if (size > 0 && usize==0)
2277        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2278
2279    v = _PyUnicode_New(usize);
2280    if (v == NULL)
2281        return NULL;
2282    if (usize == 0)
2283	return (PyObject *)v;
2284    p = PyUnicode_AS_UNICODE(v);
2285    if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2286        Py_DECREF(v);
2287        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2288    }
2289
2290    return (PyObject *)v;
2291}
2292
2293PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2294				int size,
2295				const char *errors)
2296{
2297    PyObject *repr;
2298    char *s;
2299    DWORD mbcssize;
2300
2301    /* If there are no characters, bail now! */
2302    if (size==0)
2303	    return PyString_FromString("");
2304
2305    /* First get the size of the result */
2306    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2307    if (mbcssize==0)
2308        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2309
2310    repr = PyString_FromStringAndSize(NULL, mbcssize);
2311    if (repr == NULL)
2312        return NULL;
2313    if (mbcssize == 0)
2314        return repr;
2315
2316    /* Do the conversion */
2317    s = PyString_AS_STRING(repr);
2318    if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2319        Py_DECREF(repr);
2320        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2321    }
2322    return repr;
2323}
2324
2325#endif /* MS_WIN32 */
2326
2327/* --- Character Mapping Codec -------------------------------------------- */
2328
2329static
2330int charmap_decoding_error(const char **source,
2331			 Py_UNICODE **dest,
2332			 const char *errors,
2333			 const char *details)
2334{
2335    if ((errors == NULL) ||
2336	(strcmp(errors,"strict") == 0)) {
2337	PyErr_Format(PyExc_UnicodeError,
2338		     "charmap decoding error: %.400s",
2339		     details);
2340	return -1;
2341    }
2342    else if (strcmp(errors,"ignore") == 0) {
2343	return 0;
2344    }
2345    else if (strcmp(errors,"replace") == 0) {
2346	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2347	(*dest)++;
2348	return 0;
2349    }
2350    else {
2351	PyErr_Format(PyExc_ValueError,
2352		     "charmap decoding error; "
2353		     "unknown error handling code: %.400s",
2354		     errors);
2355	return -1;
2356    }
2357}
2358
2359PyObject *PyUnicode_DecodeCharmap(const char *s,
2360				  int size,
2361				  PyObject *mapping,
2362				  const char *errors)
2363{
2364    PyUnicodeObject *v;
2365    Py_UNICODE *p;
2366    int extrachars = 0;
2367
2368    /* Default to Latin-1 */
2369    if (mapping == NULL)
2370	return PyUnicode_DecodeLatin1(s, size, errors);
2371
2372    v = _PyUnicode_New(size);
2373    if (v == NULL)
2374	goto onError;
2375    if (size == 0)
2376	return (PyObject *)v;
2377    p = PyUnicode_AS_UNICODE(v);
2378    while (size-- > 0) {
2379	unsigned char ch = *s++;
2380	PyObject *w, *x;
2381
2382	/* Get mapping (char ordinal -> integer, Unicode char or None) */
2383	w = PyInt_FromLong((long)ch);
2384	if (w == NULL)
2385	    goto onError;
2386	x = PyObject_GetItem(mapping, w);
2387	Py_DECREF(w);
2388	if (x == NULL) {
2389	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2390		/* No mapping found means: mapping is undefined. */
2391		PyErr_Clear();
2392		x = Py_None;
2393		Py_INCREF(x);
2394	    } else
2395		goto onError;
2396	}
2397
2398	/* Apply mapping */
2399	if (PyInt_Check(x)) {
2400	    long value = PyInt_AS_LONG(x);
2401	    if (value < 0 || value > 65535) {
2402		PyErr_SetString(PyExc_TypeError,
2403				"character mapping must be in range(65536)");
2404		Py_DECREF(x);
2405		goto onError;
2406	    }
2407	    *p++ = (Py_UNICODE)value;
2408	}
2409	else if (x == Py_None) {
2410	    /* undefined mapping */
2411	    if (charmap_decoding_error(&s, &p, errors,
2412				       "character maps to <undefined>")) {
2413		Py_DECREF(x);
2414		goto onError;
2415	    }
2416	}
2417	else if (PyUnicode_Check(x)) {
2418	    int targetsize = PyUnicode_GET_SIZE(x);
2419
2420	    if (targetsize == 1)
2421		/* 1-1 mapping */
2422		*p++ = *PyUnicode_AS_UNICODE(x);
2423
2424	    else if (targetsize > 1) {
2425		/* 1-n mapping */
2426		if (targetsize > extrachars) {
2427		    /* resize first */
2428		    int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2429		    int needed = (targetsize - extrachars) + \
2430			         (targetsize << 2);
2431		    extrachars += needed;
2432		    if (_PyUnicode_Resize(&v,
2433					 PyUnicode_GET_SIZE(v) + needed)) {
2434			Py_DECREF(x);
2435			goto onError;
2436		    }
2437		    p = PyUnicode_AS_UNICODE(v) + oldpos;
2438		}
2439		Py_UNICODE_COPY(p,
2440				PyUnicode_AS_UNICODE(x),
2441				targetsize);
2442		p += targetsize;
2443		extrachars -= targetsize;
2444	    }
2445	    /* 1-0 mapping: skip the character */
2446	}
2447	else {
2448	    /* wrong return value */
2449	    PyErr_SetString(PyExc_TypeError,
2450		  "character mapping must return integer, None or unicode");
2451	    Py_DECREF(x);
2452	    goto onError;
2453	}
2454	Py_DECREF(x);
2455    }
2456    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2457	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2458	    goto onError;
2459    return (PyObject *)v;
2460
2461 onError:
2462    Py_XDECREF(v);
2463    return NULL;
2464}
2465
2466static
2467int charmap_encoding_error(const Py_UNICODE **source,
2468			   char **dest,
2469			   const char *errors,
2470			   const char *details)
2471{
2472    if ((errors == NULL) ||
2473	(strcmp(errors,"strict") == 0)) {
2474	PyErr_Format(PyExc_UnicodeError,
2475		     "charmap encoding error: %.400s",
2476		     details);
2477	return -1;
2478    }
2479    else if (strcmp(errors,"ignore") == 0) {
2480	return 0;
2481    }
2482    else if (strcmp(errors,"replace") == 0) {
2483	**dest = '?';
2484	(*dest)++;
2485	return 0;
2486    }
2487    else {
2488	PyErr_Format(PyExc_ValueError,
2489		     "charmap encoding error; "
2490		     "unknown error handling code: %.400s",
2491		     errors);
2492	return -1;
2493    }
2494}
2495
2496PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2497				  int size,
2498				  PyObject *mapping,
2499				  const char *errors)
2500{
2501    PyObject *v;
2502    char *s;
2503    int extrachars = 0;
2504
2505    /* Default to Latin-1 */
2506    if (mapping == NULL)
2507	return PyUnicode_EncodeLatin1(p, size, errors);
2508
2509    v = PyString_FromStringAndSize(NULL, size);
2510    if (v == NULL)
2511        return NULL;
2512    if (size == 0)
2513	return v;
2514    s = PyString_AS_STRING(v);
2515    while (size-- > 0) {
2516	Py_UNICODE ch = *p++;
2517	PyObject *w, *x;
2518
2519	/* Get mapping (Unicode ordinal -> string char, integer or None) */
2520	w = PyInt_FromLong((long)ch);
2521	if (w == NULL)
2522	    goto onError;
2523	x = PyObject_GetItem(mapping, w);
2524	Py_DECREF(w);
2525	if (x == NULL) {
2526	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2527		/* No mapping found means: mapping is undefined. */
2528		PyErr_Clear();
2529		x = Py_None;
2530		Py_INCREF(x);
2531	    } else
2532		goto onError;
2533	}
2534
2535	/* Apply mapping */
2536	if (PyInt_Check(x)) {
2537	    long value = PyInt_AS_LONG(x);
2538	    if (value < 0 || value > 255) {
2539		PyErr_SetString(PyExc_TypeError,
2540				"character mapping must be in range(256)");
2541		Py_DECREF(x);
2542		goto onError;
2543	    }
2544	    *s++ = (char)value;
2545	}
2546	else if (x == Py_None) {
2547	    /* undefined mapping */
2548	    if (charmap_encoding_error(&p, &s, errors,
2549				       "character maps to <undefined>")) {
2550		Py_DECREF(x);
2551		goto onError;
2552	    }
2553	}
2554	else if (PyString_Check(x)) {
2555	    int targetsize = PyString_GET_SIZE(x);
2556
2557	    if (targetsize == 1)
2558		/* 1-1 mapping */
2559		*s++ = *PyString_AS_STRING(x);
2560
2561	    else if (targetsize > 1) {
2562		/* 1-n mapping */
2563		if (targetsize > extrachars) {
2564		    /* resize first */
2565		    int oldpos = (int)(s - PyString_AS_STRING(v));
2566		    int needed = (targetsize - extrachars) + \
2567			         (targetsize << 2);
2568		    extrachars += needed;
2569		    if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2570			Py_DECREF(x);
2571			goto onError;
2572		    }
2573		    s = PyString_AS_STRING(v) + oldpos;
2574		}
2575		memcpy(s, PyString_AS_STRING(x), targetsize);
2576		s += targetsize;
2577		extrachars -= targetsize;
2578	    }
2579	    /* 1-0 mapping: skip the character */
2580	}
2581	else {
2582	    /* wrong return value */
2583	    PyErr_SetString(PyExc_TypeError,
2584		  "character mapping must return integer, None or unicode");
2585	    Py_DECREF(x);
2586	    goto onError;
2587	}
2588	Py_DECREF(x);
2589    }
2590    if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2591	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2592	    goto onError;
2593    return v;
2594
2595 onError:
2596    Py_DECREF(v);
2597    return NULL;
2598}
2599
2600PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2601				    PyObject *mapping)
2602{
2603    if (!PyUnicode_Check(unicode) || mapping == NULL) {
2604	PyErr_BadArgument();
2605	return NULL;
2606    }
2607    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2608				   PyUnicode_GET_SIZE(unicode),
2609				   mapping,
2610				   NULL);
2611}
2612
2613static
2614int translate_error(const Py_UNICODE **source,
2615		    Py_UNICODE **dest,
2616		    const char *errors,
2617		    const char *details)
2618{
2619    if ((errors == NULL) ||
2620	(strcmp(errors,"strict") == 0)) {
2621	PyErr_Format(PyExc_UnicodeError,
2622		     "translate error: %.400s",
2623		     details);
2624	return -1;
2625    }
2626    else if (strcmp(errors,"ignore") == 0) {
2627	return 0;
2628    }
2629    else if (strcmp(errors,"replace") == 0) {
2630	**dest = '?';
2631	(*dest)++;
2632	return 0;
2633    }
2634    else {
2635	PyErr_Format(PyExc_ValueError,
2636		     "translate error; "
2637		     "unknown error handling code: %.400s",
2638		     errors);
2639	return -1;
2640    }
2641}
2642
2643PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2644				     int size,
2645				     PyObject *mapping,
2646				     const char *errors)
2647{
2648    PyUnicodeObject *v;
2649    Py_UNICODE *p;
2650
2651    if (mapping == NULL) {
2652	PyErr_BadArgument();
2653	return NULL;
2654    }
2655
2656    /* Output will never be longer than input */
2657    v = _PyUnicode_New(size);
2658    if (v == NULL)
2659	goto onError;
2660    if (size == 0)
2661	goto done;
2662    p = PyUnicode_AS_UNICODE(v);
2663    while (size-- > 0) {
2664	Py_UNICODE ch = *s++;
2665	PyObject *w, *x;
2666
2667	/* Get mapping */
2668	w = PyInt_FromLong(ch);
2669	if (w == NULL)
2670	    goto onError;
2671	x = PyObject_GetItem(mapping, w);
2672	Py_DECREF(w);
2673	if (x == NULL) {
2674	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2675		/* No mapping found: default to 1-1 mapping */
2676		PyErr_Clear();
2677		*p++ = ch;
2678		continue;
2679	    }
2680	    goto onError;
2681	}
2682
2683	/* Apply mapping */
2684	if (PyInt_Check(x))
2685	    *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2686	else if (x == Py_None) {
2687	    /* undefined mapping */
2688	    if (translate_error(&s, &p, errors,
2689				"character maps to <undefined>")) {
2690		Py_DECREF(x);
2691		goto onError;
2692	    }
2693	}
2694	else if (PyUnicode_Check(x)) {
2695	    if (PyUnicode_GET_SIZE(x) != 1) {
2696		/* 1-n mapping */
2697		PyErr_SetString(PyExc_NotImplementedError,
2698				"1-n mappings are currently not implemented");
2699		Py_DECREF(x);
2700		goto onError;
2701	    }
2702	    *p++ = *PyUnicode_AS_UNICODE(x);
2703	}
2704	else {
2705	    /* wrong return value */
2706	    PyErr_SetString(PyExc_TypeError,
2707		  "translate mapping must return integer, None or unicode");
2708	    Py_DECREF(x);
2709	    goto onError;
2710	}
2711	Py_DECREF(x);
2712    }
2713    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2714	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2715	    goto onError;
2716
2717 done:
2718    return (PyObject *)v;
2719
2720 onError:
2721    Py_XDECREF(v);
2722    return NULL;
2723}
2724
2725PyObject *PyUnicode_Translate(PyObject *str,
2726			      PyObject *mapping,
2727			      const char *errors)
2728{
2729    PyObject *result;
2730
2731    str = PyUnicode_FromObject(str);
2732    if (str == NULL)
2733	goto onError;
2734    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2735					PyUnicode_GET_SIZE(str),
2736					mapping,
2737					errors);
2738    Py_DECREF(str);
2739    return result;
2740
2741 onError:
2742    Py_XDECREF(str);
2743    return NULL;
2744}
2745
2746/* --- Decimal Encoder ---------------------------------------------------- */
2747
2748int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2749			    int length,
2750			    char *output,
2751			    const char *errors)
2752{
2753    Py_UNICODE *p, *end;
2754
2755    if (output == NULL) {
2756	PyErr_BadArgument();
2757	return -1;
2758    }
2759
2760    p = s;
2761    end = s + length;
2762    while (p < end) {
2763	register Py_UNICODE ch = *p++;
2764	int decimal;
2765
2766	if (Py_UNICODE_ISSPACE(ch)) {
2767	    *output++ = ' ';
2768	    continue;
2769	}
2770	decimal = Py_UNICODE_TODECIMAL(ch);
2771	if (decimal >= 0) {
2772	    *output++ = '0' + decimal;
2773	    continue;
2774	}
2775	if (0 < ch && ch < 256) {
2776	    *output++ = (char)ch;
2777	    continue;
2778	}
2779	/* All other characters are considered invalid */
2780	if (errors == NULL || strcmp(errors, "strict") == 0) {
2781	    PyErr_SetString(PyExc_ValueError,
2782			    "invalid decimal Unicode string");
2783	    goto onError;
2784	}
2785	else if (strcmp(errors, "ignore") == 0)
2786	    continue;
2787	else if (strcmp(errors, "replace") == 0) {
2788	    *output++ = '?';
2789	    continue;
2790	}
2791    }
2792    /* 0-terminate the output string */
2793    *output++ = '\0';
2794    return 0;
2795
2796 onError:
2797    return -1;
2798}
2799
2800/* --- Helpers ------------------------------------------------------------ */
2801
2802static
2803int count(PyUnicodeObject *self,
2804	  int start,
2805	  int end,
2806	  PyUnicodeObject *substring)
2807{
2808    int count = 0;
2809
2810    if (start < 0)
2811        start += self->length;
2812    if (start < 0)
2813        start = 0;
2814    if (end > self->length)
2815        end = self->length;
2816    if (end < 0)
2817        end += self->length;
2818    if (end < 0)
2819        end = 0;
2820
2821    if (substring->length == 0)
2822	return (end - start + 1);
2823
2824    end -= substring->length;
2825
2826    while (start <= end)
2827        if (Py_UNICODE_MATCH(self, start, substring)) {
2828            count++;
2829            start += substring->length;
2830        } else
2831            start++;
2832
2833    return count;
2834}
2835
2836int PyUnicode_Count(PyObject *str,
2837		    PyObject *substr,
2838		    int start,
2839		    int end)
2840{
2841    int result;
2842
2843    str = PyUnicode_FromObject(str);
2844    if (str == NULL)
2845	return -1;
2846    substr = PyUnicode_FromObject(substr);
2847    if (substr == NULL) {
2848	Py_DECREF(str);
2849	return -1;
2850    }
2851
2852    result = count((PyUnicodeObject *)str,
2853		   start, end,
2854		   (PyUnicodeObject *)substr);
2855
2856    Py_DECREF(str);
2857    Py_DECREF(substr);
2858    return result;
2859}
2860
2861static
2862int findstring(PyUnicodeObject *self,
2863	       PyUnicodeObject *substring,
2864	       int start,
2865	       int end,
2866	       int direction)
2867{
2868    if (start < 0)
2869        start += self->length;
2870    if (start < 0)
2871        start = 0;
2872
2873    if (substring->length == 0)
2874        return start;
2875
2876    if (end > self->length)
2877        end = self->length;
2878    if (end < 0)
2879        end += self->length;
2880    if (end < 0)
2881        end = 0;
2882
2883    end -= substring->length;
2884
2885    if (direction < 0) {
2886        for (; end >= start; end--)
2887            if (Py_UNICODE_MATCH(self, end, substring))
2888                return end;
2889    } else {
2890        for (; start <= end; start++)
2891            if (Py_UNICODE_MATCH(self, start, substring))
2892                return start;
2893    }
2894
2895    return -1;
2896}
2897
2898int PyUnicode_Find(PyObject *str,
2899		   PyObject *substr,
2900		   int start,
2901		   int end,
2902		   int direction)
2903{
2904    int result;
2905
2906    str = PyUnicode_FromObject(str);
2907    if (str == NULL)
2908	return -1;
2909    substr = PyUnicode_FromObject(substr);
2910    if (substr == NULL) {
2911	Py_DECREF(substr);
2912	return -1;
2913    }
2914
2915    result = findstring((PyUnicodeObject *)str,
2916			(PyUnicodeObject *)substr,
2917			start, end, direction);
2918    Py_DECREF(str);
2919    Py_DECREF(substr);
2920    return result;
2921}
2922
2923static
2924int tailmatch(PyUnicodeObject *self,
2925	      PyUnicodeObject *substring,
2926	      int start,
2927	      int end,
2928	      int direction)
2929{
2930    if (start < 0)
2931        start += self->length;
2932    if (start < 0)
2933        start = 0;
2934
2935    if (substring->length == 0)
2936        return 1;
2937
2938    if (end > self->length)
2939        end = self->length;
2940    if (end < 0)
2941        end += self->length;
2942    if (end < 0)
2943        end = 0;
2944
2945    end -= substring->length;
2946    if (end < start)
2947	return 0;
2948
2949    if (direction > 0) {
2950	if (Py_UNICODE_MATCH(self, end, substring))
2951	    return 1;
2952    } else {
2953        if (Py_UNICODE_MATCH(self, start, substring))
2954	    return 1;
2955    }
2956
2957    return 0;
2958}
2959
2960int PyUnicode_Tailmatch(PyObject *str,
2961			PyObject *substr,
2962			int start,
2963			int end,
2964			int direction)
2965{
2966    int result;
2967
2968    str = PyUnicode_FromObject(str);
2969    if (str == NULL)
2970	return -1;
2971    substr = PyUnicode_FromObject(substr);
2972    if (substr == NULL) {
2973	Py_DECREF(substr);
2974	return -1;
2975    }
2976
2977    result = tailmatch((PyUnicodeObject *)str,
2978		       (PyUnicodeObject *)substr,
2979		       start, end, direction);
2980    Py_DECREF(str);
2981    Py_DECREF(substr);
2982    return result;
2983}
2984
2985static
2986const Py_UNICODE *findchar(const Py_UNICODE *s,
2987		     int size,
2988		     Py_UNICODE ch)
2989{
2990    /* like wcschr, but doesn't stop at NULL characters */
2991
2992    while (size-- > 0) {
2993        if (*s == ch)
2994            return s;
2995        s++;
2996    }
2997
2998    return NULL;
2999}
3000
3001/* Apply fixfct filter to the Unicode object self and return a
3002   reference to the modified object */
3003
3004static
3005PyObject *fixup(PyUnicodeObject *self,
3006		int (*fixfct)(PyUnicodeObject *s))
3007{
3008
3009    PyUnicodeObject *u;
3010
3011    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3012    if (u == NULL)
3013	return NULL;
3014
3015    Py_UNICODE_COPY(u->str, self->str, self->length);
3016
3017    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3018	/* fixfct should return TRUE if it modified the buffer. If
3019	   FALSE, return a reference to the original buffer instead
3020	   (to save space, not time) */
3021	Py_INCREF(self);
3022	Py_DECREF(u);
3023	return (PyObject*) self;
3024    }
3025    return (PyObject*) u;
3026}
3027
3028static
3029int fixupper(PyUnicodeObject *self)
3030{
3031    int len = self->length;
3032    Py_UNICODE *s = self->str;
3033    int status = 0;
3034
3035    while (len-- > 0) {
3036	register Py_UNICODE ch;
3037
3038	ch = Py_UNICODE_TOUPPER(*s);
3039	if (ch != *s) {
3040            status = 1;
3041	    *s = ch;
3042	}
3043        s++;
3044    }
3045
3046    return status;
3047}
3048
3049static
3050int fixlower(PyUnicodeObject *self)
3051{
3052    int len = self->length;
3053    Py_UNICODE *s = self->str;
3054    int status = 0;
3055
3056    while (len-- > 0) {
3057	register Py_UNICODE ch;
3058
3059	ch = Py_UNICODE_TOLOWER(*s);
3060	if (ch != *s) {
3061            status = 1;
3062	    *s = ch;
3063	}
3064        s++;
3065    }
3066
3067    return status;
3068}
3069
3070static
3071int fixswapcase(PyUnicodeObject *self)
3072{
3073    int len = self->length;
3074    Py_UNICODE *s = self->str;
3075    int status = 0;
3076
3077    while (len-- > 0) {
3078        if (Py_UNICODE_ISUPPER(*s)) {
3079            *s = Py_UNICODE_TOLOWER(*s);
3080            status = 1;
3081        } else if (Py_UNICODE_ISLOWER(*s)) {
3082            *s = Py_UNICODE_TOUPPER(*s);
3083            status = 1;
3084        }
3085        s++;
3086    }
3087
3088    return status;
3089}
3090
3091static
3092int fixcapitalize(PyUnicodeObject *self)
3093{
3094    int len = self->length;
3095    Py_UNICODE *s = self->str;
3096    int status = 0;
3097
3098    if (len == 0)
3099	return 0;
3100    if (Py_UNICODE_ISLOWER(*s)) {
3101	*s = Py_UNICODE_TOUPPER(*s);
3102	status = 1;
3103    }
3104    s++;
3105    while (--len > 0) {
3106        if (Py_UNICODE_ISUPPER(*s)) {
3107            *s = Py_UNICODE_TOLOWER(*s);
3108            status = 1;
3109        }
3110        s++;
3111    }
3112    return status;
3113}
3114
3115static
3116int fixtitle(PyUnicodeObject *self)
3117{
3118    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3119    register Py_UNICODE *e;
3120    int previous_is_cased;
3121
3122    /* Shortcut for single character strings */
3123    if (PyUnicode_GET_SIZE(self) == 1) {
3124	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3125	if (*p != ch) {
3126	    *p = ch;
3127	    return 1;
3128	}
3129	else
3130	    return 0;
3131    }
3132
3133    e = p + PyUnicode_GET_SIZE(self);
3134    previous_is_cased = 0;
3135    for (; p < e; p++) {
3136	register const Py_UNICODE ch = *p;
3137
3138	if (previous_is_cased)
3139	    *p = Py_UNICODE_TOLOWER(ch);
3140	else
3141	    *p = Py_UNICODE_TOTITLE(ch);
3142
3143	if (Py_UNICODE_ISLOWER(ch) ||
3144	    Py_UNICODE_ISUPPER(ch) ||
3145	    Py_UNICODE_ISTITLE(ch))
3146	    previous_is_cased = 1;
3147	else
3148	    previous_is_cased = 0;
3149    }
3150    return 1;
3151}
3152
3153PyObject *PyUnicode_Join(PyObject *separator,
3154			 PyObject *seq)
3155{
3156    Py_UNICODE *sep;
3157    int seplen;
3158    PyUnicodeObject *res = NULL;
3159    int reslen = 0;
3160    Py_UNICODE *p;
3161    int sz = 100;
3162    int i;
3163    PyObject *it;
3164
3165    it = PyObject_GetIter(seq);
3166    if (it == NULL)
3167        return NULL;
3168
3169    if (separator == NULL) {
3170	Py_UNICODE blank = ' ';
3171	sep = &blank;
3172	seplen = 1;
3173    }
3174    else {
3175	separator = PyUnicode_FromObject(separator);
3176	if (separator == NULL)
3177	    goto onError;
3178	sep = PyUnicode_AS_UNICODE(separator);
3179	seplen = PyUnicode_GET_SIZE(separator);
3180    }
3181
3182    res = _PyUnicode_New(sz);
3183    if (res == NULL)
3184	goto onError;
3185    p = PyUnicode_AS_UNICODE(res);
3186    reslen = 0;
3187
3188    for (i = 0; ; ++i) {
3189	int itemlen;
3190	PyObject *item = PyIter_Next(it);
3191	if (item == NULL) {
3192	    if (PyErr_Occurred())
3193		goto onError;
3194	    break;
3195	}
3196	if (!PyUnicode_Check(item)) {
3197	    PyObject *v;
3198	    if (!PyString_Check(item)) {
3199		PyErr_Format(PyExc_TypeError,
3200			     "sequence item %i: expected string or Unicode,"
3201			     " %.80s found",
3202			     i, item->ob_type->tp_name);
3203		Py_DECREF(item);
3204		goto onError;
3205	    }
3206	    v = PyUnicode_FromObject(item);
3207	    Py_DECREF(item);
3208	    item = v;
3209	    if (item == NULL)
3210		goto onError;
3211	}
3212	itemlen = PyUnicode_GET_SIZE(item);
3213	while (reslen + itemlen + seplen >= sz) {
3214	    if (_PyUnicode_Resize(&res, sz*2)) {
3215		Py_DECREF(item);
3216		goto onError;
3217	    }
3218	    sz *= 2;
3219	    p = PyUnicode_AS_UNICODE(res) + reslen;
3220	}
3221	if (i > 0) {
3222	    Py_UNICODE_COPY(p, sep, seplen);
3223	    p += seplen;
3224	    reslen += seplen;
3225	}
3226	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3227	p += itemlen;
3228	reslen += itemlen;
3229	Py_DECREF(item);
3230    }
3231    if (_PyUnicode_Resize(&res, reslen))
3232	goto onError;
3233
3234    Py_XDECREF(separator);
3235    Py_DECREF(it);
3236    return (PyObject *)res;
3237
3238 onError:
3239    Py_XDECREF(separator);
3240    Py_XDECREF(res);
3241    Py_DECREF(it);
3242    return NULL;
3243}
3244
3245static
3246PyUnicodeObject *pad(PyUnicodeObject *self,
3247		     int left,
3248		     int right,
3249		     Py_UNICODE fill)
3250{
3251    PyUnicodeObject *u;
3252
3253    if (left < 0)
3254        left = 0;
3255    if (right < 0)
3256        right = 0;
3257
3258    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3259        Py_INCREF(self);
3260        return self;
3261    }
3262
3263    u = _PyUnicode_New(left + self->length + right);
3264    if (u) {
3265        if (left)
3266            Py_UNICODE_FILL(u->str, fill, left);
3267        Py_UNICODE_COPY(u->str + left, self->str, self->length);
3268        if (right)
3269            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3270    }
3271
3272    return u;
3273}
3274
/* Append the slice data[left:right] as a new Unicode object to list.

   Relies on the expanding function to provide a PyObject *str
   variable, a PyObject *list variable and an onError label; control
   jumps to onError if the slice cannot be created or appended.  The
   temporary reference held in str is released in either case. */
#define SPLIT_APPEND(data, left, right)					\
    do {								\
	str = PyUnicode_FromUnicode((data) + (left), (right) - (left));	\
	if (str == NULL)						\
	    goto onError;						\
	if (PyList_Append(list, str)) {					\
	    Py_DECREF(str);						\
	    goto onError;						\
	}								\
	Py_DECREF(str);							\
    } while (0)
3285
3286static
3287PyObject *split_whitespace(PyUnicodeObject *self,
3288			   PyObject *list,
3289			   int maxcount)
3290{
3291    register int i;
3292    register int j;
3293    int len = self->length;
3294    PyObject *str;
3295
3296    for (i = j = 0; i < len; ) {
3297	/* find a token */
3298	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3299	    i++;
3300	j = i;
3301	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3302	    i++;
3303	if (j < i) {
3304	    if (maxcount-- <= 0)
3305		break;
3306	    SPLIT_APPEND(self->str, j, i);
3307	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3308		i++;
3309	    j = i;
3310	}
3311    }
3312    if (j < len) {
3313	SPLIT_APPEND(self->str, j, len);
3314    }
3315    return list;
3316
3317 onError:
3318    Py_DECREF(list);
3319    return NULL;
3320}
3321
3322PyObject *PyUnicode_Splitlines(PyObject *string,
3323			       int keepends)
3324{
3325    register int i;
3326    register int j;
3327    int len;
3328    PyObject *list;
3329    PyObject *str;
3330    Py_UNICODE *data;
3331
3332    string = PyUnicode_FromObject(string);
3333    if (string == NULL)
3334	return NULL;
3335    data = PyUnicode_AS_UNICODE(string);
3336    len = PyUnicode_GET_SIZE(string);
3337
3338    list = PyList_New(0);
3339    if (!list)
3340        goto onError;
3341
3342    for (i = j = 0; i < len; ) {
3343	int eol;
3344
3345	/* Find a line and append it */
3346	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3347	    i++;
3348
3349	/* Skip the line break reading CRLF as one line break */
3350	eol = i;
3351	if (i < len) {
3352	    if (data[i] == '\r' && i + 1 < len &&
3353		data[i+1] == '\n')
3354		i += 2;
3355	    else
3356		i++;
3357	    if (keepends)
3358		eol = i;
3359	}
3360	SPLIT_APPEND(data, j, eol);
3361	j = i;
3362    }
3363    if (j < len) {
3364	SPLIT_APPEND(data, j, len);
3365    }
3366
3367    Py_DECREF(string);
3368    return list;
3369
3370 onError:
3371    Py_DECREF(list);
3372    Py_DECREF(string);
3373    return NULL;
3374}
3375
3376static
3377PyObject *split_char(PyUnicodeObject *self,
3378		     PyObject *list,
3379		     Py_UNICODE ch,
3380		     int maxcount)
3381{
3382    register int i;
3383    register int j;
3384    int len = self->length;
3385    PyObject *str;
3386
3387    for (i = j = 0; i < len; ) {
3388	if (self->str[i] == ch) {
3389	    if (maxcount-- <= 0)
3390		break;
3391	    SPLIT_APPEND(self->str, j, i);
3392	    i = j = i + 1;
3393	} else
3394	    i++;
3395    }
3396    if (j <= len) {
3397	SPLIT_APPEND(self->str, j, len);
3398    }
3399    return list;
3400
3401 onError:
3402    Py_DECREF(list);
3403    return NULL;
3404}
3405
3406static
3407PyObject *split_substring(PyUnicodeObject *self,
3408			  PyObject *list,
3409			  PyUnicodeObject *substring,
3410			  int maxcount)
3411{
3412    register int i;
3413    register int j;
3414    int len = self->length;
3415    int sublen = substring->length;
3416    PyObject *str;
3417
3418    for (i = j = 0; i <= len - sublen; ) {
3419	if (Py_UNICODE_MATCH(self, i, substring)) {
3420	    if (maxcount-- <= 0)
3421		break;
3422	    SPLIT_APPEND(self->str, j, i);
3423	    i = j = i + sublen;
3424	} else
3425	    i++;
3426    }
3427    if (j <= len) {
3428	SPLIT_APPEND(self->str, j, len);
3429    }
3430    return list;
3431
3432 onError:
3433    Py_DECREF(list);
3434    return NULL;
3435}
3436
3437#undef SPLIT_APPEND
3438
3439static
3440PyObject *split(PyUnicodeObject *self,
3441		PyUnicodeObject *substring,
3442		int maxcount)
3443{
3444    PyObject *list;
3445
3446    if (maxcount < 0)
3447        maxcount = INT_MAX;
3448
3449    list = PyList_New(0);
3450    if (!list)
3451        return NULL;
3452
3453    if (substring == NULL)
3454	return split_whitespace(self,list,maxcount);
3455
3456    else if (substring->length == 1)
3457	return split_char(self,list,substring->str[0],maxcount);
3458
3459    else if (substring->length == 0) {
3460	Py_DECREF(list);
3461	PyErr_SetString(PyExc_ValueError, "empty separator");
3462	return NULL;
3463    }
3464    else
3465	return split_substring(self,list,substring,maxcount);
3466}
3467
3468static
3469PyObject *strip(PyUnicodeObject *self,
3470		int left,
3471		int right)
3472{
3473    Py_UNICODE *p = self->str;
3474    int start = 0;
3475    int end = self->length;
3476
3477    if (left)
3478        while (start < end && Py_UNICODE_ISSPACE(p[start]))
3479            start++;
3480
3481    if (right)
3482        while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3483            end--;
3484
3485    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
3486        /* couldn't strip anything off, return original string */
3487        Py_INCREF(self);
3488        return (PyObject*) self;
3489    }
3490
3491    return (PyObject*) PyUnicode_FromUnicode(
3492        self->str + start,
3493        end - start
3494        );
3495}
3496
3497static
3498PyObject *replace(PyUnicodeObject *self,
3499		  PyUnicodeObject *str1,
3500		  PyUnicodeObject *str2,
3501		  int maxcount)
3502{
3503    PyUnicodeObject *u;
3504
3505    if (maxcount < 0)
3506	maxcount = INT_MAX;
3507
3508    if (str1->length == 1 && str2->length == 1) {
3509        int i;
3510
3511        /* replace characters */
3512        if (!findchar(self->str, self->length, str1->str[0]) &&
3513            PyUnicode_CheckExact(self)) {
3514            /* nothing to replace, return original string */
3515            Py_INCREF(self);
3516            u = self;
3517        } else {
3518	    Py_UNICODE u1 = str1->str[0];
3519	    Py_UNICODE u2 = str2->str[0];
3520
3521            u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3522                NULL,
3523                self->length
3524                );
3525            if (u != NULL) {
3526		Py_UNICODE_COPY(u->str, self->str,
3527				self->length);
3528                for (i = 0; i < u->length; i++)
3529                    if (u->str[i] == u1) {
3530                        if (--maxcount < 0)
3531                            break;
3532                        u->str[i] = u2;
3533                    }
3534        }
3535        }
3536
3537    } else {
3538        int n, i;
3539        Py_UNICODE *p;
3540
3541        /* replace strings */
3542        n = count(self, 0, self->length, str1);
3543        if (n > maxcount)
3544            n = maxcount;
3545        if (n == 0 && PyUnicode_CheckExact(self)) {
3546            /* nothing to replace, return original string */
3547            Py_INCREF(self);
3548            u = self;
3549        } else {
3550            u = _PyUnicode_New(
3551                self->length + n * (str2->length - str1->length));
3552            if (u) {
3553                i = 0;
3554                p = u->str;
3555                while (i <= self->length - str1->length)
3556                    if (Py_UNICODE_MATCH(self, i, str1)) {
3557                        /* replace string segment */
3558                        Py_UNICODE_COPY(p, str2->str, str2->length);
3559                        p += str2->length;
3560                        i += str1->length;
3561                        if (--n <= 0) {
3562                            /* copy remaining part */
3563                            Py_UNICODE_COPY(p, self->str+i, self->length-i);
3564                            break;
3565                        }
3566                    } else
3567                        *p++ = self->str[i++];
3568            }
3569        }
3570    }
3571
3572    return (PyObject *) u;
3573}
3574
3575/* --- Unicode Object Methods --------------------------------------------- */
3576
3577static char title__doc__[] =
3578"S.title() -> unicode\n\
3579\n\
3580Return a titlecased version of S, i.e. words start with title case\n\
3581characters, all remaining cased characters have lower case.";
3582
3583static PyObject*
3584unicode_title(PyUnicodeObject *self)
3585{
3586    return fixup(self, fixtitle);
3587}
3588
3589static char capitalize__doc__[] =
3590"S.capitalize() -> unicode\n\
3591\n\
3592Return a capitalized version of S, i.e. make the first character\n\
3593have upper case.";
3594
3595static PyObject*
3596unicode_capitalize(PyUnicodeObject *self)
3597{
3598    return fixup(self, fixcapitalize);
3599}
3600
#if 0
static char capwords__doc__[] =
    "S.capwords() -> unicode\n"
    "\n"
    "Apply .capitalize() to all words in S and return the result with\n"
    "normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split self with a NULL
   separator, replace every list item by its fixcapitalize'd version and
   join the items again through PyUnicode_Join(NULL, ...).  Returns NULL if
   any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3638
3639static char center__doc__[] =
3640"S.center(width) -> unicode\n\
3641\n\
3642Return S centered in a Unicode string of length width. Padding is done\n\
3643using spaces.";
3644
3645static PyObject *
3646unicode_center(PyUnicodeObject *self, PyObject *args)
3647{
3648    int marg, left;
3649    int width;
3650
3651    if (!PyArg_ParseTuple(args, "i:center", &width))
3652        return NULL;
3653
3654    if (self->length >= width && PyUnicode_CheckExact(self)) {
3655        Py_INCREF(self);
3656        return (PyObject*) self;
3657    }
3658
3659    marg = width - self->length;
3660    left = marg / 2 + (marg & width & 1);
3661
3662    return (PyObject*) pad(self, left, marg - left, ' ');
3663}
3664
3665#if 0
3666
3667/* This code should go into some future Unicode collation support
3668   module. The basic comparison should compare ordinals on a naive
3669   basis (this is what Java does and thus JPython too). */
3670
3671/* speedy UTF-16 code point order comparison */
3672/* gleaned from: */
3673/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3674
3675static short utf16Fixup[32] =
3676{
3677    0, 0, 0, 0, 0, 0, 0, 0,
3678    0, 0, 0, 0, 0, 0, 0, 0,
3679    0, 0, 0, 0, 0, 0, 0, 0,
3680    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3681};
3682
3683static int
3684unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3685{
3686    int len1, len2;
3687
3688    Py_UNICODE *s1 = str1->str;
3689    Py_UNICODE *s2 = str2->str;
3690
3691    len1 = str1->length;
3692    len2 = str2->length;
3693
3694    while (len1 > 0 && len2 > 0) {
3695        Py_UNICODE c1, c2;
3696
3697        c1 = *s1++;
3698        c2 = *s2++;
3699
3700	if (c1 > (1<<11) * 26)
3701	    c1 += utf16Fixup[c1>>11];
3702	if (c2 > (1<<11) * 26)
3703            c2 += utf16Fixup[c2>>11];
3704        /* now c1 and c2 are in UTF-32-compatible order */
3705
3706        if (c1 != c2)
3707            return (c1 < c2) ? -1 : 1;
3708
3709        len1--; len2--;
3710    }
3711
3712    return (len1 < len2) ? -1 : (len1 != len2);
3713}
3714
3715#else
3716
3717static int
3718unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3719{
3720    register int len1, len2;
3721
3722    Py_UNICODE *s1 = str1->str;
3723    Py_UNICODE *s2 = str2->str;
3724
3725    len1 = str1->length;
3726    len2 = str2->length;
3727
3728    while (len1 > 0 && len2 > 0) {
3729        Py_UNICODE c1, c2;
3730
3731        c1 = *s1++;
3732        c2 = *s2++;
3733
3734        if (c1 != c2)
3735            return (c1 < c2) ? -1 : 1;
3736
3737        len1--; len2--;
3738    }
3739
3740    return (len1 < len2) ? -1 : (len1 != len2);
3741}
3742
3743#endif
3744
3745int PyUnicode_Compare(PyObject *left,
3746		      PyObject *right)
3747{
3748    PyUnicodeObject *u = NULL, *v = NULL;
3749    int result;
3750
3751    /* Coerce the two arguments */
3752    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3753    if (u == NULL)
3754	goto onError;
3755    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3756    if (v == NULL)
3757	goto onError;
3758
3759    /* Shortcut for empty or interned objects */
3760    if (v == u) {
3761	Py_DECREF(u);
3762	Py_DECREF(v);
3763	return 0;
3764    }
3765
3766    result = unicode_compare(u, v);
3767
3768    Py_DECREF(u);
3769    Py_DECREF(v);
3770    return result;
3771
3772onError:
3773    Py_XDECREF(u);
3774    Py_XDECREF(v);
3775    return -1;
3776}
3777
3778int PyUnicode_Contains(PyObject *container,
3779		       PyObject *element)
3780{
3781    PyUnicodeObject *u = NULL, *v = NULL;
3782    int result;
3783    register const Py_UNICODE *p, *e;
3784    register Py_UNICODE ch;
3785
3786    /* Coerce the two arguments */
3787    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3788    if (v == NULL) {
3789	PyErr_SetString(PyExc_TypeError,
3790	    "'in <string>' requires character as left operand");
3791	goto onError;
3792    }
3793    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3794    if (u == NULL) {
3795	Py_DECREF(v);
3796	goto onError;
3797    }
3798
3799    /* Check v in u */
3800    if (PyUnicode_GET_SIZE(v) != 1) {
3801	PyErr_SetString(PyExc_TypeError,
3802	    "'in <string>' requires character as left operand");
3803	goto onError;
3804    }
3805    ch = *PyUnicode_AS_UNICODE(v);
3806    p = PyUnicode_AS_UNICODE(u);
3807    e = p + PyUnicode_GET_SIZE(u);
3808    result = 0;
3809    while (p < e) {
3810	if (*p++ == ch) {
3811	    result = 1;
3812	    break;
3813	}
3814    }
3815
3816    Py_DECREF(u);
3817    Py_DECREF(v);
3818    return result;
3819
3820onError:
3821    Py_XDECREF(u);
3822    Py_XDECREF(v);
3823    return -1;
3824}
3825
3826/* Concat to string or Unicode object giving a new Unicode object. */
3827
3828PyObject *PyUnicode_Concat(PyObject *left,
3829			   PyObject *right)
3830{
3831    PyUnicodeObject *u = NULL, *v = NULL, *w;
3832
3833    /* Coerce the two arguments */
3834    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3835    if (u == NULL)
3836	goto onError;
3837    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3838    if (v == NULL)
3839	goto onError;
3840
3841    /* Shortcuts */
3842    if (v == unicode_empty) {
3843	Py_DECREF(v);
3844	return (PyObject *)u;
3845    }
3846    if (u == unicode_empty) {
3847	Py_DECREF(u);
3848	return (PyObject *)v;
3849    }
3850
3851    /* Concat the two Unicode strings */
3852    w = _PyUnicode_New(u->length + v->length);
3853    if (w == NULL)
3854	goto onError;
3855    Py_UNICODE_COPY(w->str, u->str, u->length);
3856    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3857
3858    Py_DECREF(u);
3859    Py_DECREF(v);
3860    return (PyObject *)w;
3861
3862onError:
3863    Py_XDECREF(u);
3864    Py_XDECREF(v);
3865    return NULL;
3866}
3867
3868static char count__doc__[] =
3869"S.count(sub[, start[, end]]) -> int\n\
3870\n\
3871Return the number of occurrences of substring sub in Unicode string\n\
3872S[start:end].  Optional arguments start and end are\n\
3873interpreted as in slice notation.";
3874
3875static PyObject *
3876unicode_count(PyUnicodeObject *self, PyObject *args)
3877{
3878    PyUnicodeObject *substring;
3879    int start = 0;
3880    int end = INT_MAX;
3881    PyObject *result;
3882
3883    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3884		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3885        return NULL;
3886
3887    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3888						(PyObject *)substring);
3889    if (substring == NULL)
3890	return NULL;
3891
3892    if (start < 0)
3893        start += self->length;
3894    if (start < 0)
3895        start = 0;
3896    if (end > self->length)
3897        end = self->length;
3898    if (end < 0)
3899        end += self->length;
3900    if (end < 0)
3901        end = 0;
3902
3903    result = PyInt_FromLong((long) count(self, start, end, substring));
3904
3905    Py_DECREF(substring);
3906    return result;
3907}
3908
3909static char encode__doc__[] =
3910"S.encode([encoding[,errors]]) -> string\n\
3911\n\
3912Return an encoded string version of S. Default encoding is the current\n\
3913default string encoding. errors may be given to set a different error\n\
3914handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3915a ValueError. Other possible values are 'ignore' and 'replace'.";
3916
3917static PyObject *
3918unicode_encode(PyUnicodeObject *self, PyObject *args)
3919{
3920    char *encoding = NULL;
3921    char *errors = NULL;
3922    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3923        return NULL;
3924    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3925}
3926
3927static char expandtabs__doc__[] =
3928"S.expandtabs([tabsize]) -> unicode\n\
3929\n\
3930Return a copy of S where all tab characters are expanded using spaces.\n\
3931If tabsize is not given, a tab size of 8 characters is assumed.";
3932
3933static PyObject*
3934unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3935{
3936    Py_UNICODE *e;
3937    Py_UNICODE *p;
3938    Py_UNICODE *q;
3939    int i, j;
3940    PyUnicodeObject *u;
3941    int tabsize = 8;
3942
3943    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3944	return NULL;
3945
3946    /* First pass: determine size of output string */
3947    i = j = 0;
3948    e = self->str + self->length;
3949    for (p = self->str; p < e; p++)
3950        if (*p == '\t') {
3951	    if (tabsize > 0)
3952		j += tabsize - (j % tabsize);
3953	}
3954        else {
3955            j++;
3956            if (*p == '\n' || *p == '\r') {
3957                i += j;
3958                j = 0;
3959            }
3960        }
3961
3962    /* Second pass: create output string and fill it */
3963    u = _PyUnicode_New(i + j);
3964    if (!u)
3965        return NULL;
3966
3967    j = 0;
3968    q = u->str;
3969
3970    for (p = self->str; p < e; p++)
3971        if (*p == '\t') {
3972	    if (tabsize > 0) {
3973		i = tabsize - (j % tabsize);
3974		j += i;
3975		while (i--)
3976		    *q++ = ' ';
3977	    }
3978	}
3979	else {
3980            j++;
3981	    *q++ = *p;
3982            if (*p == '\n' || *p == '\r')
3983                j = 0;
3984        }
3985
3986    return (PyObject*) u;
3987}
3988
3989static char find__doc__[] =
3990"S.find(sub [,start [,end]]) -> int\n\
3991\n\
3992Return the lowest index in S where substring sub is found,\n\
3993such that sub is contained within s[start,end].  Optional\n\
3994arguments start and end are interpreted as in slice notation.\n\
3995\n\
3996Return -1 on failure.";
3997
3998static PyObject *
3999unicode_find(PyUnicodeObject *self, PyObject *args)
4000{
4001    PyUnicodeObject *substring;
4002    int start = 0;
4003    int end = INT_MAX;
4004    PyObject *result;
4005
4006    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4007		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4008        return NULL;
4009    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4010						(PyObject *)substring);
4011    if (substring == NULL)
4012	return NULL;
4013
4014    result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4015
4016    Py_DECREF(substring);
4017    return result;
4018}
4019
4020static PyObject *
4021unicode_getitem(PyUnicodeObject *self, int index)
4022{
4023    if (index < 0 || index >= self->length) {
4024        PyErr_SetString(PyExc_IndexError, "string index out of range");
4025        return NULL;
4026    }
4027
4028    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4029}
4030
4031static long
4032unicode_hash(PyUnicodeObject *self)
4033{
4034    /* Since Unicode objects compare equal to their ASCII string
4035       counterparts, they should use the individual character values
4036       as basis for their hash value.  This is needed to assure that
4037       strings and Unicode objects behave in the same way as
4038       dictionary keys. */
4039
4040    register int len;
4041    register Py_UNICODE *p;
4042    register long x;
4043
4044    if (self->hash != -1)
4045	return self->hash;
4046    len = PyUnicode_GET_SIZE(self);
4047    p = PyUnicode_AS_UNICODE(self);
4048    x = *p << 7;
4049    while (--len >= 0)
4050	x = (1000003*x) ^ *p++;
4051    x ^= PyUnicode_GET_SIZE(self);
4052    if (x == -1)
4053	x = -2;
4054    self->hash = x;
4055    return x;
4056}
4057
4058static char index__doc__[] =
4059"S.index(sub [,start [,end]]) -> int\n\
4060\n\
4061Like S.find() but raise ValueError when the substring is not found.";
4062
4063static PyObject *
4064unicode_index(PyUnicodeObject *self, PyObject *args)
4065{
4066    int result;
4067    PyUnicodeObject *substring;
4068    int start = 0;
4069    int end = INT_MAX;
4070
4071    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4072		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4073        return NULL;
4074
4075    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4076						(PyObject *)substring);
4077    if (substring == NULL)
4078	return NULL;
4079
4080    result = findstring(self, substring, start, end, 1);
4081
4082    Py_DECREF(substring);
4083    if (result < 0) {
4084        PyErr_SetString(PyExc_ValueError, "substring not found");
4085        return NULL;
4086    }
4087    return PyInt_FromLong(result);
4088}
4089
4090static char islower__doc__[] =
4091"S.islower() -> int\n\
4092\n\
4093Return 1 if  all cased characters in S are lowercase and there is\n\
4094at least one cased character in S, 0 otherwise.";
4095
4096static PyObject*
4097unicode_islower(PyUnicodeObject *self)
4098{
4099    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4100    register const Py_UNICODE *e;
4101    int cased;
4102
4103    /* Shortcut for single character strings */
4104    if (PyUnicode_GET_SIZE(self) == 1)
4105	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4106
4107    /* Special case for empty strings */
4108    if (PyString_GET_SIZE(self) == 0)
4109	return PyInt_FromLong(0);
4110
4111    e = p + PyUnicode_GET_SIZE(self);
4112    cased = 0;
4113    for (; p < e; p++) {
4114	register const Py_UNICODE ch = *p;
4115
4116	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4117	    return PyInt_FromLong(0);
4118	else if (!cased && Py_UNICODE_ISLOWER(ch))
4119	    cased = 1;
4120    }
4121    return PyInt_FromLong(cased);
4122}
4123
4124static char isupper__doc__[] =
4125"S.isupper() -> int\n\
4126\n\
4127Return 1 if  all cased characters in S are uppercase and there is\n\
4128at least one cased character in S, 0 otherwise.";
4129
4130static PyObject*
4131unicode_isupper(PyUnicodeObject *self)
4132{
4133    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4134    register const Py_UNICODE *e;
4135    int cased;
4136
4137    /* Shortcut for single character strings */
4138    if (PyUnicode_GET_SIZE(self) == 1)
4139	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4140
4141    /* Special case for empty strings */
4142    if (PyString_GET_SIZE(self) == 0)
4143	return PyInt_FromLong(0);
4144
4145    e = p + PyUnicode_GET_SIZE(self);
4146    cased = 0;
4147    for (; p < e; p++) {
4148	register const Py_UNICODE ch = *p;
4149
4150	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4151	    return PyInt_FromLong(0);
4152	else if (!cased && Py_UNICODE_ISUPPER(ch))
4153	    cased = 1;
4154    }
4155    return PyInt_FromLong(cased);
4156}
4157
4158static char istitle__doc__[] =
4159"S.istitle() -> int\n\
4160\n\
4161Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4162may only follow uncased characters and lowercase characters only cased\n\
4163ones. Return 0 otherwise.";
4164
4165static PyObject*
4166unicode_istitle(PyUnicodeObject *self)
4167{
4168    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4169    register const Py_UNICODE *e;
4170    int cased, previous_is_cased;
4171
4172    /* Shortcut for single character strings */
4173    if (PyUnicode_GET_SIZE(self) == 1)
4174	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4175			      (Py_UNICODE_ISUPPER(*p) != 0));
4176
4177    /* Special case for empty strings */
4178    if (PyString_GET_SIZE(self) == 0)
4179	return PyInt_FromLong(0);
4180
4181    e = p + PyUnicode_GET_SIZE(self);
4182    cased = 0;
4183    previous_is_cased = 0;
4184    for (; p < e; p++) {
4185	register const Py_UNICODE ch = *p;
4186
4187	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4188	    if (previous_is_cased)
4189		return PyInt_FromLong(0);
4190	    previous_is_cased = 1;
4191	    cased = 1;
4192	}
4193	else if (Py_UNICODE_ISLOWER(ch)) {
4194	    if (!previous_is_cased)
4195		return PyInt_FromLong(0);
4196	    previous_is_cased = 1;
4197	    cased = 1;
4198	}
4199	else
4200	    previous_is_cased = 0;
4201    }
4202    return PyInt_FromLong(cased);
4203}
4204
4205static char isspace__doc__[] =
4206"S.isspace() -> int\n\
4207\n\
4208Return 1 if there are only whitespace characters in S,\n\
42090 otherwise.";
4210
4211static PyObject*
4212unicode_isspace(PyUnicodeObject *self)
4213{
4214    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4215    register const Py_UNICODE *e;
4216
4217    /* Shortcut for single character strings */
4218    if (PyUnicode_GET_SIZE(self) == 1 &&
4219	Py_UNICODE_ISSPACE(*p))
4220	return PyInt_FromLong(1);
4221
4222    /* Special case for empty strings */
4223    if (PyString_GET_SIZE(self) == 0)
4224	return PyInt_FromLong(0);
4225
4226    e = p + PyUnicode_GET_SIZE(self);
4227    for (; p < e; p++) {
4228	if (!Py_UNICODE_ISSPACE(*p))
4229	    return PyInt_FromLong(0);
4230    }
4231    return PyInt_FromLong(1);
4232}
4233
4234static char isalpha__doc__[] =
4235"S.isalpha() -> int\n\
4236\n\
4237Return 1 if  all characters in S are alphabetic\n\
4238and there is at least one character in S, 0 otherwise.";
4239
4240static PyObject*
4241unicode_isalpha(PyUnicodeObject *self)
4242{
4243    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4244    register const Py_UNICODE *e;
4245
4246    /* Shortcut for single character strings */
4247    if (PyUnicode_GET_SIZE(self) == 1 &&
4248	Py_UNICODE_ISALPHA(*p))
4249	return PyInt_FromLong(1);
4250
4251    /* Special case for empty strings */
4252    if (PyString_GET_SIZE(self) == 0)
4253	return PyInt_FromLong(0);
4254
4255    e = p + PyUnicode_GET_SIZE(self);
4256    for (; p < e; p++) {
4257	if (!Py_UNICODE_ISALPHA(*p))
4258	    return PyInt_FromLong(0);
4259    }
4260    return PyInt_FromLong(1);
4261}
4262
4263static char isalnum__doc__[] =
4264"S.isalnum() -> int\n\
4265\n\
4266Return 1 if  all characters in S are alphanumeric\n\
4267and there is at least one character in S, 0 otherwise.";
4268
4269static PyObject*
4270unicode_isalnum(PyUnicodeObject *self)
4271{
4272    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4273    register const Py_UNICODE *e;
4274
4275    /* Shortcut for single character strings */
4276    if (PyUnicode_GET_SIZE(self) == 1 &&
4277	Py_UNICODE_ISALNUM(*p))
4278	return PyInt_FromLong(1);
4279
4280    /* Special case for empty strings */
4281    if (PyString_GET_SIZE(self) == 0)
4282	return PyInt_FromLong(0);
4283
4284    e = p + PyUnicode_GET_SIZE(self);
4285    for (; p < e; p++) {
4286	if (!Py_UNICODE_ISALNUM(*p))
4287	    return PyInt_FromLong(0);
4288    }
4289    return PyInt_FromLong(1);
4290}
4291
4292static char isdecimal__doc__[] =
4293"S.isdecimal() -> int\n\
4294\n\
4295Return 1 if there are only decimal characters in S,\n\
42960 otherwise.";
4297
4298static PyObject*
4299unicode_isdecimal(PyUnicodeObject *self)
4300{
4301    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4302    register const Py_UNICODE *e;
4303
4304    /* Shortcut for single character strings */
4305    if (PyUnicode_GET_SIZE(self) == 1 &&
4306	Py_UNICODE_ISDECIMAL(*p))
4307	return PyInt_FromLong(1);
4308
4309    /* Special case for empty strings */
4310    if (PyString_GET_SIZE(self) == 0)
4311	return PyInt_FromLong(0);
4312
4313    e = p + PyUnicode_GET_SIZE(self);
4314    for (; p < e; p++) {
4315	if (!Py_UNICODE_ISDECIMAL(*p))
4316	    return PyInt_FromLong(0);
4317    }
4318    return PyInt_FromLong(1);
4319}
4320
4321static char isdigit__doc__[] =
4322"S.isdigit() -> int\n\
4323\n\
4324Return 1 if there are only digit characters in S,\n\
43250 otherwise.";
4326
4327static PyObject*
4328unicode_isdigit(PyUnicodeObject *self)
4329{
4330    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4331    register const Py_UNICODE *e;
4332
4333    /* Shortcut for single character strings */
4334    if (PyUnicode_GET_SIZE(self) == 1 &&
4335	Py_UNICODE_ISDIGIT(*p))
4336	return PyInt_FromLong(1);
4337
4338    /* Special case for empty strings */
4339    if (PyString_GET_SIZE(self) == 0)
4340	return PyInt_FromLong(0);
4341
4342    e = p + PyUnicode_GET_SIZE(self);
4343    for (; p < e; p++) {
4344	if (!Py_UNICODE_ISDIGIT(*p))
4345	    return PyInt_FromLong(0);
4346    }
4347    return PyInt_FromLong(1);
4348}
4349
4350static char isnumeric__doc__[] =
4351"S.isnumeric() -> int\n\
4352\n\
4353Return 1 if there are only numeric characters in S,\n\
43540 otherwise.";
4355
4356static PyObject*
4357unicode_isnumeric(PyUnicodeObject *self)
4358{
4359    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4360    register const Py_UNICODE *e;
4361
4362    /* Shortcut for single character strings */
4363    if (PyUnicode_GET_SIZE(self) == 1 &&
4364	Py_UNICODE_ISNUMERIC(*p))
4365	return PyInt_FromLong(1);
4366
4367    /* Special case for empty strings */
4368    if (PyString_GET_SIZE(self) == 0)
4369	return PyInt_FromLong(0);
4370
4371    e = p + PyUnicode_GET_SIZE(self);
4372    for (; p < e; p++) {
4373	if (!Py_UNICODE_ISNUMERIC(*p))
4374	    return PyInt_FromLong(0);
4375    }
4376    return PyInt_FromLong(1);
4377}
4378
4379static char join__doc__[] =
4380"S.join(sequence) -> unicode\n\
4381\n\
4382Return a string which is the concatenation of the strings in the\n\
4383sequence.  The separator between elements is S.";
4384
4385static PyObject*
4386unicode_join(PyObject *self, PyObject *data)
4387{
4388    return PyUnicode_Join(self, data);
4389}
4390
4391static int
4392unicode_length(PyUnicodeObject *self)
4393{
4394    return self->length;
4395}
4396
4397static char ljust__doc__[] =
4398"S.ljust(width) -> unicode\n\
4399\n\
4400Return S left justified in a Unicode string of length width. Padding is\n\
4401done using spaces.";
4402
4403static PyObject *
4404unicode_ljust(PyUnicodeObject *self, PyObject *args)
4405{
4406    int width;
4407    if (!PyArg_ParseTuple(args, "i:ljust", &width))
4408        return NULL;
4409
4410    if (self->length >= width && PyUnicode_CheckExact(self)) {
4411        Py_INCREF(self);
4412        return (PyObject*) self;
4413    }
4414
4415    return (PyObject*) pad(self, 0, width - self->length, ' ');
4416}
4417
4418static char lower__doc__[] =
4419"S.lower() -> unicode\n\
4420\n\
4421Return a copy of the string S converted to lowercase.";
4422
4423static PyObject*
4424unicode_lower(PyUnicodeObject *self)
4425{
4426    return fixup(self, fixlower);
4427}
4428
4429static char lstrip__doc__[] =
4430"S.lstrip() -> unicode\n\
4431\n\
4432Return a copy of the string S with leading whitespace removed.";
4433
4434static PyObject *
4435unicode_lstrip(PyUnicodeObject *self)
4436{
4437    return strip(self, 1, 0);
4438}
4439
4440static PyObject*
4441unicode_repeat(PyUnicodeObject *str, int len)
4442{
4443    PyUnicodeObject *u;
4444    Py_UNICODE *p;
4445    int nchars;
4446    size_t nbytes;
4447
4448    if (len < 0)
4449        len = 0;
4450
4451    if (len == 1 && PyUnicode_CheckExact(str)) {
4452        /* no repeat, return original string */
4453        Py_INCREF(str);
4454        return (PyObject*) str;
4455    }
4456
4457    /* ensure # of chars needed doesn't overflow int and # of bytes
4458     * needed doesn't overflow size_t
4459     */
4460    nchars = len * str->length;
4461    if (len && nchars / len != str->length) {
4462        PyErr_SetString(PyExc_OverflowError,
4463                        "repeated string is too long");
4464        return NULL;
4465    }
4466    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4467    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4468        PyErr_SetString(PyExc_OverflowError,
4469                        "repeated string is too long");
4470        return NULL;
4471    }
4472    u = _PyUnicode_New(nchars);
4473    if (!u)
4474        return NULL;
4475
4476    p = u->str;
4477
4478    while (len-- > 0) {
4479        Py_UNICODE_COPY(p, str->str, str->length);
4480        p += str->length;
4481    }
4482
4483    return (PyObject*) u;
4484}
4485
4486PyObject *PyUnicode_Replace(PyObject *obj,
4487			    PyObject *subobj,
4488			    PyObject *replobj,
4489			    int maxcount)
4490{
4491    PyObject *self;
4492    PyObject *str1;
4493    PyObject *str2;
4494    PyObject *result;
4495
4496    self = PyUnicode_FromObject(obj);
4497    if (self == NULL)
4498	return NULL;
4499    str1 = PyUnicode_FromObject(subobj);
4500    if (str1 == NULL) {
4501	Py_DECREF(self);
4502	return NULL;
4503    }
4504    str2 = PyUnicode_FromObject(replobj);
4505    if (str2 == NULL) {
4506	Py_DECREF(self);
4507	Py_DECREF(str1);
4508	return NULL;
4509    }
4510    result = replace((PyUnicodeObject *)self,
4511		     (PyUnicodeObject *)str1,
4512		     (PyUnicodeObject *)str2,
4513		     maxcount);
4514    Py_DECREF(self);
4515    Py_DECREF(str1);
4516    Py_DECREF(str2);
4517    return result;
4518}
4519
4520static char replace__doc__[] =
4521"S.replace (old, new[, maxsplit]) -> unicode\n\
4522\n\
4523Return a copy of S with all occurrences of substring\n\
4524old replaced by new.  If the optional argument maxsplit is\n\
4525given, only the first maxsplit occurrences are replaced.";
4526
4527static PyObject*
4528unicode_replace(PyUnicodeObject *self, PyObject *args)
4529{
4530    PyUnicodeObject *str1;
4531    PyUnicodeObject *str2;
4532    int maxcount = -1;
4533    PyObject *result;
4534
4535    if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4536        return NULL;
4537    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4538    if (str1 == NULL)
4539	return NULL;
4540    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4541    if (str2 == NULL)
4542	return NULL;
4543
4544    result = replace(self, str1, str2, maxcount);
4545
4546    Py_DECREF(str1);
4547    Py_DECREF(str2);
4548    return result;
4549}
4550
4551static
4552PyObject *unicode_repr(PyObject *unicode)
4553{
4554    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4555				PyUnicode_GET_SIZE(unicode),
4556				1);
4557}
4558
4559static char rfind__doc__[] =
4560"S.rfind(sub [,start [,end]]) -> int\n\
4561\n\
4562Return the highest index in S where substring sub is found,\n\
4563such that sub is contained within s[start,end].  Optional\n\
4564arguments start and end are interpreted as in slice notation.\n\
4565\n\
4566Return -1 on failure.";
4567
4568static PyObject *
4569unicode_rfind(PyUnicodeObject *self, PyObject *args)
4570{
4571    PyUnicodeObject *substring;
4572    int start = 0;
4573    int end = INT_MAX;
4574    PyObject *result;
4575
4576    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4577		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4578        return NULL;
4579    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4580						(PyObject *)substring);
4581    if (substring == NULL)
4582	return NULL;
4583
4584    result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4585
4586    Py_DECREF(substring);
4587    return result;
4588}
4589
4590static char rindex__doc__[] =
4591"S.rindex(sub [,start [,end]]) -> int\n\
4592\n\
4593Like S.rfind() but raise ValueError when the substring is not found.";
4594
4595static PyObject *
4596unicode_rindex(PyUnicodeObject *self, PyObject *args)
4597{
4598    int result;
4599    PyUnicodeObject *substring;
4600    int start = 0;
4601    int end = INT_MAX;
4602
4603    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4604		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4605        return NULL;
4606    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4607						(PyObject *)substring);
4608    if (substring == NULL)
4609	return NULL;
4610
4611    result = findstring(self, substring, start, end, -1);
4612
4613    Py_DECREF(substring);
4614    if (result < 0) {
4615        PyErr_SetString(PyExc_ValueError, "substring not found");
4616        return NULL;
4617    }
4618    return PyInt_FromLong(result);
4619}
4620
4621static char rjust__doc__[] =
4622"S.rjust(width) -> unicode\n\
4623\n\
4624Return S right justified in a Unicode string of length width. Padding is\n\
4625done using spaces.";
4626
4627static PyObject *
4628unicode_rjust(PyUnicodeObject *self, PyObject *args)
4629{
4630    int width;
4631    if (!PyArg_ParseTuple(args, "i:rjust", &width))
4632        return NULL;
4633
4634    if (self->length >= width && PyUnicode_CheckExact(self)) {
4635        Py_INCREF(self);
4636        return (PyObject*) self;
4637    }
4638
4639    return (PyObject*) pad(self, width - self->length, 0, ' ');
4640}
4641
4642static char rstrip__doc__[] =
4643"S.rstrip() -> unicode\n\
4644\n\
4645Return a copy of the string S with trailing whitespace removed.";
4646
4647static PyObject *
4648unicode_rstrip(PyUnicodeObject *self)
4649{
4650    return strip(self, 0, 1);
4651}
4652
4653static PyObject*
4654unicode_slice(PyUnicodeObject *self, int start, int end)
4655{
4656    /* standard clamping */
4657    if (start < 0)
4658        start = 0;
4659    if (end < 0)
4660        end = 0;
4661    if (end > self->length)
4662        end = self->length;
4663    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
4664        /* full slice, return original string */
4665        Py_INCREF(self);
4666        return (PyObject*) self;
4667    }
4668    if (start > end)
4669        start = end;
4670    /* copy slice */
4671    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4672					     end - start);
4673}
4674
4675PyObject *PyUnicode_Split(PyObject *s,
4676			  PyObject *sep,
4677			  int maxsplit)
4678{
4679    PyObject *result;
4680
4681    s = PyUnicode_FromObject(s);
4682    if (s == NULL)
4683	return NULL;
4684    if (sep != NULL) {
4685	sep = PyUnicode_FromObject(sep);
4686	if (sep == NULL) {
4687	    Py_DECREF(s);
4688	    return NULL;
4689	}
4690    }
4691
4692    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4693
4694    Py_DECREF(s);
4695    Py_XDECREF(sep);
4696    return result;
4697}
4698
4699static char split__doc__[] =
4700"S.split([sep [,maxsplit]]) -> list of strings\n\
4701\n\
4702Return a list of the words in S, using sep as the\n\
4703delimiter string.  If maxsplit is given, at most maxsplit\n\
4704splits are done. If sep is not specified, any whitespace string\n\
4705is a separator.";
4706
4707static PyObject*
4708unicode_split(PyUnicodeObject *self, PyObject *args)
4709{
4710    PyObject *substring = Py_None;
4711    int maxcount = -1;
4712
4713    if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4714        return NULL;
4715
4716    if (substring == Py_None)
4717	return split(self, NULL, maxcount);
4718    else if (PyUnicode_Check(substring))
4719	return split(self, (PyUnicodeObject *)substring, maxcount);
4720    else
4721	return PyUnicode_Split((PyObject *)self, substring, maxcount);
4722}
4723
4724static char splitlines__doc__[] =
4725"S.splitlines([keepends]]) -> list of strings\n\
4726\n\
4727Return a list of the lines in S, breaking at line boundaries.\n\
4728Line breaks are not included in the resulting list unless keepends\n\
4729is given and true.";
4730
4731static PyObject*
4732unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4733{
4734    int keepends = 0;
4735
4736    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4737        return NULL;
4738
4739    return PyUnicode_Splitlines((PyObject *)self, keepends);
4740}
4741
4742static
4743PyObject *unicode_str(PyUnicodeObject *self)
4744{
4745    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4746}
4747
4748static char strip__doc__[] =
4749"S.strip() -> unicode\n\
4750\n\
4751Return a copy of S with leading and trailing whitespace removed.";
4752
4753static PyObject *
4754unicode_strip(PyUnicodeObject *self)
4755{
4756    return strip(self, 1, 1);
4757}
4758
4759static char swapcase__doc__[] =
4760"S.swapcase() -> unicode\n\
4761\n\
4762Return a copy of S with uppercase characters converted to lowercase\n\
4763and vice versa.";
4764
4765static PyObject*
4766unicode_swapcase(PyUnicodeObject *self)
4767{
4768    return fixup(self, fixswapcase);
4769}
4770
4771static char translate__doc__[] =
4772"S.translate(table) -> unicode\n\
4773\n\
4774Return a copy of the string S, where all characters have been mapped\n\
4775through the given translation table, which must be a mapping of\n\
4776Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4777are left untouched. Characters mapped to None are deleted.";
4778
4779static PyObject*
4780unicode_translate(PyUnicodeObject *self, PyObject *table)
4781{
4782    return PyUnicode_TranslateCharmap(self->str,
4783				      self->length,
4784				      table,
4785				      "ignore");
4786}
4787
4788static char upper__doc__[] =
4789"S.upper() -> unicode\n\
4790\n\
4791Return a copy of S converted to uppercase.";
4792
4793static PyObject*
4794unicode_upper(PyUnicodeObject *self)
4795{
4796    return fixup(self, fixupper);
4797}
4798
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string S with zeros on the left, to fill a field\n\
of the specified width. The string S is never truncated.";

/* Method S.zfill(width) (currently compiled out).

   Returns self unchanged when it is already at least `width` long.
   Otherwise pads on the left with '0' via pad() and, if the original
   text began with '+' or '-', moves that sign in front of the zeros. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* NOTE(review): pad() is assumed to return NULL on failure;
       guard before touching u->str. */
    if (u == NULL)
        return NULL;

    /* After padding, the original first character sits at index fill. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4834
#if 0
/* Debugging aid (currently compiled out): report the value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long current_size = unicode_freelist_size;

    return PyInt_FromLong(current_size);
}
#endif
4842
4843static char startswith__doc__[] =
4844"S.startswith(prefix[, start[, end]]) -> int\n\
4845\n\
4846Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4847optional start, test S beginning at that position.  With optional end, stop\n\
4848comparing S at that position.";
4849
4850static PyObject *
4851unicode_startswith(PyUnicodeObject *self,
4852		   PyObject *args)
4853{
4854    PyUnicodeObject *substring;
4855    int start = 0;
4856    int end = INT_MAX;
4857    PyObject *result;
4858
4859    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4860		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4861	return NULL;
4862    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4863						(PyObject *)substring);
4864    if (substring == NULL)
4865	return NULL;
4866
4867    result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4868
4869    Py_DECREF(substring);
4870    return result;
4871}
4872
4873
4874static char endswith__doc__[] =
4875"S.endswith(suffix[, start[, end]]) -> int\n\
4876\n\
4877Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4878optional start, test S beginning at that position.  With optional end, stop\n\
4879comparing S at that position.";
4880
4881static PyObject *
4882unicode_endswith(PyUnicodeObject *self,
4883		 PyObject *args)
4884{
4885    PyUnicodeObject *substring;
4886    int start = 0;
4887    int end = INT_MAX;
4888    PyObject *result;
4889
4890    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4891		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4892	return NULL;
4893    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4894						(PyObject *)substring);
4895    if (substring == NULL)
4896	return NULL;
4897
4898    result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4899
4900    Py_DECREF(substring);
4901    return result;
4902}
4903
4904
4905static PyMethodDef unicode_methods[] = {
4906
4907    /* Order is according to common usage: often used methods should
4908       appear first, since lookup is done sequentially. */
4909
4910    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4911    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4912    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4913    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4914    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4915    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4916    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4917    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4918    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4919    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4920    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4921    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4922    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4923    {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4924/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4925    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4926    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4927    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4928    {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4929    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4930    {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4931    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4932    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4933    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4934    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4935    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4936    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4937    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4938    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4939    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4940    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4941    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4942    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4943    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4944    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
4945#if 0
4946    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4947    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
4948#endif
4949
4950#if 0
4951    /* This one is just used for debugging the implementation. */
4952    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
4953#endif
4954
4955    {NULL, NULL}
4956};
4957
4958static PySequenceMethods unicode_as_sequence = {
4959    (inquiry) unicode_length, 		/* sq_length */
4960    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
4961    (intargfunc) unicode_repeat, 	/* sq_repeat */
4962    (intargfunc) unicode_getitem, 	/* sq_item */
4963    (intintargfunc) unicode_slice, 	/* sq_slice */
4964    0, 					/* sq_ass_item */
4965    0, 					/* sq_ass_slice */
4966    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
4967};
4968
4969static int
4970unicode_buffer_getreadbuf(PyUnicodeObject *self,
4971			  int index,
4972			  const void **ptr)
4973{
4974    if (index != 0) {
4975        PyErr_SetString(PyExc_SystemError,
4976			"accessing non-existent unicode segment");
4977        return -1;
4978    }
4979    *ptr = (void *) self->str;
4980    return PyUnicode_GET_DATA_SIZE(self);
4981}
4982
4983static int
4984unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4985			   const void **ptr)
4986{
4987    PyErr_SetString(PyExc_TypeError,
4988		    "cannot use unicode as modifyable buffer");
4989    return -1;
4990}
4991
4992static int
4993unicode_buffer_getsegcount(PyUnicodeObject *self,
4994			   int *lenp)
4995{
4996    if (lenp)
4997        *lenp = PyUnicode_GET_DATA_SIZE(self);
4998    return 1;
4999}
5000
5001static int
5002unicode_buffer_getcharbuf(PyUnicodeObject *self,
5003			  int index,
5004			  const void **ptr)
5005{
5006    PyObject *str;
5007
5008    if (index != 0) {
5009        PyErr_SetString(PyExc_SystemError,
5010			"accessing non-existent unicode segment");
5011        return -1;
5012    }
5013    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5014    if (str == NULL)
5015	return -1;
5016    *ptr = (void *) PyString_AS_STRING(str);
5017    return PyString_GET_SIZE(str);
5018}
5019
5020/* Helpers for PyUnicode_Format() */
5021
5022static PyObject *
5023getnextarg(PyObject *args, int arglen, int *p_argidx)
5024{
5025    int argidx = *p_argidx;
5026    if (argidx < arglen) {
5027	(*p_argidx)++;
5028	if (arglen < 0)
5029	    return args;
5030	else
5031	    return PyTuple_GetItem(args, argidx);
5032    }
5033    PyErr_SetString(PyExc_TypeError,
5034		    "not enough arguments for format string");
5035    return NULL;
5036}
5037
/* Bit flags collected while parsing a format specifier; each one is set
   by the corresponding flag character in PyUnicode_Format(). */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
5043
5044static
5045int usprintf(register Py_UNICODE *buffer, char *format, ...)
5046{
5047    register int i;
5048    int len;
5049    va_list va;
5050    char *charbuffer;
5051    va_start(va, format);
5052
5053    /* First, format the string as char array, then expand to Py_UNICODE
5054       array. */
5055    charbuffer = (char *)buffer;
5056    len = vsprintf(charbuffer, format, va);
5057    for (i = len - 1; i >= 0; i--)
5058	buffer[i] = (Py_UNICODE) charbuffer[i];
5059
5060    va_end(va);
5061    return len;
5062}
5063
5064static int
5065formatfloat(Py_UNICODE *buf,
5066	    size_t buflen,
5067	    int flags,
5068	    int prec,
5069	    int type,
5070	    PyObject *v)
5071{
5072    /* fmt = '%#.' + `prec` + `type`
5073       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5074    char fmt[20];
5075    double x;
5076
5077    x = PyFloat_AsDouble(v);
5078    if (x == -1.0 && PyErr_Occurred())
5079	return -1;
5080    if (prec < 0)
5081	prec = 6;
5082    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5083	type = 'g';
5084    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5085		  (flags & F_ALT) ? "#" : "", prec, type);
5086    /* worst case length calc to ensure no buffer overrun:
5087         fmt = %#.<prec>g
5088         buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5089            for any double rep.)
5090         len = 1 + prec + 1 + 2 + 5 = 9 + prec
5091       If prec=0 the effective precision is 1 (the leading digit is
5092       always given), therefore increase by one to 10+prec. */
5093    if (buflen <= (size_t)10 + (size_t)prec) {
5094	PyErr_SetString(PyExc_OverflowError,
5095	    "formatted float is too long (precision too long?)");
5096	return -1;
5097    }
5098    return usprintf(buf, fmt, x);
5099}
5100
5101static PyObject*
5102formatlong(PyObject *val, int flags, int prec, int type)
5103{
5104	char *buf;
5105	int i, len;
5106	PyObject *str; /* temporary string object. */
5107	PyUnicodeObject *result;
5108
5109	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5110	if (!str)
5111		return NULL;
5112	result = _PyUnicode_New(len);
5113	for (i = 0; i < len; i++)
5114		result->str[i] = buf[i];
5115	result->str[len] = 0;
5116	Py_DECREF(str);
5117	return (PyObject*)result;
5118}
5119
5120static int
5121formatint(Py_UNICODE *buf,
5122	  size_t buflen,
5123	  int flags,
5124	  int prec,
5125	  int type,
5126	  PyObject *v)
5127{
5128    /* fmt = '%#.' + `prec` + 'l' + `type`
5129       worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5130       + 1 + 1 = 24*/
5131    char fmt[64]; /* plenty big enough! */
5132    long x;
5133    int use_native_c_format = 1;
5134
5135    x = PyInt_AsLong(v);
5136    if (x == -1 && PyErr_Occurred())
5137	return -1;
5138    if (prec < 0)
5139	prec = 1;
5140    /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5141       worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5142    if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5143        PyErr_SetString(PyExc_OverflowError,
5144            "formatted integer is too long (precision too long?)");
5145        return -1;
5146    }
5147    /* When converting 0 under %#x or %#X, C leaves off the base marker,
5148     * but we want it (for consistency with other %#x conversions, and
5149     * for consistency with Python's hex() function).
5150     * BUG 28-Apr-2001 tim:  At least two platform Cs (Metrowerks &
5151     * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5152     * So add it only if the platform doesn't already.
5153     */
5154    if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5155        /* Only way to know what the platform does is to try it. */
5156        PyOS_snprintf(fmt, sizeof(fmt), type == 'x' ? "%#x" : "%#X", 0);
5157        if (fmt[1] != (char)type) {
5158            /* Supply our own leading 0x/0X -- needed under std C */
5159            use_native_c_format = 0;
5160            PyOS_snprintf(fmt, sizeof(fmt), "0%c%%#.%dl%c", type, prec, type);
5161        }
5162    }
5163    if (use_native_c_format)
5164         PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5165		       (flags & F_ALT) ? "#" : "", prec, type);
5166    return usprintf(buf, fmt, x);
5167}
5168
5169static int
5170formatchar(Py_UNICODE *buf,
5171           size_t buflen,
5172           PyObject *v)
5173{
5174    /* presume that the buffer is at least 2 characters long */
5175    if (PyUnicode_Check(v)) {
5176	if (PyUnicode_GET_SIZE(v) != 1)
5177	    goto onError;
5178	buf[0] = PyUnicode_AS_UNICODE(v)[0];
5179    }
5180
5181    else if (PyString_Check(v)) {
5182	if (PyString_GET_SIZE(v) != 1)
5183	    goto onError;
5184	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5185    }
5186
5187    else {
5188	/* Integer input truncated to a character */
5189        long x;
5190	x = PyInt_AsLong(v);
5191	if (x == -1 && PyErr_Occurred())
5192	    goto onError;
5193	buf[0] = (char) x;
5194    }
5195    buf[1] = '\0';
5196    return 1;
5197
5198 onError:
5199    PyErr_SetString(PyExc_TypeError,
5200		    "%c requires int or char");
5201    return -1;
5202}
5203
5204/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
5205
5206   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
5207   chars are formatted. XXX This is a magic number. Each formatting
5208   routine does bounds checking to ensure no overflow, but a better
5209   solution may be to malloc a buffer of appropriate size for each
5210   format. For now, the current solution is sufficient.
5211*/
5212#define FORMATBUFLEN (size_t)120
5213
5214PyObject *PyUnicode_Format(PyObject *format,
5215			   PyObject *args)
5216{
5217    Py_UNICODE *fmt, *res;
5218    int fmtcnt, rescnt, reslen, arglen, argidx;
5219    int args_owned = 0;
5220    PyUnicodeObject *result = NULL;
5221    PyObject *dict = NULL;
5222    PyObject *uformat;
5223
5224    if (format == NULL || args == NULL) {
5225	PyErr_BadInternalCall();
5226	return NULL;
5227    }
5228    uformat = PyUnicode_FromObject(format);
5229    if (uformat == NULL)
5230	return NULL;
5231    fmt = PyUnicode_AS_UNICODE(uformat);
5232    fmtcnt = PyUnicode_GET_SIZE(uformat);
5233
5234    reslen = rescnt = fmtcnt + 100;
5235    result = _PyUnicode_New(reslen);
5236    if (result == NULL)
5237	goto onError;
5238    res = PyUnicode_AS_UNICODE(result);
5239
5240    if (PyTuple_Check(args)) {
5241	arglen = PyTuple_Size(args);
5242	argidx = 0;
5243    }
5244    else {
5245	arglen = -1;
5246	argidx = -2;
5247    }
5248    if (args->ob_type->tp_as_mapping)
5249	dict = args;
5250
5251    while (--fmtcnt >= 0) {
5252	if (*fmt != '%') {
5253	    if (--rescnt < 0) {
5254		rescnt = fmtcnt + 100;
5255		reslen += rescnt;
5256		if (_PyUnicode_Resize(&result, reslen) < 0)
5257		    return NULL;
5258		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5259		--rescnt;
5260	    }
5261	    *res++ = *fmt++;
5262	}
5263	else {
5264	    /* Got a format specifier */
5265	    int flags = 0;
5266	    int width = -1;
5267	    int prec = -1;
5268	    Py_UNICODE c = '\0';
5269	    Py_UNICODE fill;
5270	    PyObject *v = NULL;
5271	    PyObject *temp = NULL;
5272	    Py_UNICODE *pbuf;
5273	    Py_UNICODE sign;
5274	    int len;
5275	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
5276
5277	    fmt++;
5278	    if (*fmt == '(') {
5279		Py_UNICODE *keystart;
5280		int keylen;
5281		PyObject *key;
5282		int pcount = 1;
5283
5284		if (dict == NULL) {
5285		    PyErr_SetString(PyExc_TypeError,
5286				    "format requires a mapping");
5287		    goto onError;
5288		}
5289		++fmt;
5290		--fmtcnt;
5291		keystart = fmt;
5292		/* Skip over balanced parentheses */
5293		while (pcount > 0 && --fmtcnt >= 0) {
5294		    if (*fmt == ')')
5295			--pcount;
5296		    else if (*fmt == '(')
5297			++pcount;
5298		    fmt++;
5299		}
5300		keylen = fmt - keystart - 1;
5301		if (fmtcnt < 0 || pcount > 0) {
5302		    PyErr_SetString(PyExc_ValueError,
5303				    "incomplete format key");
5304		    goto onError;
5305		}
5306#if 0
5307		/* keys are converted to strings using UTF-8 and
5308		   then looked up since Python uses strings to hold
5309		   variables names etc. in its namespaces and we
5310		   wouldn't want to break common idioms. */
5311		key = PyUnicode_EncodeUTF8(keystart,
5312					   keylen,
5313					   NULL);
5314#else
5315		key = PyUnicode_FromUnicode(keystart, keylen);
5316#endif
5317		if (key == NULL)
5318		    goto onError;
5319		if (args_owned) {
5320		    Py_DECREF(args);
5321		    args_owned = 0;
5322		}
5323		args = PyObject_GetItem(dict, key);
5324		Py_DECREF(key);
5325		if (args == NULL) {
5326		    goto onError;
5327		}
5328		args_owned = 1;
5329		arglen = -1;
5330		argidx = -2;
5331	    }
5332	    while (--fmtcnt >= 0) {
5333		switch (c = *fmt++) {
5334		case '-': flags |= F_LJUST; continue;
5335		case '+': flags |= F_SIGN; continue;
5336		case ' ': flags |= F_BLANK; continue;
5337		case '#': flags |= F_ALT; continue;
5338		case '0': flags |= F_ZERO; continue;
5339		}
5340		break;
5341	    }
5342	    if (c == '*') {
5343		v = getnextarg(args, arglen, &argidx);
5344		if (v == NULL)
5345		    goto onError;
5346		if (!PyInt_Check(v)) {
5347		    PyErr_SetString(PyExc_TypeError,
5348				    "* wants int");
5349		    goto onError;
5350		}
5351		width = PyInt_AsLong(v);
5352		if (width < 0) {
5353		    flags |= F_LJUST;
5354		    width = -width;
5355		}
5356		if (--fmtcnt >= 0)
5357		    c = *fmt++;
5358	    }
5359	    else if (c >= '0' && c <= '9') {
5360		width = c - '0';
5361		while (--fmtcnt >= 0) {
5362		    c = *fmt++;
5363		    if (c < '0' || c > '9')
5364			break;
5365		    if ((width*10) / 10 != width) {
5366			PyErr_SetString(PyExc_ValueError,
5367					"width too big");
5368			goto onError;
5369		    }
5370		    width = width*10 + (c - '0');
5371		}
5372	    }
5373	    if (c == '.') {
5374		prec = 0;
5375		if (--fmtcnt >= 0)
5376		    c = *fmt++;
5377		if (c == '*') {
5378		    v = getnextarg(args, arglen, &argidx);
5379		    if (v == NULL)
5380			goto onError;
5381		    if (!PyInt_Check(v)) {
5382			PyErr_SetString(PyExc_TypeError,
5383					"* wants int");
5384			goto onError;
5385		    }
5386		    prec = PyInt_AsLong(v);
5387		    if (prec < 0)
5388			prec = 0;
5389		    if (--fmtcnt >= 0)
5390			c = *fmt++;
5391		}
5392		else if (c >= '0' && c <= '9') {
5393		    prec = c - '0';
5394		    while (--fmtcnt >= 0) {
5395			c = Py_CHARMASK(*fmt++);
5396			if (c < '0' || c > '9')
5397			    break;
5398			if ((prec*10) / 10 != prec) {
5399			    PyErr_SetString(PyExc_ValueError,
5400					    "prec too big");
5401			    goto onError;
5402			}
5403			prec = prec*10 + (c - '0');
5404		    }
5405		}
5406	    } /* prec */
5407	    if (fmtcnt >= 0) {
5408		if (c == 'h' || c == 'l' || c == 'L') {
5409		    if (--fmtcnt >= 0)
5410			c = *fmt++;
5411		}
5412	    }
5413	    if (fmtcnt < 0) {
5414		PyErr_SetString(PyExc_ValueError,
5415				"incomplete format");
5416		goto onError;
5417	    }
5418	    if (c != '%') {
5419		v = getnextarg(args, arglen, &argidx);
5420		if (v == NULL)
5421		    goto onError;
5422	    }
5423	    sign = 0;
5424	    fill = ' ';
5425	    switch (c) {
5426
5427	    case '%':
5428		pbuf = formatbuf;
5429		/* presume that buffer length is at least 1 */
5430		pbuf[0] = '%';
5431		len = 1;
5432		break;
5433
5434	    case 's':
5435	    case 'r':
5436		if (PyUnicode_Check(v) && c == 's') {
5437		    temp = v;
5438		    Py_INCREF(temp);
5439		}
5440		else {
5441		    PyObject *unicode;
5442		    if (c == 's')
5443			temp = PyObject_Str(v);
5444		    else
5445			temp = PyObject_Repr(v);
5446		    if (temp == NULL)
5447			goto onError;
5448		    if (!PyString_Check(temp)) {
5449			/* XXX Note: this should never happen, since
5450   			       PyObject_Repr() and PyObject_Str() assure
5451			       this */
5452			Py_DECREF(temp);
5453			PyErr_SetString(PyExc_TypeError,
5454					"%s argument has non-string str()");
5455			goto onError;
5456		    }
5457		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5458						   PyString_GET_SIZE(temp),
5459					       NULL,
5460						   "strict");
5461		    Py_DECREF(temp);
5462		    temp = unicode;
5463		    if (temp == NULL)
5464			goto onError;
5465		}
5466		pbuf = PyUnicode_AS_UNICODE(temp);
5467		len = PyUnicode_GET_SIZE(temp);
5468		if (prec >= 0 && len > prec)
5469		    len = prec;
5470		break;
5471
5472	    case 'i':
5473	    case 'd':
5474	    case 'u':
5475	    case 'o':
5476	    case 'x':
5477	    case 'X':
5478		if (c == 'i')
5479		    c = 'd';
5480		if (PyLong_Check(v)) {
5481		    temp = formatlong(v, flags, prec, c);
5482		    if (!temp)
5483			goto onError;
5484		    pbuf = PyUnicode_AS_UNICODE(temp);
5485		    len = PyUnicode_GET_SIZE(temp);
5486		    /* unbounded ints can always produce
5487		       a sign character! */
5488		    sign = 1;
5489		}
5490		else {
5491		    pbuf = formatbuf;
5492		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5493				    flags, prec, c, v);
5494		    if (len < 0)
5495			goto onError;
5496		    /* only d conversion is signed */
5497		    sign = c == 'd';
5498		}
5499		if (flags & F_ZERO)
5500		    fill = '0';
5501		break;
5502
5503	    case 'e':
5504	    case 'E':
5505	    case 'f':
5506	    case 'g':
5507	    case 'G':
5508		pbuf = formatbuf;
5509		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5510			flags, prec, c, v);
5511		if (len < 0)
5512		    goto onError;
5513		sign = 1;
5514		if (flags & F_ZERO)
5515		    fill = '0';
5516		break;
5517
5518	    case 'c':
5519		pbuf = formatbuf;
5520		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5521		if (len < 0)
5522		    goto onError;
5523		break;
5524
5525	    default:
5526		PyErr_Format(PyExc_ValueError,
5527			     "unsupported format character '%c' (0x%x) "
5528			     "at index %i",
5529			     (31<=c && c<=126) ? c : '?',
5530                             c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5531		goto onError;
5532	    }
5533	    if (sign) {
5534		if (*pbuf == '-' || *pbuf == '+') {
5535		    sign = *pbuf++;
5536		    len--;
5537		}
5538		else if (flags & F_SIGN)
5539		    sign = '+';
5540		else if (flags & F_BLANK)
5541		    sign = ' ';
5542		else
5543		    sign = 0;
5544	    }
5545	    if (width < len)
5546		width = len;
5547	    if (rescnt < width + (sign != 0)) {
5548		reslen -= rescnt;
5549		rescnt = width + fmtcnt + 100;
5550		reslen += rescnt;
5551		if (_PyUnicode_Resize(&result, reslen) < 0)
5552		    return NULL;
5553		res = PyUnicode_AS_UNICODE(result)
5554		    + reslen - rescnt;
5555	    }
5556	    if (sign) {
5557		if (fill != ' ')
5558		    *res++ = sign;
5559		rescnt--;
5560		if (width > len)
5561		    width--;
5562	    }
5563	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5564		assert(pbuf[0] == '0');
5565		assert(pbuf[1] == c);
5566		if (fill != ' ') {
5567		    *res++ = *pbuf++;
5568		    *res++ = *pbuf++;
5569		}
5570		rescnt -= 2;
5571		width -= 2;
5572		if (width < 0)
5573		    width = 0;
5574		len -= 2;
5575	    }
5576	    if (width > len && !(flags & F_LJUST)) {
5577		do {
5578		    --rescnt;
5579		    *res++ = fill;
5580		} while (--width > len);
5581	    }
5582	    if (fill == ' ') {
5583		if (sign)
5584		    *res++ = sign;
5585		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5586		    assert(pbuf[0] == '0');
5587		    assert(pbuf[1] == c);
5588		    *res++ = *pbuf++;
5589		    *res++ = *pbuf++;
5590		}
5591	    }
5592	    Py_UNICODE_COPY(res, pbuf, len);
5593	    res += len;
5594	    rescnt -= len;
5595	    while (--width >= len) {
5596		--rescnt;
5597		*res++ = ' ';
5598	    }
5599	    if (dict && (argidx < arglen) && c != '%') {
5600		PyErr_SetString(PyExc_TypeError,
5601				"not all arguments converted");
5602		goto onError;
5603	    }
5604	    Py_XDECREF(temp);
5605	} /* '%' */
5606    } /* until end */
5607    if (argidx < arglen && !dict) {
5608	PyErr_SetString(PyExc_TypeError,
5609			"not all arguments converted");
5610	goto onError;
5611    }
5612
5613    if (args_owned) {
5614	Py_DECREF(args);
5615    }
5616    Py_DECREF(uformat);
5617    if (_PyUnicode_Resize(&result, reslen - rescnt))
5618	goto onError;
5619    return (PyObject *)result;
5620
5621 onError:
5622    Py_XDECREF(result);
5623    Py_DECREF(uformat);
5624    if (args_owned) {
5625	Py_DECREF(args);
5626    }
5627    return NULL;
5628}
5629
5630static PyBufferProcs unicode_as_buffer = {
5631    (getreadbufferproc) unicode_buffer_getreadbuf,
5632    (getwritebufferproc) unicode_buffer_getwritebuf,
5633    (getsegcountproc) unicode_buffer_getsegcount,
5634    (getcharbufferproc) unicode_buffer_getcharbuf,
5635};
5636
5637staticforward PyObject *
5638unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5639
5640static PyObject *
5641unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5642{
5643        PyObject *x = NULL;
5644	static char *kwlist[] = {"string", "encoding", "errors", 0};
5645	char *encoding = NULL;
5646	char *errors = NULL;
5647
5648	if (type != &PyUnicode_Type)
5649		return unicode_subtype_new(type, args, kwds);
5650	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5651					  kwlist, &x, &encoding, &errors))
5652	    return NULL;
5653	if (x == NULL)
5654		return (PyObject *)_PyUnicode_New(0);
5655	if (encoding == NULL && errors == NULL)
5656	    return PyObject_Unicode(x);
5657	else
5658	return PyUnicode_FromEncodedObject(x, encoding, errors);
5659}
5660
5661static PyObject *
5662unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5663{
5664	PyUnicodeObject *tmp, *pnew;
5665	int n;
5666
5667	assert(PyType_IsSubtype(type, &PyUnicode_Type));
5668	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5669	if (tmp == NULL)
5670		return NULL;
5671	assert(PyUnicode_Check(tmp));
5672	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5673	if (pnew == NULL)
5674		return NULL;
5675	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5676	if (pnew->str == NULL) {
5677		_Py_ForgetReference((PyObject *)pnew);
5678		PyObject_DEL(pnew);
5679		return NULL;
5680	}
5681	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5682	pnew->length = n;
5683	pnew->hash = tmp->hash;
5684	Py_DECREF(tmp);
5685	return (PyObject *)pnew;
5686}
5687
/* Docstring of the unicode type; referenced from the tp_doc slot of
   PyUnicode_Type. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5694
5695PyTypeObject PyUnicode_Type = {
5696    PyObject_HEAD_INIT(&PyType_Type)
5697    0, 					/* ob_size */
5698    "unicode", 				/* tp_name */
5699    sizeof(PyUnicodeObject), 		/* tp_size */
5700    0, 					/* tp_itemsize */
5701    /* Slots */
5702    (destructor)unicode_dealloc, 	/* tp_dealloc */
5703    0, 					/* tp_print */
5704    0,				 	/* tp_getattr */
5705    0, 					/* tp_setattr */
5706    (cmpfunc) unicode_compare, 		/* tp_compare */
5707    (reprfunc) unicode_repr, 		/* tp_repr */
5708    0, 					/* tp_as_number */
5709    &unicode_as_sequence, 		/* tp_as_sequence */
5710    0, 					/* tp_as_mapping */
5711    (hashfunc) unicode_hash, 		/* tp_hash*/
5712    0, 					/* tp_call*/
5713    (reprfunc) unicode_str,	 	/* tp_str */
5714    PyObject_GenericGetAttr, 		/* tp_getattro */
5715    0,			 		/* tp_setattro */
5716    &unicode_as_buffer,			/* tp_as_buffer */
5717    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
5718    unicode_doc,			/* tp_doc */
5719    0,					/* tp_traverse */
5720    0,					/* tp_clear */
5721    0,					/* tp_richcompare */
5722    0,					/* tp_weaklistoffset */
5723    0,					/* tp_iter */
5724    0,					/* tp_iternext */
5725    unicode_methods,			/* tp_methods */
5726    0,					/* tp_members */
5727    0,					/* tp_getset */
5728    0,					/* tp_base */
5729    0,					/* tp_dict */
5730    0,					/* tp_descr_get */
5731    0,					/* tp_descr_set */
5732    0,					/* tp_dictoffset */
5733    0,					/* tp_init */
5734    0,					/* tp_alloc */
5735    unicode_new,			/* tp_new */
5736    _PyObject_Del,			/* tp_free */
5737};
5738
5739/* Initialize the Unicode implementation */
5740
5741void _PyUnicode_Init(void)
5742{
5743    int i;
5744
5745    /* Init the implementation */
5746    unicode_freelist = NULL;
5747    unicode_freelist_size = 0;
5748    unicode_empty = _PyUnicode_New(0);
5749    strcpy(unicode_default_encoding, "ascii");
5750    for (i = 0; i < 256; i++)
5751	unicode_latin1[i] = NULL;
5752}
5753
5754/* Finalize the Unicode implementation */
5755
5756void
5757_PyUnicode_Fini(void)
5758{
5759    PyUnicodeObject *u;
5760    int i;
5761
5762    Py_XDECREF(unicode_empty);
5763    unicode_empty = NULL;
5764
5765    for (i = 0; i < 256; i++) {
5766	if (unicode_latin1[i]) {
5767	    Py_DECREF(unicode_latin1[i]);
5768	    unicode_latin1[i] = NULL;
5769	}
5770    }
5771
5772    for (u = unicode_freelist; u != NULL;) {
5773	PyUnicodeObject *v = u;
5774	u = *(PyUnicodeObject **)u;
5775	if (v->str)
5776	    PyMem_DEL(v->str);
5777	Py_XDECREF(v->defenc);
5778	PyObject_DEL(v);
5779    }
5780    unicode_freelist = NULL;
5781    unicode_freelist_size = 0;
5782}
5783