unicodeobject.c revision b5507ecd3cfce17bab26311298f527572611af0b
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9--------------------------------------------------------------------
10The original string type implementation is:
11
12    Copyright (c) 1999 by Secret Labs AB
13    Copyright (c) 1999 by Fredrik Lundh
14
15By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
38
39#include "Python.h"
40
41#include "unicodeobject.h"
42#include "ucnhash.h"
43
44#ifdef MS_WIN32
45#include <windows.h>
46#endif
47
/* Limit for the Unicode object free list: unicode_dealloc() only parks
   a dead object on the list while fewer than this many are on it. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a length less than this
   limit (counted in Py_UNICODE characters, not bytes). This reduces
   malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) +
    KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
    malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off, because
   every buffer is then released when its object is deallocated.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian unless
   WORDS_BIGENDIAN is defined. Exactly one of the two BYTEORDER_IS_*
   macros is defined after this block. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/

/* Free list for Unicode objects. The list is singly linked: the first
   pointer-sized word of each parked object holds the link to the next
   one. unicode_freelist_size counts the objects currently parked. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well. Entries are created lazily by
   PyUnicode_FromUnicode(); a NULL slot means "not created yet". */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
105
106Py_UNICODE
107PyUnicode_GetMax(void)
108{
109#ifdef Py_UNICODE_WIDE
110	return 0x10FFFF;
111#else
112	/* This is actually an illegal character, so it should
113	   not be passed to unichr. */
114	return 0xFFFF;
115#endif
116}
117
118/* --- Unicode Object ----------------------------------------------------- */
119
120static
121int unicode_resize(register PyUnicodeObject *unicode,
122                      int length)
123{
124    void *oldstr;
125
126    /* Shortcut if there's nothing much to do. */
127    if (unicode->length == length)
128	goto reset;
129
130    /* Resizing shared object (unicode_empty or single character
131       objects) in-place is not allowed. Use PyUnicode_Resize()
132       instead ! */
133    if (unicode == unicode_empty ||
134	(unicode->length == 1 &&
135	 unicode->str[0] < 256 &&
136	 unicode_latin1[unicode->str[0]] == unicode)) {
137        PyErr_SetString(PyExc_SystemError,
138                        "can't resize shared unicode objects");
139        return -1;
140    }
141
142    /* We allocate one more byte to make sure the string is
143       Ux0000 terminated -- XXX is this needed ? */
144    oldstr = unicode->str;
145    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146    if (!unicode->str) {
147	unicode->str = oldstr;
148        PyErr_NoMemory();
149        return -1;
150    }
151    unicode->str[length] = 0;
152    unicode->length = length;
153
154 reset:
155    /* Reset the object caches */
156    if (unicode->defenc) {
157        Py_DECREF(unicode->defenc);
158        unicode->defenc = NULL;
159    }
160    unicode->hash = -1;
161
162    return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166   Ux0000 terminated -- XXX is this needed ?
167
168   XXX This allocator could further be enhanced by assuring that the
169       free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176    register PyUnicodeObject *unicode;
177
178    /* Optimization for empty strings */
179    if (length == 0 && unicode_empty != NULL) {
180        Py_INCREF(unicode_empty);
181        return unicode_empty;
182    }
183
184    /* Unicode freelist & memory allocation */
185    if (unicode_freelist) {
186        unicode = unicode_freelist;
187        unicode_freelist = *(PyUnicodeObject **)unicode;
188        unicode_freelist_size--;
189	if (unicode->str) {
190	    /* Keep-Alive optimization: we only upsize the buffer,
191	       never downsize it. */
192	    if ((unicode->length < length) &&
193		unicode_resize(unicode, length)) {
194		PyMem_DEL(unicode->str);
195		goto onError;
196	    }
197	}
198        else {
199	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
200        }
201        PyObject_INIT(unicode, &PyUnicode_Type);
202    }
203    else {
204        unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205        if (unicode == NULL)
206            return NULL;
207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208    }
209
210    if (!unicode->str) {
211	PyErr_NoMemory();
212	goto onError;
213    }
214    unicode->str[length] = 0;
215    unicode->length = length;
216    unicode->hash = -1;
217    unicode->defenc = NULL;
218    return unicode;
219
220 onError:
221    _Py_ForgetReference((PyObject *)unicode);
222    PyObject_DEL(unicode);
223    return NULL;
224}
225
226static
227void unicode_dealloc(register PyUnicodeObject *unicode)
228{
229    if (!PyUnicode_CheckExact(unicode)) {
230	unicode->ob_type->tp_free((PyObject *)unicode);
231	return;
232    }
233    if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
234        /* Keep-Alive optimization */
235	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
236	    PyMem_DEL(unicode->str);
237	    unicode->str = NULL;
238	    unicode->length = 0;
239	}
240	if (unicode->defenc) {
241	    Py_DECREF(unicode->defenc);
242	    unicode->defenc = NULL;
243	}
244	/* Add to free list */
245        *(PyUnicodeObject **)unicode = unicode_freelist;
246        unicode_freelist = unicode;
247        unicode_freelist_size++;
248    }
249    else {
250	PyMem_DEL(unicode->str);
251	Py_XDECREF(unicode->defenc);
252	PyObject_DEL(unicode);
253    }
254}
255
256int PyUnicode_Resize(PyObject **unicode,
257		     int length)
258{
259    register PyUnicodeObject *v;
260
261    /* Argument checks */
262    if (unicode == NULL) {
263	PyErr_BadInternalCall();
264	return -1;
265    }
266    v = (PyUnicodeObject *)*unicode;
267    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
268	PyErr_BadInternalCall();
269	return -1;
270    }
271
272    /* Resizing unicode_empty and single character objects is not
273       possible since these are being shared. We simply return a fresh
274       copy with the same Unicode content. */
275    if (v->length != length &&
276	(v == unicode_empty || v->length == 1)) {
277	PyUnicodeObject *w = _PyUnicode_New(length);
278	if (w == NULL)
279	    return -1;
280	Py_UNICODE_COPY(w->str, v->str,
281			length < v->length ? length : v->length);
282	*unicode = (PyObject *)w;
283	return 0;
284    }
285
286    /* Note that we don't have to modify *unicode for unshared Unicode
287       objects, since we can modify them in-place. */
288    return unicode_resize(v, length);
289}
290
/* Internal API for use in unicodeobject.c only !

   Convenience wrapper around PyUnicode_Resize() for callers that hold
   their object in a variable that is not declared as PyObject *:
   `unicodevar` is the ADDRESS of that variable and is cast to
   PyObject ** before the call. The result is that of
   PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
294
295PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
296				int size)
297{
298    PyUnicodeObject *unicode;
299
300    /* If the Unicode data is known at construction time, we can apply
301       some optimizations which share commonly used objects. */
302    if (u != NULL) {
303
304	/* Optimization for empty strings */
305	if (size == 0 && unicode_empty != NULL) {
306	    Py_INCREF(unicode_empty);
307	    return (PyObject *)unicode_empty;
308	}
309
310	/* Single character Unicode objects in the Latin-1 range are
311	   shared when using this constructor */
312	if (size == 1 && *u < 256) {
313	    unicode = unicode_latin1[*u];
314	    if (!unicode) {
315		unicode = _PyUnicode_New(1);
316		if (!unicode)
317		    return NULL;
318		unicode->str[0] = *u;
319		unicode_latin1[*u] = unicode;
320	    }
321	    Py_INCREF(unicode);
322	    return (PyObject *)unicode;
323	}
324    }
325
326    unicode = _PyUnicode_New(size);
327    if (!unicode)
328        return NULL;
329
330    /* Copy the Unicode data into the new object */
331    if (u != NULL)
332	Py_UNICODE_COPY(unicode->str, u, size);
333
334    return (PyObject *)unicode;
335}
336
337#ifdef HAVE_WCHAR_H
338
339PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
340				 int size)
341{
342    PyUnicodeObject *unicode;
343
344    if (w == NULL) {
345	PyErr_BadInternalCall();
346	return NULL;
347    }
348
349    unicode = _PyUnicode_New(size);
350    if (!unicode)
351        return NULL;
352
353    /* Copy the wchar_t data into the new object */
354#ifdef HAVE_USABLE_WCHAR_T
355    memcpy(unicode->str, w, size * sizeof(wchar_t));
356#else
357    {
358	register Py_UNICODE *u;
359	register int i;
360	u = PyUnicode_AS_UNICODE(unicode);
361	for (i = size; i >= 0; i--)
362	    *u++ = *w++;
363    }
364#endif
365
366    return (PyObject *)unicode;
367}
368
369int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
370			 register wchar_t *w,
371			 int size)
372{
373    if (unicode == NULL) {
374	PyErr_BadInternalCall();
375	return -1;
376    }
377    if (size > PyUnicode_GET_SIZE(unicode))
378	size = PyUnicode_GET_SIZE(unicode);
379#ifdef HAVE_USABLE_WCHAR_T
380    memcpy(w, unicode->str, size * sizeof(wchar_t));
381#else
382    {
383	register Py_UNICODE *u;
384	register int i;
385	u = PyUnicode_AS_UNICODE(unicode);
386	for (i = size; i >= 0; i--)
387	    *w++ = *u++;
388    }
389#endif
390
391    return size;
392}
393
394#endif
395
396PyObject *PyUnicode_FromObject(register PyObject *obj)
397{
398    /* XXX Perhaps we should make this API an alias of
399           PyObject_Unicode() instead ?! */
400    if (PyUnicode_CheckExact(obj)) {
401	Py_INCREF(obj);
402	return obj;
403    }
404    if (PyUnicode_Check(obj)) {
405	/* For a Unicode subtype that's not a Unicode object,
406	   return a true Unicode object with the same data. */
407	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
408				     PyUnicode_GET_SIZE(obj));
409    }
410    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
411}
412
413PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
414				      const char *encoding,
415				      const char *errors)
416{
417    const char *s = NULL;
418    int len;
419    int owned = 0;
420    PyObject *v;
421
422    if (obj == NULL) {
423	PyErr_BadInternalCall();
424	return NULL;
425    }
426
427#if 0
428    /* For b/w compatibility we also accept Unicode objects provided
429       that no encodings is given and then redirect to
430       PyObject_Unicode() which then applies the additional logic for
431       Unicode subclasses.
432
433       NOTE: This API should really only be used for object which
434             represent *encoded* Unicode !
435
436    */
437	if (PyUnicode_Check(obj)) {
438	    if (encoding) {
439		PyErr_SetString(PyExc_TypeError,
440				"decoding Unicode is not supported");
441	    return NULL;
442	    }
443	return PyObject_Unicode(obj);
444	    }
445#else
446    if (PyUnicode_Check(obj)) {
447	PyErr_SetString(PyExc_TypeError,
448			"decoding Unicode is not supported");
449	return NULL;
450	}
451#endif
452
453    /* Coerce object */
454    if (PyString_Check(obj)) {
455	    s = PyString_AS_STRING(obj);
456	    len = PyString_GET_SIZE(obj);
457	    }
458    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
459	/* Overwrite the error message with something more useful in
460	   case of a TypeError. */
461	if (PyErr_ExceptionMatches(PyExc_TypeError))
462	PyErr_Format(PyExc_TypeError,
463			 "coercing to Unicode: need string or buffer, "
464			 "%.80s found",
465		     obj->ob_type->tp_name);
466	goto onError;
467    }
468
469    /* Convert to Unicode */
470    if (len == 0) {
471	Py_INCREF(unicode_empty);
472	v = (PyObject *)unicode_empty;
473    }
474    else
475	v = PyUnicode_Decode(s, len, encoding, errors);
476
477    if (owned) {
478	Py_DECREF(obj);
479    }
480    return v;
481
482 onError:
483    if (owned) {
484	Py_DECREF(obj);
485    }
486    return NULL;
487}
488
489PyObject *PyUnicode_Decode(const char *s,
490			   int size,
491			   const char *encoding,
492			   const char *errors)
493{
494    PyObject *buffer = NULL, *unicode;
495
496    if (encoding == NULL)
497	encoding = PyUnicode_GetDefaultEncoding();
498
499    /* Shortcuts for common default encodings */
500    if (strcmp(encoding, "utf-8") == 0)
501        return PyUnicode_DecodeUTF8(s, size, errors);
502    else if (strcmp(encoding, "latin-1") == 0)
503        return PyUnicode_DecodeLatin1(s, size, errors);
504    else if (strcmp(encoding, "ascii") == 0)
505        return PyUnicode_DecodeASCII(s, size, errors);
506
507    /* Decode via the codec registry */
508    buffer = PyBuffer_FromMemory((void *)s, size);
509    if (buffer == NULL)
510        goto onError;
511    unicode = PyCodec_Decode(buffer, encoding, errors);
512    if (unicode == NULL)
513        goto onError;
514    if (!PyUnicode_Check(unicode)) {
515        PyErr_Format(PyExc_TypeError,
516                     "decoder did not return an unicode object (type=%.400s)",
517                     unicode->ob_type->tp_name);
518        Py_DECREF(unicode);
519        goto onError;
520    }
521    Py_DECREF(buffer);
522    return unicode;
523
524 onError:
525    Py_XDECREF(buffer);
526    return NULL;
527}
528
529PyObject *PyUnicode_Encode(const Py_UNICODE *s,
530			   int size,
531			   const char *encoding,
532			   const char *errors)
533{
534    PyObject *v, *unicode;
535
536    unicode = PyUnicode_FromUnicode(s, size);
537    if (unicode == NULL)
538	return NULL;
539    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
540    Py_DECREF(unicode);
541    return v;
542}
543
544PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
545                                    const char *encoding,
546                                    const char *errors)
547{
548    PyObject *v;
549
550    if (!PyUnicode_Check(unicode)) {
551        PyErr_BadArgument();
552        goto onError;
553    }
554
555    if (encoding == NULL)
556	encoding = PyUnicode_GetDefaultEncoding();
557
558    /* Shortcuts for common default encodings */
559    if (errors == NULL) {
560	if (strcmp(encoding, "utf-8") == 0)
561	    return PyUnicode_AsUTF8String(unicode);
562	else if (strcmp(encoding, "latin-1") == 0)
563	    return PyUnicode_AsLatin1String(unicode);
564	else if (strcmp(encoding, "ascii") == 0)
565	    return PyUnicode_AsASCIIString(unicode);
566    }
567
568    /* Encode via the codec registry */
569    v = PyCodec_Encode(unicode, encoding, errors);
570    if (v == NULL)
571        goto onError;
572    /* XXX Should we really enforce this ? */
573    if (!PyString_Check(v)) {
574        PyErr_Format(PyExc_TypeError,
575                     "encoder did not return a string object (type=%.400s)",
576                     v->ob_type->tp_name);
577        Py_DECREF(v);
578        goto onError;
579    }
580    return v;
581
582 onError:
583    return NULL;
584}
585
586PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
587					    const char *errors)
588{
589    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
590
591    if (v)
592        return v;
593    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
594    if (v && errors == NULL)
595        ((PyUnicodeObject *)unicode)->defenc = v;
596    return v;
597}
598
599Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
600{
601    if (!PyUnicode_Check(unicode)) {
602        PyErr_BadArgument();
603        goto onError;
604    }
605    return PyUnicode_AS_UNICODE(unicode);
606
607 onError:
608    return NULL;
609}
610
611int PyUnicode_GetSize(PyObject *unicode)
612{
613    if (!PyUnicode_Check(unicode)) {
614        PyErr_BadArgument();
615        goto onError;
616    }
617    return PyUnicode_GET_SIZE(unicode);
618
619 onError:
620    return -1;
621}
622
623const char *PyUnicode_GetDefaultEncoding(void)
624{
625    return unicode_default_encoding;
626}
627
628int PyUnicode_SetDefaultEncoding(const char *encoding)
629{
630    PyObject *v;
631
632    /* Make sure the encoding is valid. As side effect, this also
633       loads the encoding into the codec registry cache. */
634    v = _PyCodec_Lookup(encoding);
635    if (v == NULL)
636	goto onError;
637    Py_DECREF(v);
638    strncpy(unicode_default_encoding,
639	    encoding,
640	    sizeof(unicode_default_encoding));
641    return 0;
642
643 onError:
644    return -1;
645}
646
647/* --- UTF-7 Codec -------------------------------------------------------- */
648
649/* see RFC2152 for details */
650
static
char utf7_special[128] = {
    /* Classification of the 128 ASCII codes for the UTF-7 codec,
       indexed by character code. Indicates whether a character is
       special, i.e. cannot be directly encoded:
	   0 - not special
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional)
       Classes 2 and 3 only count as special when the corresponding
       flag is passed to the SPECIAL() macro. Each row below covers
       16 consecutive codes. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,   /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,   /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,   /* 0x30 - 0x3F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,   /* 0x50 - 0x5F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,   /* 0x70 - 0x7F */

};
669
/* SPECIAL(c, encodeO, encodeWS) is true if character c has to be
   encoded in a shift sequence: c is above 127 or has class 1 in
   utf7_special, or it has class 2 and encodeWS is set, or it has
   class 3 and encodeO is set. The table is only indexed when c is
   not above 127. Note that c is evaluated several times. */
#define SPECIAL(c, encodeO, encodeWS) \
	(((c)>127 || utf7_special[(c)] == 1) || \
	 (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n):     base64 alphabet character for the low 6 bits of n.
   B64CHAR(c): true if c is a character of the base64 alphabet
               (alphanumeric as reported by isalnum(), '+' or '/').
   UB64(c):    inverse of B64 -- the 6 bit value of base64 character
               c: '+' is 62, '/' is 63, 'a'.. map to c - 71, 'A'.. to
               c - 65 and the remaining characters (the digits, when c
               passed B64CHAR) to c + 4. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least 6 bits are pending in the
   accumulator ch, write the base64 character for the topmost 6
   pending bits to *out, advance out and reduce bits by 6. out and
   bits are modified in place; ch is left unchanged. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are
   pending in the accumulator ch, extract the topmost 16 pending bits
   as one Py_UNICODE, reduce bits by 16 and either store the character
   at *out or report a surrogate error. The expansion assigns to a
   variable named errmsg and jumps to a label named utf7Error, so both
   must exist in the function using the macro. After a surrogate error
   the next 16 bit unit is dropped silently (surrogate flag).
   NOTE(review): the range tested, 0xDC00-0xDFFF, is the low surrogate
   range although the comments speak of the high surrogate -- confirm
   which unit of a pair is meant to trigger the error. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
		if (surrogate) { \
			/* We have already generated an error for the high surrogate
               so let's not bother seeing if the low surrogate is correct or not */\
			surrogate = 0; \
		} else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
			surrogate = 1; \
            errmsg = "code pairs are not supported"; \
	        goto utf7Error; \
		} else { \
				*out++ = outCh; \
		} \
    } \
704
705static
706int utf7_decoding_error(Py_UNICODE **dest,
707                        const char *errors,
708                        const char *details)
709{
710    if ((errors == NULL) ||
711        (strcmp(errors,"strict") == 0)) {
712        PyErr_Format(PyExc_UnicodeError,
713                     "UTF-7 decoding error: %.400s",
714                     details);
715        return -1;
716    }
717    else if (strcmp(errors,"ignore") == 0) {
718        return 0;
719    }
720    else if (strcmp(errors,"replace") == 0) {
721        if (dest != NULL) {
722            **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
723            (*dest)++;
724        }
725        return 0;
726    }
727    else {
728        PyErr_Format(PyExc_ValueError,
729                     "UTF-7 decoding error; unknown error handling code: %.400s",
730                     errors);
731        return -1;
732    }
733}
734
735PyObject *PyUnicode_DecodeUTF7(const char *s,
736			       int size,
737			       const char *errors)
738{
739    const char *e;
740    PyUnicodeObject *unicode;
741    Py_UNICODE *p;
742    const char *errmsg = "";
743    int inShift = 0;
744    unsigned int bitsleft = 0;
745    unsigned long charsleft = 0;
746	int surrogate = 0;
747
748    unicode = _PyUnicode_New(size);
749    if (!unicode)
750        return NULL;
751    if (size == 0)
752        return (PyObject *)unicode;
753
754    p = unicode->str;
755    e = s + size;
756
757    while (s < e) {
758        Py_UNICODE ch = *s;
759
760        if (inShift) {
761            if ((ch == '-') || !B64CHAR(ch)) {
762                inShift = 0;
763                s++;
764
765                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
766                if (bitsleft >= 6) {
767                    /* The shift sequence has a partial character in it. If
768                       bitsleft < 6 then we could just classify it as padding
769                       but that is not the case here */
770
771                    errmsg = "partial character in shift sequence";
772                    goto utf7Error;
773                }
774                /* According to RFC2152 the remaining bits should be zero. We
775                   choose to signal an error/insert a replacement character
776                   here so indicate the potential of a misencoded character. */
777
778                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
779                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
780                    errmsg = "non-zero padding bits in shift sequence";
781                    goto utf7Error;
782                }
783
784                if (ch == '-') {
785                    if ((s < e) && (*(s) == '-')) {
786                        *p++ = '-';
787                        inShift = 1;
788                    }
789                } else if (SPECIAL(ch,0,0)) {
790                    errmsg = "unexpected special character";
791	                goto utf7Error;
792                } else  {
793                    *p++ = ch;
794                }
795            } else {
796                charsleft = (charsleft << 6) | UB64(ch);
797                bitsleft += 6;
798                s++;
799                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
800            }
801        }
802        else if ( ch == '+' ) {
803            s++;
804            if (s < e && *s == '-') {
805                s++;
806                *p++ = '+';
807            } else
808            {
809                inShift = 1;
810                bitsleft = 0;
811            }
812        }
813        else if (SPECIAL(ch,0,0)) {
814            errmsg = "unexpected special character";
815            s++;
816	        goto utf7Error;
817        }
818        else {
819            *p++ = ch;
820            s++;
821        }
822        continue;
823    utf7Error:
824      if (utf7_decoding_error(&p, errors, errmsg))
825          goto onError;
826    }
827
828    if (inShift) {
829        if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
830            goto onError;
831    }
832
833    if (_PyUnicode_Resize(&unicode, p - unicode->str))
834        goto onError;
835
836    return (PyObject *)unicode;
837
838onError:
839    Py_DECREF(unicode);
840    return NULL;
841}
842
843
844PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
845                   int size,
846                   int encodeSetO,
847                   int encodeWhiteSpace,
848                   const char *errors)
849{
850    PyObject *v;
851    /* It might be possible to tighten this worst case */
852    unsigned int cbAllocated = 5 * size;
853    int inShift = 0;
854    int i = 0;
855    unsigned int bitsleft = 0;
856    unsigned long charsleft = 0;
857    char * out;
858    char * start;
859
860    if (size == 0)
861		return PyString_FromStringAndSize(NULL, 0);
862
863    v = PyString_FromStringAndSize(NULL, cbAllocated);
864    if (v == NULL)
865        return NULL;
866
867    start = out = PyString_AS_STRING(v);
868    for (;i < size; ++i) {
869        Py_UNICODE ch = s[i];
870
871        if (!inShift) {
872			if (ch == '+') {
873				*out++ = '+';
874                *out++ = '-';
875            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
876                charsleft = ch;
877                bitsleft = 16;
878                *out++ = '+';
879				/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
880                inShift = bitsleft > 0;
881			} else {
882				*out++ = (char) ch;
883			}
884		} else {
885            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
886                *out++ = B64(charsleft << (6-bitsleft));
887                charsleft = 0;
888                bitsleft = 0;
889                /* Characters not in the BASE64 set implicitly unshift the sequence
890                   so no '-' is required, except if the character is itself a '-' */
891                if (B64CHAR(ch) || ch == '-') {
892                    *out++ = '-';
893                }
894                inShift = 0;
895                *out++ = (char) ch;
896            } else {
897                bitsleft += 16;
898                charsleft = (charsleft << 16) | ch;
899                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
900
901                /* If the next character is special then we dont' need to terminate
902                   the shift sequence. If the next character is not a BASE64 character
903                   or '-' then the shift sequence will be terminated implicitly and we
904                   don't have to insert a '-'. */
905
906                if (bitsleft == 0) {
907                    if (i + 1 < size) {
908                        Py_UNICODE ch2 = s[i+1];
909
910                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
911
912                        } else if (B64CHAR(ch2) || ch2 == '-') {
913                            *out++ = '-';
914                            inShift = 0;
915                        } else {
916                            inShift = 0;
917                        }
918
919                    }
920                    else {
921                        *out++ = '-';
922                        inShift = 0;
923                    }
924                }
925            }
926        }
927	}
928    if (bitsleft) {
929        *out++= B64(charsleft << (6-bitsleft) );
930        *out++ = '-';
931    }
932
933    if (_PyString_Resize(&v, out - start)) {
934        Py_DECREF(v);
935        return NULL;
936    }
937    return v;
938}
939
940#undef SPECIAL
941#undef B64
942#undef B64CHAR
943#undef UB64
944#undef ENCODE
945#undef DECODE
946
947/* --- UTF-8 Codec -------------------------------------------------------- */
948
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details.

       Indexed by the value of the first byte of a sequence; each row
       below covers 16 consecutive byte values:
         0x00 - 0x7F  -> 1
         0x80 - 0xBF  -> 0
         0xC0 - 0xDF  -> 2
         0xE0 - 0xEF  -> 3
         0xF0 - 0xF7  -> 4
         0xF8 - 0xFB  -> 5
         0xFC - 0xFD  -> 6
         0xFE - 0xFF  -> 0 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
970
971static
972int utf8_decoding_error(const char **source,
973                        Py_UNICODE **dest,
974                        const char *errors,
975                        const char *details)
976{
977    if ((errors == NULL) ||
978        (strcmp(errors,"strict") == 0)) {
979        PyErr_Format(PyExc_UnicodeError,
980                     "UTF-8 decoding error: %.400s",
981                     details);
982        return -1;
983    }
984    else if (strcmp(errors,"ignore") == 0) {
985        (*source)++;
986        return 0;
987    }
988    else if (strcmp(errors,"replace") == 0) {
989        (*source)++;
990        **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
991        (*dest)++;
992        return 0;
993    }
994    else {
995        PyErr_Format(PyExc_ValueError,
996                     "UTF-8 decoding error; unknown error handling code: %.400s",
997                     errors);
998        return -1;
999    }
1000}
1001
1002PyObject *PyUnicode_DecodeUTF8(const char *s,
1003			       int size,
1004			       const char *errors)
1005{
1006    int n;
1007    const char *e;
1008    PyUnicodeObject *unicode;
1009    Py_UNICODE *p;
1010    const char *errmsg = "";
1011
1012    /* Note: size will always be longer than the resulting Unicode
1013       character count */
1014    unicode = _PyUnicode_New(size);
1015    if (!unicode)
1016        return NULL;
1017    if (size == 0)
1018        return (PyObject *)unicode;
1019
1020    /* Unpack UTF-8 encoded data */
1021    p = unicode->str;
1022    e = s + size;
1023
1024    while (s < e) {
1025        Py_UCS4 ch = (unsigned char)*s;
1026
1027        if (ch < 0x80) {
1028            *p++ = (Py_UNICODE)ch;
1029            s++;
1030            continue;
1031        }
1032
1033        n = utf8_code_length[ch];
1034
1035        if (s + n > e) {
1036	    errmsg = "unexpected end of data";
1037	    goto utf8Error;
1038	}
1039
1040        switch (n) {
1041
1042        case 0:
1043            errmsg = "unexpected code byte";
1044	    goto utf8Error;
1045
1046        case 1:
1047            errmsg = "internal error";
1048	    goto utf8Error;
1049
1050        case 2:
1051            if ((s[1] & 0xc0) != 0x80) {
1052                errmsg = "invalid data";
1053		goto utf8Error;
1054	    }
1055            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1056            if (ch < 0x80) {
1057                errmsg = "illegal encoding";
1058		goto utf8Error;
1059	    }
1060	    else
1061		*p++ = (Py_UNICODE)ch;
1062            break;
1063
1064        case 3:
1065            if ((s[1] & 0xc0) != 0x80 ||
1066                (s[2] & 0xc0) != 0x80) {
1067                errmsg = "invalid data";
1068		goto utf8Error;
1069	    }
1070            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1071            if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1072                errmsg = "illegal encoding";
1073		goto utf8Error;
1074	    }
1075	    else
1076				*p++ = (Py_UNICODE)ch;
1077            break;
1078
1079        case 4:
1080            if ((s[1] & 0xc0) != 0x80 ||
1081                (s[2] & 0xc0) != 0x80 ||
1082                (s[3] & 0xc0) != 0x80) {
1083                errmsg = "invalid data";
1084		goto utf8Error;
1085	    }
1086            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1087                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1088            /* validate and convert to UTF-16 */
1089            if ((ch < 0x10000)        /* minimum value allowed for 4
1090                                       byte encoding */
1091                || (ch > 0x10ffff))   /* maximum value allowed for
1092                                       UTF-16 */
1093	    {
1094                errmsg = "illegal encoding";
1095		goto utf8Error;
1096	    }
1097#ifdef Py_UNICODE_WIDE
1098	    *p++ = (Py_UNICODE)ch;
1099#else
1100            /*  compute and append the two surrogates: */
1101
1102            /*  translate from 10000..10FFFF to 0..FFFF */
1103            ch -= 0x10000;
1104
1105            /*  high surrogate = top 10 bits added to D800 */
1106            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1107
1108            /*  low surrogate = bottom 10 bits added to DC00 */
1109            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1110#endif
1111            break;
1112
1113        default:
1114            /* Other sizes are only needed for UCS-4 */
1115            errmsg = "unsupported Unicode code range";
1116	    goto utf8Error;
1117        }
1118        s += n;
1119	continue;
1120
1121    utf8Error:
1122      if (utf8_decoding_error(&s, &p, errors, errmsg))
1123          goto onError;
1124    }
1125
1126    /* Adjust length */
1127    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1128        goto onError;
1129
1130    return (PyObject *)unicode;
1131
1132onError:
1133    Py_DECREF(unicode);
1134    return NULL;
1135}
1136
/* Disabled: the UTF-8 encoder handles UTF-16 surrogates itself and no
   longer calls this handler. */
#if 0
/* Apply the error policy named by errors to a UTF-8 encoding problem.
   Returns 0 when encoding may continue and -1 with an exception set
   otherwise.  The source argument is not used. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    /* "strict" is the default policy. */
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        /* Emit a question mark and advance the output cursor. */
        **dest = '?';
        (*dest)++;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
1170
1171PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1172			       int size,
1173			       const char *errors)
1174{
1175    PyObject *v;
1176    char *p;
1177    char *q;
1178    Py_UCS4 ch2;
1179    unsigned int cbAllocated = 3 * size;
1180    unsigned int cbWritten = 0;
1181    int i = 0;
1182
1183    v = PyString_FromStringAndSize(NULL, cbAllocated);
1184    if (v == NULL)
1185        return NULL;
1186    if (size == 0)
1187        return v;
1188
1189    p = q = PyString_AS_STRING(v);
1190    while (i < size) {
1191        Py_UCS4 ch = s[i++];
1192        if (ch < 0x80) {
1193            *p++ = (char) ch;
1194            cbWritten++;
1195        }
1196        else if (ch < 0x0800) {
1197            *p++ = 0xc0 | (ch >> 6);
1198            *p++ = 0x80 | (ch & 0x3f);
1199            cbWritten += 2;
1200        }
1201        else if (ch < 0x10000) {
1202            /* Check for high surrogate */
1203            if (0xD800 <= ch && ch <= 0xDBFF) {
1204                if (i != size) {
1205                    ch2 = s[i];
1206                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1207
1208                        if (cbWritten >= (cbAllocated - 4)) {
1209			    /* Provide enough room for some more
1210			       surrogates */
1211			    cbAllocated += 4*10;
1212                            if (_PyString_Resize(&v, cbAllocated))
1213				goto onError;
1214                        }
1215
1216                        /* combine the two values */
1217                        ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1218
1219                        *p++ = (char)((ch >> 18) | 0xf0);
1220                        *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1221                        i++;
1222                        cbWritten += 4;
1223                    }
1224                }
1225            }
1226            else {
1227                *p++ = (char)(0xe0 | (ch >> 12));
1228                cbWritten += 3;
1229            }
1230            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1231            *p++ = (char)(0x80 | (ch & 0x3f));
1232        } else {
1233            *p++ = 0xf0 | (ch>>18);
1234            *p++ = 0x80 | ((ch>>12) & 0x3f);
1235            *p++ = 0x80 | ((ch>>6) & 0x3f);
1236            *p++ = 0x80 | (ch & 0x3f);
1237            cbWritten += 4;
1238	}
1239    }
1240    *p = '\0';
1241    if (_PyString_Resize(&v, p - q))
1242	goto onError;
1243    return v;
1244
1245 onError:
1246    Py_DECREF(v);
1247    return NULL;
1248}
1249
1250PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1251{
1252    if (!PyUnicode_Check(unicode)) {
1253        PyErr_BadArgument();
1254        return NULL;
1255    }
1256    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1257				PyUnicode_GET_SIZE(unicode),
1258				NULL);
1259}
1260
1261/* --- UTF-16 Codec ------------------------------------------------------- */
1262
1263static
1264int utf16_decoding_error(Py_UNICODE **dest,
1265			 const char *errors,
1266			 const char *details)
1267{
1268    if ((errors == NULL) ||
1269        (strcmp(errors,"strict") == 0)) {
1270        PyErr_Format(PyExc_UnicodeError,
1271                     "UTF-16 decoding error: %.400s",
1272                     details);
1273        return -1;
1274    }
1275    else if (strcmp(errors,"ignore") == 0) {
1276        return 0;
1277    }
1278    else if (strcmp(errors,"replace") == 0) {
1279	if (dest) {
1280	    **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1281	    (*dest)++;
1282	}
1283        return 0;
1284    }
1285    else {
1286        PyErr_Format(PyExc_ValueError,
1287                     "UTF-16 decoding error; "
1288		     "unknown error handling code: %.400s",
1289                     errors);
1290        return -1;
1291    }
1292}
1293
1294PyObject *
1295PyUnicode_DecodeUTF16(const char *s,
1296		      int size,
1297		      const char *errors,
1298		      int *byteorder)
1299{
1300    PyUnicodeObject *unicode;
1301    Py_UNICODE *p;
1302    const unsigned char *q, *e;
1303    int bo = 0;       /* assume native ordering by default */
1304    const char *errmsg = "";
1305    /* Offsets from q for retrieving byte pairs in the right order. */
1306#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1307    int ihi = 1, ilo = 0;
1308#else
1309    int ihi = 0, ilo = 1;
1310#endif
1311
1312    /* size should be an even number */
1313    if (size & 1) {
1314        if (utf16_decoding_error(NULL, errors, "truncated data"))
1315            return NULL;
1316        --size;  /* else ignore the oddball byte */
1317    }
1318
1319    /* Note: size will always be longer than the resulting Unicode
1320       character count */
1321    unicode = _PyUnicode_New(size);
1322    if (!unicode)
1323        return NULL;
1324    if (size == 0)
1325        return (PyObject *)unicode;
1326
1327    /* Unpack UTF-16 encoded data */
1328    p = unicode->str;
1329    q = (unsigned char *)s;
1330    e = q + size;
1331
1332    if (byteorder)
1333        bo = *byteorder;
1334
1335    /* Check for BOM marks (U+FEFF) in the input and adjust current
1336       byte order setting accordingly. In native mode, the leading BOM
1337       mark is skipped, in all other modes, it is copied to the output
1338       stream as-is (giving a ZWNBSP character). */
1339    if (bo == 0) {
1340        const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1341#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1342	if (bom == 0xFEFF) {
1343	    q += 2;
1344	    bo = -1;
1345	}
1346        else if (bom == 0xFFFE) {
1347	    q += 2;
1348	    bo = 1;
1349	}
1350#else
1351	if (bom == 0xFEFF) {
1352	    q += 2;
1353	    bo = 1;
1354	}
1355        else if (bom == 0xFFFE) {
1356	    q += 2;
1357	    bo = -1;
1358	}
1359#endif
1360    }
1361
1362    if (bo == -1) {
1363        /* force LE */
1364        ihi = 1;
1365        ilo = 0;
1366    }
1367    else if (bo == 1) {
1368        /* force BE */
1369        ihi = 0;
1370        ilo = 1;
1371    }
1372
1373    while (q < e) {
1374	Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1375	q += 2;
1376
1377	if (ch < 0xD800 || ch > 0xDFFF) {
1378	    *p++ = ch;
1379	    continue;
1380	}
1381
1382	/* UTF-16 code pair: */
1383	if (q >= e) {
1384	    errmsg = "unexpected end of data";
1385	    goto utf16Error;
1386	}
1387	if (0xD800 <= ch && ch <= 0xDBFF) {
1388	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1389	    q += 2;
1390	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1391#ifndef Py_UNICODE_WIDE
1392		*p++ = ch;
1393		*p++ = ch2;
1394#else
1395		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1396#endif
1397		continue;
1398	    }
1399	    else {
1400                errmsg = "illegal UTF-16 surrogate";
1401		goto utf16Error;
1402	    }
1403
1404	}
1405	errmsg = "illegal encoding";
1406	/* Fall through to report the error */
1407
1408    utf16Error:
1409	if (utf16_decoding_error(&p, errors, errmsg))
1410	    goto onError;
1411    }
1412
1413    if (byteorder)
1414        *byteorder = bo;
1415
1416    /* Adjust length */
1417    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1418        goto onError;
1419
1420    return (PyObject *)unicode;
1421
1422onError:
1423    Py_DECREF(unicode);
1424    return NULL;
1425}
1426
1427PyObject *
1428PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1429		      int size,
1430		      const char *errors,
1431		      int byteorder)
1432{
1433    PyObject *v;
1434    unsigned char *p;
1435    int i, pairs;
1436    /* Offsets from p for storing byte pairs in the right order. */
1437#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1438    int ihi = 1, ilo = 0;
1439#else
1440    int ihi = 0, ilo = 1;
1441#endif
1442
1443#define STORECHAR(CH)                   \
1444    do {                                \
1445        p[ihi] = ((CH) >> 8) & 0xff;    \
1446        p[ilo] = (CH) & 0xff;           \
1447        p += 2;                         \
1448    } while(0)
1449
1450    for (i = pairs = 0; i < size; i++)
1451	if (s[i] >= 0x10000)
1452	    pairs++;
1453    v = PyString_FromStringAndSize(NULL,
1454		  2 * (size + pairs + (byteorder == 0)));
1455    if (v == NULL)
1456        return NULL;
1457
1458    p = (unsigned char *)PyString_AS_STRING(v);
1459    if (byteorder == 0)
1460	STORECHAR(0xFEFF);
1461    if (size == 0)
1462        return v;
1463
1464    if (byteorder == -1) {
1465        /* force LE */
1466        ihi = 1;
1467        ilo = 0;
1468    }
1469    else if (byteorder == 1) {
1470        /* force BE */
1471        ihi = 0;
1472        ilo = 1;
1473    }
1474
1475    while (size-- > 0) {
1476	Py_UNICODE ch = *s++;
1477	Py_UNICODE ch2 = 0;
1478	if (ch >= 0x10000) {
1479	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1480	    ch  = 0xD800 | ((ch-0x10000) >> 10);
1481	}
1482        STORECHAR(ch);
1483        if (ch2)
1484            STORECHAR(ch2);
1485    }
1486    return v;
1487#undef STORECHAR
1488}
1489
1490PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1491{
1492    if (!PyUnicode_Check(unicode)) {
1493        PyErr_BadArgument();
1494        return NULL;
1495    }
1496    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1497				 PyUnicode_GET_SIZE(unicode),
1498				 NULL,
1499				 0);
1500}
1501
1502/* --- Unicode Escape Codec ----------------------------------------------- */
1503
1504static
1505int unicodeescape_decoding_error(const char **source,
1506                                 Py_UNICODE *x,
1507                                 const char *errors,
1508                                 const char *details)
1509{
1510    if ((errors == NULL) ||
1511        (strcmp(errors,"strict") == 0)) {
1512        PyErr_Format(PyExc_UnicodeError,
1513                     "Unicode-Escape decoding error: %.400s",
1514                     details);
1515        return -1;
1516    }
1517    else if (strcmp(errors,"ignore") == 0) {
1518        return 0;
1519    }
1520    else if (strcmp(errors,"replace") == 0) {
1521        *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1522        return 0;
1523    }
1524    else {
1525        PyErr_Format(PyExc_ValueError,
1526                     "Unicode-Escape decoding error; "
1527                     "unknown error handling code: %.400s",
1528                     errors);
1529        return -1;
1530    }
1531}
1532
1533static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1534
1535PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1536					int size,
1537					const char *errors)
1538{
1539    PyUnicodeObject *v;
1540    Py_UNICODE *p, *buf;
1541    const char *end;
1542    char* message;
1543    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1544
1545    /* Escaped strings will always be longer than the resulting
1546       Unicode string, so we start with size here and then reduce the
1547       length after conversion to the true value. */
1548    v = _PyUnicode_New(size);
1549    if (v == NULL)
1550        goto onError;
1551    if (size == 0)
1552        return (PyObject *)v;
1553
1554    p = buf = PyUnicode_AS_UNICODE(v);
1555    end = s + size;
1556
1557    while (s < end) {
1558        unsigned char c;
1559        Py_UNICODE x;
1560        int i, digits;
1561
1562        /* Non-escape characters are interpreted as Unicode ordinals */
1563        if (*s != '\\') {
1564            *p++ = (unsigned char) *s++;
1565            continue;
1566        }
1567
1568        /* \ - Escapes */
1569        s++;
1570        switch (*s++) {
1571
1572        /* \x escapes */
1573        case '\n': break;
1574        case '\\': *p++ = '\\'; break;
1575        case '\'': *p++ = '\''; break;
1576        case '\"': *p++ = '\"'; break;
1577        case 'b': *p++ = '\b'; break;
1578        case 'f': *p++ = '\014'; break; /* FF */
1579        case 't': *p++ = '\t'; break;
1580        case 'n': *p++ = '\n'; break;
1581        case 'r': *p++ = '\r'; break;
1582        case 'v': *p++ = '\013'; break; /* VT */
1583        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1584
1585        /* \OOO (octal) escapes */
1586        case '0': case '1': case '2': case '3':
1587        case '4': case '5': case '6': case '7':
1588            x = s[-1] - '0';
1589            if ('0' <= *s && *s <= '7') {
1590                x = (x<<3) + *s++ - '0';
1591                if ('0' <= *s && *s <= '7')
1592                    x = (x<<3) + *s++ - '0';
1593            }
1594            *p++ = x;
1595            break;
1596
1597        /* hex escapes */
1598        /* \xXX */
1599        case 'x':
1600            digits = 2;
1601            message = "truncated \\xXX escape";
1602            goto hexescape;
1603
1604        /* \uXXXX */
1605        case 'u':
1606            digits = 4;
1607            message = "truncated \\uXXXX escape";
1608            goto hexescape;
1609
1610        /* \UXXXXXXXX */
1611        case 'U':
1612            digits = 8;
1613            message = "truncated \\UXXXXXXXX escape";
1614        hexescape:
1615            chr = 0;
1616            for (i = 0; i < digits; i++) {
1617                c = (unsigned char) s[i];
1618                if (!isxdigit(c)) {
1619                    if (unicodeescape_decoding_error(&s, &x, errors, message))
1620                        goto onError;
1621                    chr = x;
1622                    i++;
1623                    break;
1624                }
1625                chr = (chr<<4) & ~0xF;
1626                if (c >= '0' && c <= '9')
1627                    chr += c - '0';
1628                else if (c >= 'a' && c <= 'f')
1629                    chr += 10 + c - 'a';
1630                else
1631                    chr += 10 + c - 'A';
1632            }
1633            s += i;
1634        store:
1635            /* when we get here, chr is a 32-bit unicode character */
1636            if (chr <= 0xffff)
1637                /* UCS-2 character */
1638                *p++ = (Py_UNICODE) chr;
1639            else if (chr <= 0x10ffff) {
1640                /* UCS-4 character. Either store directly, or as
1641		   surrogate pair. */
1642#ifdef Py_UNICODE_WIDE
1643                *p++ = chr;
1644#else
1645                chr -= 0x10000L;
1646                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1647                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1648#endif
1649            } else {
1650                if (unicodeescape_decoding_error(
1651                    &s, &x, errors,
1652                    "illegal Unicode character")
1653                    )
1654                    goto onError;
1655                *p++ = x; /* store replacement character */
1656            }
1657            break;
1658
1659        /* \N{name} */
1660        case 'N':
1661            message = "malformed \\N character escape";
1662            if (ucnhash_CAPI == NULL) {
1663                /* load the unicode data module */
1664                PyObject *m, *v;
1665                m = PyImport_ImportModule("unicodedata");
1666                if (m == NULL)
1667                    goto ucnhashError;
1668                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1669                Py_DECREF(m);
1670                if (v == NULL)
1671                    goto ucnhashError;
1672                ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1673                Py_DECREF(v);
1674                if (ucnhash_CAPI == NULL)
1675                    goto ucnhashError;
1676            }
1677            if (*s == '{') {
1678                const char *start = s+1;
1679                /* look for the closing brace */
1680                while (*s != '}' && s < end)
1681                    s++;
1682                if (s > start && s < end && *s == '}') {
1683                    /* found a name.  look it up in the unicode database */
1684                    message = "unknown Unicode character name";
1685                    s++;
1686                    if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1687                        goto store;
1688                }
1689            }
1690            if (unicodeescape_decoding_error(&s, &x, errors, message))
1691                goto onError;
1692            *p++ = x;
1693            break;
1694
1695        default:
1696            *p++ = '\\';
1697            *p++ = (unsigned char)s[-1];
1698            break;
1699        }
1700    }
1701    if (_PyUnicode_Resize(&v, (int)(p - buf)))
1702		goto onError;
1703    return (PyObject *)v;
1704
1705ucnhashError:
1706    PyErr_SetString(
1707        PyExc_UnicodeError,
1708        "\\N escapes not supported (can't load unicodedata module)"
1709        );
1710    return NULL;
1711
1712onError:
1713    Py_XDECREF(v);
1714    return NULL;
1715}
1716
1717/* Return a Unicode-Escape string version of the Unicode object.
1718
1719   If quotes is true, the string is enclosed in u"" or u'' quotes as
1720   appropriate.
1721
1722*/
1723
1724static const Py_UNICODE *findchar(const Py_UNICODE *s,
1725				  int size,
1726				  Py_UNICODE ch);
1727
1728static
1729PyObject *unicodeescape_string(const Py_UNICODE *s,
1730                               int size,
1731                               int quotes)
1732{
1733    PyObject *repr;
1734    char *p;
1735
1736    static const char *hexdigit = "0123456789abcdef";
1737
1738    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1739    if (repr == NULL)
1740        return NULL;
1741
1742    p = PyString_AS_STRING(repr);
1743
1744    if (quotes) {
1745        *p++ = 'u';
1746        *p++ = (findchar(s, size, '\'') &&
1747                !findchar(s, size, '"')) ? '"' : '\'';
1748    }
1749    while (size-- > 0) {
1750        Py_UNICODE ch = *s++;
1751
1752        /* Escape quotes */
1753        if (quotes &&
1754	    (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1755            *p++ = '\\';
1756            *p++ = (char) ch;
1757	    continue;
1758        }
1759
1760#ifdef Py_UNICODE_WIDE
1761        /* Map 21-bit characters to '\U00xxxxxx' */
1762        else if (ch >= 0x10000) {
1763	    int offset = p - PyString_AS_STRING(repr);
1764
1765	    /* Resize the string if necessary */
1766	    if (offset + 12 > PyString_GET_SIZE(repr)) {
1767		if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1768		    goto onError;
1769		p = PyString_AS_STRING(repr) + offset;
1770	    }
1771
1772            *p++ = '\\';
1773            *p++ = 'U';
1774            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1775            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1776            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1777            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1778            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1779            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1780            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1781            *p++ = hexdigit[ch & 0x0000000F];
1782	    continue;
1783        }
1784#endif
1785	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1786	else if (ch >= 0xD800 && ch < 0xDC00) {
1787	    Py_UNICODE ch2;
1788	    Py_UCS4 ucs;
1789
1790	    ch2 = *s++;
1791	    size--;
1792	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1793		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1794		*p++ = '\\';
1795		*p++ = 'U';
1796		*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1797		*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1798		*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1799		*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1800		*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1801		*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1802		*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1803		*p++ = hexdigit[ucs & 0x0000000F];
1804		continue;
1805	    }
1806	    /* Fall through: isolated surrogates are copied as-is */
1807	    s--;
1808	    size++;
1809	}
1810
1811        /* Map 16-bit characters to '\uxxxx' */
1812        if (ch >= 256) {
1813            *p++ = '\\';
1814            *p++ = 'u';
1815            *p++ = hexdigit[(ch >> 12) & 0x000F];
1816            *p++ = hexdigit[(ch >> 8) & 0x000F];
1817            *p++ = hexdigit[(ch >> 4) & 0x000F];
1818            *p++ = hexdigit[ch & 0x000F];
1819        }
1820
1821        /* Map special whitespace to '\t', \n', '\r' */
1822        else if (ch == '\t') {
1823            *p++ = '\\';
1824            *p++ = 't';
1825        }
1826        else if (ch == '\n') {
1827            *p++ = '\\';
1828            *p++ = 'n';
1829        }
1830        else if (ch == '\r') {
1831            *p++ = '\\';
1832            *p++ = 'r';
1833        }
1834
1835        /* Map non-printable US ASCII to '\xhh' */
1836        else if (ch < ' ' || ch >= 128) {
1837            *p++ = '\\';
1838            *p++ = 'x';
1839            *p++ = hexdigit[(ch >> 4) & 0x000F];
1840            *p++ = hexdigit[ch & 0x000F];
1841        }
1842
1843        /* Copy everything else as-is */
1844        else
1845            *p++ = (char) ch;
1846    }
1847    if (quotes)
1848        *p++ = PyString_AS_STRING(repr)[1];
1849
1850    *p = '\0';
1851    if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
1852	goto onError;
1853
1854    return repr;
1855
1856 onError:
1857    Py_DECREF(repr);
1858    return NULL;
1859}
1860
1861PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1862					int size)
1863{
1864    return unicodeescape_string(s, size, 0);
1865}
1866
1867PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1868{
1869    if (!PyUnicode_Check(unicode)) {
1870        PyErr_BadArgument();
1871        return NULL;
1872    }
1873    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1874					 PyUnicode_GET_SIZE(unicode));
1875}
1876
1877/* --- Raw Unicode Escape Codec ------------------------------------------- */
1878
1879PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1880					   int size,
1881					   const char *errors)
1882{
1883    PyUnicodeObject *v;
1884    Py_UNICODE *p, *buf;
1885    const char *end;
1886    const char *bs;
1887
1888    /* Escaped strings will always be longer than the resulting
1889       Unicode string, so we start with size here and then reduce the
1890       length after conversion to the true value. */
1891    v = _PyUnicode_New(size);
1892    if (v == NULL)
1893	goto onError;
1894    if (size == 0)
1895	return (PyObject *)v;
1896    p = buf = PyUnicode_AS_UNICODE(v);
1897    end = s + size;
1898    while (s < end) {
1899	unsigned char c;
1900	Py_UNICODE x;
1901	int i;
1902
1903	/* Non-escape characters are interpreted as Unicode ordinals */
1904	if (*s != '\\') {
1905	    *p++ = (unsigned char)*s++;
1906	    continue;
1907	}
1908
1909	/* \u-escapes are only interpreted iff the number of leading
1910	   backslashes if odd */
1911	bs = s;
1912	for (;s < end;) {
1913	    if (*s != '\\')
1914		break;
1915	    *p++ = (unsigned char)*s++;
1916	}
1917	if (((s - bs) & 1) == 0 ||
1918	    s >= end ||
1919	    *s != 'u') {
1920	    continue;
1921	}
1922	p--;
1923	s++;
1924
1925	/* \uXXXX with 4 hex digits */
1926	for (x = 0, i = 0; i < 4; i++) {
1927	    c = (unsigned char)s[i];
1928	    if (!isxdigit(c)) {
1929		if (unicodeescape_decoding_error(&s, &x, errors,
1930						 "truncated \\uXXXX"))
1931		    goto onError;
1932		i++;
1933		break;
1934	    }
1935	    x = (x<<4) & ~0xF;
1936	    if (c >= '0' && c <= '9')
1937		x += c - '0';
1938	    else if (c >= 'a' && c <= 'f')
1939		x += 10 + c - 'a';
1940	    else
1941		x += 10 + c - 'A';
1942	}
1943	s += i;
1944	*p++ = x;
1945    }
1946    if (_PyUnicode_Resize(&v, (int)(p - buf)))
1947	goto onError;
1948    return (PyObject *)v;
1949
1950 onError:
1951    Py_XDECREF(v);
1952    return NULL;
1953}
1954
1955PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1956					   int size)
1957{
1958    PyObject *repr;
1959    char *p;
1960    char *q;
1961
1962    static const char *hexdigit = "0123456789abcdef";
1963
1964    repr = PyString_FromStringAndSize(NULL, 6 * size);
1965    if (repr == NULL)
1966        return NULL;
1967    if (size == 0)
1968	return repr;
1969
1970    p = q = PyString_AS_STRING(repr);
1971    while (size-- > 0) {
1972        Py_UNICODE ch = *s++;
1973	/* Map 16-bit characters to '\uxxxx' */
1974	if (ch >= 256) {
1975            *p++ = '\\';
1976            *p++ = 'u';
1977            *p++ = hexdigit[(ch >> 12) & 0xf];
1978            *p++ = hexdigit[(ch >> 8) & 0xf];
1979            *p++ = hexdigit[(ch >> 4) & 0xf];
1980            *p++ = hexdigit[ch & 15];
1981        }
1982	/* Copy everything else as-is */
1983	else
1984            *p++ = (char) ch;
1985    }
1986    *p = '\0';
1987    if (_PyString_Resize(&repr, p - q))
1988	goto onError;
1989
1990    return repr;
1991
1992 onError:
1993    Py_DECREF(repr);
1994    return NULL;
1995}
1996
1997PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1998{
1999    if (!PyUnicode_Check(unicode)) {
2000	PyErr_BadArgument();
2001	return NULL;
2002    }
2003    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2004					    PyUnicode_GET_SIZE(unicode));
2005}
2006
2007/* --- Latin-1 Codec ------------------------------------------------------ */
2008
2009PyObject *PyUnicode_DecodeLatin1(const char *s,
2010				 int size,
2011				 const char *errors)
2012{
2013    PyUnicodeObject *v;
2014    Py_UNICODE *p;
2015
2016    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2017    if (size == 1 && *(unsigned char*)s < 256) {
2018	Py_UNICODE r = *(unsigned char*)s;
2019	return PyUnicode_FromUnicode(&r, 1);
2020    }
2021
2022    v = _PyUnicode_New(size);
2023    if (v == NULL)
2024	goto onError;
2025    if (size == 0)
2026	return (PyObject *)v;
2027    p = PyUnicode_AS_UNICODE(v);
2028    while (size-- > 0)
2029	*p++ = (unsigned char)*s++;
2030    return (PyObject *)v;
2031
2032 onError:
2033    Py_XDECREF(v);
2034    return NULL;
2035}
2036
2037static
2038int latin1_encoding_error(const Py_UNICODE **source,
2039			  char **dest,
2040			  const char *errors,
2041			  const char *details)
2042{
2043    if ((errors == NULL) ||
2044	(strcmp(errors,"strict") == 0)) {
2045	PyErr_Format(PyExc_UnicodeError,
2046		     "Latin-1 encoding error: %.400s",
2047		     details);
2048	return -1;
2049    }
2050    else if (strcmp(errors,"ignore") == 0) {
2051	return 0;
2052    }
2053    else if (strcmp(errors,"replace") == 0) {
2054	**dest = '?';
2055	(*dest)++;
2056	return 0;
2057    }
2058    else {
2059	PyErr_Format(PyExc_ValueError,
2060		     "Latin-1 encoding error; "
2061		     "unknown error handling code: %.400s",
2062		     errors);
2063	return -1;
2064    }
2065}
2066
2067PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2068				 int size,
2069				 const char *errors)
2070{
2071    PyObject *repr;
2072    char *s, *start;
2073
2074    repr = PyString_FromStringAndSize(NULL, size);
2075    if (repr == NULL)
2076        return NULL;
2077    if (size == 0)
2078	return repr;
2079
2080    s = PyString_AS_STRING(repr);
2081    start = s;
2082    while (size-- > 0) {
2083        Py_UNICODE ch = *p++;
2084	if (ch >= 256) {
2085	    if (latin1_encoding_error(&p, &s, errors,
2086				      "ordinal not in range(256)"))
2087		goto onError;
2088	}
2089	else
2090            *s++ = (char)ch;
2091    }
2092    /* Resize if error handling skipped some characters */
2093    if (s - start < PyString_GET_SIZE(repr))
2094	if (_PyString_Resize(&repr, s - start))
2095	    goto onError;
2096    return repr;
2097
2098 onError:
2099    Py_DECREF(repr);
2100    return NULL;
2101}
2102
2103PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2104{
2105    if (!PyUnicode_Check(unicode)) {
2106	PyErr_BadArgument();
2107	return NULL;
2108    }
2109    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2110				  PyUnicode_GET_SIZE(unicode),
2111				  NULL);
2112}
2113
2114/* --- 7-bit ASCII Codec -------------------------------------------------- */
2115
2116static
2117int ascii_decoding_error(const char **source,
2118			 Py_UNICODE **dest,
2119			 const char *errors,
2120			 const char *details)
2121{
2122    if ((errors == NULL) ||
2123	(strcmp(errors,"strict") == 0)) {
2124	PyErr_Format(PyExc_UnicodeError,
2125		     "ASCII decoding error: %.400s",
2126		     details);
2127	return -1;
2128    }
2129    else if (strcmp(errors,"ignore") == 0) {
2130	return 0;
2131    }
2132    else if (strcmp(errors,"replace") == 0) {
2133	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2134	(*dest)++;
2135	return 0;
2136    }
2137    else {
2138	PyErr_Format(PyExc_ValueError,
2139		     "ASCII decoding error; "
2140		     "unknown error handling code: %.400s",
2141		     errors);
2142	return -1;
2143    }
2144}
2145
2146PyObject *PyUnicode_DecodeASCII(const char *s,
2147				int size,
2148				const char *errors)
2149{
2150    PyUnicodeObject *v;
2151    Py_UNICODE *p;
2152
2153    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2154    if (size == 1 && *(unsigned char*)s < 128) {
2155	Py_UNICODE r = *(unsigned char*)s;
2156	return PyUnicode_FromUnicode(&r, 1);
2157    }
2158
2159    v = _PyUnicode_New(size);
2160    if (v == NULL)
2161	goto onError;
2162    if (size == 0)
2163	return (PyObject *)v;
2164    p = PyUnicode_AS_UNICODE(v);
2165    while (size-- > 0) {
2166	register unsigned char c;
2167
2168	c = (unsigned char)*s++;
2169	if (c < 128)
2170	    *p++ = c;
2171	else if (ascii_decoding_error(&s, &p, errors,
2172				      "ordinal not in range(128)"))
2173		goto onError;
2174    }
2175    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2176	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2177	    goto onError;
2178    return (PyObject *)v;
2179
2180 onError:
2181    Py_XDECREF(v);
2182    return NULL;
2183}
2184
2185static
2186int ascii_encoding_error(const Py_UNICODE **source,
2187			 char **dest,
2188			 const char *errors,
2189			 const char *details)
2190{
2191    if ((errors == NULL) ||
2192	(strcmp(errors,"strict") == 0)) {
2193	PyErr_Format(PyExc_UnicodeError,
2194		     "ASCII encoding error: %.400s",
2195		     details);
2196	return -1;
2197    }
2198    else if (strcmp(errors,"ignore") == 0) {
2199	return 0;
2200    }
2201    else if (strcmp(errors,"replace") == 0) {
2202	**dest = '?';
2203	(*dest)++;
2204	return 0;
2205    }
2206    else {
2207	PyErr_Format(PyExc_ValueError,
2208		     "ASCII encoding error; "
2209		     "unknown error handling code: %.400s",
2210		     errors);
2211	return -1;
2212    }
2213}
2214
2215PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2216				int size,
2217				const char *errors)
2218{
2219    PyObject *repr;
2220    char *s, *start;
2221
2222    repr = PyString_FromStringAndSize(NULL, size);
2223    if (repr == NULL)
2224        return NULL;
2225    if (size == 0)
2226	return repr;
2227
2228    s = PyString_AS_STRING(repr);
2229    start = s;
2230    while (size-- > 0) {
2231        Py_UNICODE ch = *p++;
2232	if (ch >= 128) {
2233	    if (ascii_encoding_error(&p, &s, errors,
2234				      "ordinal not in range(128)"))
2235		goto onError;
2236	}
2237	else
2238            *s++ = (char)ch;
2239    }
2240    /* Resize if error handling skipped some characters */
2241    if (s - start < PyString_GET_SIZE(repr))
2242	if (_PyString_Resize(&repr, s - start))
2243	    goto onError;
2244    return repr;
2245
2246 onError:
2247    Py_DECREF(repr);
2248    return NULL;
2249}
2250
2251PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2252{
2253    if (!PyUnicode_Check(unicode)) {
2254	PyErr_BadArgument();
2255	return NULL;
2256    }
2257    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2258				 PyUnicode_GET_SIZE(unicode),
2259				 NULL);
2260}
2261
2262#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
2263
2264/* --- MBCS codecs for Windows -------------------------------------------- */
2265
2266PyObject *PyUnicode_DecodeMBCS(const char *s,
2267				int size,
2268				const char *errors)
2269{
2270    PyUnicodeObject *v;
2271    Py_UNICODE *p;
2272
2273    /* First get the size of the result */
2274    DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2275    if (size > 0 && usize==0)
2276        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2277
2278    v = _PyUnicode_New(usize);
2279    if (v == NULL)
2280        return NULL;
2281    if (usize == 0)
2282	return (PyObject *)v;
2283    p = PyUnicode_AS_UNICODE(v);
2284    if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2285        Py_DECREF(v);
2286        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2287    }
2288
2289    return (PyObject *)v;
2290}
2291
2292PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2293				int size,
2294				const char *errors)
2295{
2296    PyObject *repr;
2297    char *s;
2298    DWORD mbcssize;
2299
2300    /* If there are no characters, bail now! */
2301    if (size==0)
2302	    return PyString_FromString("");
2303
2304    /* First get the size of the result */
2305    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2306    if (mbcssize==0)
2307        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2308
2309    repr = PyString_FromStringAndSize(NULL, mbcssize);
2310    if (repr == NULL)
2311        return NULL;
2312    if (mbcssize == 0)
2313        return repr;
2314
2315    /* Do the conversion */
2316    s = PyString_AS_STRING(repr);
2317    if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2318        Py_DECREF(repr);
2319        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2320    }
2321    return repr;
2322}
2323
2324#endif /* MS_WIN32 */
2325
2326/* --- Character Mapping Codec -------------------------------------------- */
2327
2328static
2329int charmap_decoding_error(const char **source,
2330			 Py_UNICODE **dest,
2331			 const char *errors,
2332			 const char *details)
2333{
2334    if ((errors == NULL) ||
2335	(strcmp(errors,"strict") == 0)) {
2336	PyErr_Format(PyExc_UnicodeError,
2337		     "charmap decoding error: %.400s",
2338		     details);
2339	return -1;
2340    }
2341    else if (strcmp(errors,"ignore") == 0) {
2342	return 0;
2343    }
2344    else if (strcmp(errors,"replace") == 0) {
2345	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2346	(*dest)++;
2347	return 0;
2348    }
2349    else {
2350	PyErr_Format(PyExc_ValueError,
2351		     "charmap decoding error; "
2352		     "unknown error handling code: %.400s",
2353		     errors);
2354	return -1;
2355    }
2356}
2357
2358PyObject *PyUnicode_DecodeCharmap(const char *s,
2359				  int size,
2360				  PyObject *mapping,
2361				  const char *errors)
2362{
2363    PyUnicodeObject *v;
2364    Py_UNICODE *p;
2365    int extrachars = 0;
2366
2367    /* Default to Latin-1 */
2368    if (mapping == NULL)
2369	return PyUnicode_DecodeLatin1(s, size, errors);
2370
2371    v = _PyUnicode_New(size);
2372    if (v == NULL)
2373	goto onError;
2374    if (size == 0)
2375	return (PyObject *)v;
2376    p = PyUnicode_AS_UNICODE(v);
2377    while (size-- > 0) {
2378	unsigned char ch = *s++;
2379	PyObject *w, *x;
2380
2381	/* Get mapping (char ordinal -> integer, Unicode char or None) */
2382	w = PyInt_FromLong((long)ch);
2383	if (w == NULL)
2384	    goto onError;
2385	x = PyObject_GetItem(mapping, w);
2386	Py_DECREF(w);
2387	if (x == NULL) {
2388	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2389		/* No mapping found means: mapping is undefined. */
2390		PyErr_Clear();
2391		x = Py_None;
2392		Py_INCREF(x);
2393	    } else
2394		goto onError;
2395	}
2396
2397	/* Apply mapping */
2398	if (PyInt_Check(x)) {
2399	    long value = PyInt_AS_LONG(x);
2400	    if (value < 0 || value > 65535) {
2401		PyErr_SetString(PyExc_TypeError,
2402				"character mapping must be in range(65536)");
2403		Py_DECREF(x);
2404		goto onError;
2405	    }
2406	    *p++ = (Py_UNICODE)value;
2407	}
2408	else if (x == Py_None) {
2409	    /* undefined mapping */
2410	    if (charmap_decoding_error(&s, &p, errors,
2411				       "character maps to <undefined>")) {
2412		Py_DECREF(x);
2413		goto onError;
2414	    }
2415	}
2416	else if (PyUnicode_Check(x)) {
2417	    int targetsize = PyUnicode_GET_SIZE(x);
2418
2419	    if (targetsize == 1)
2420		/* 1-1 mapping */
2421		*p++ = *PyUnicode_AS_UNICODE(x);
2422
2423	    else if (targetsize > 1) {
2424		/* 1-n mapping */
2425		if (targetsize > extrachars) {
2426		    /* resize first */
2427		    int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2428		    int needed = (targetsize - extrachars) + \
2429			         (targetsize << 2);
2430		    extrachars += needed;
2431		    if (_PyUnicode_Resize(&v,
2432					 PyUnicode_GET_SIZE(v) + needed)) {
2433			Py_DECREF(x);
2434			goto onError;
2435		    }
2436		    p = PyUnicode_AS_UNICODE(v) + oldpos;
2437		}
2438		Py_UNICODE_COPY(p,
2439				PyUnicode_AS_UNICODE(x),
2440				targetsize);
2441		p += targetsize;
2442		extrachars -= targetsize;
2443	    }
2444	    /* 1-0 mapping: skip the character */
2445	}
2446	else {
2447	    /* wrong return value */
2448	    PyErr_SetString(PyExc_TypeError,
2449		  "character mapping must return integer, None or unicode");
2450	    Py_DECREF(x);
2451	    goto onError;
2452	}
2453	Py_DECREF(x);
2454    }
2455    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2456	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2457	    goto onError;
2458    return (PyObject *)v;
2459
2460 onError:
2461    Py_XDECREF(v);
2462    return NULL;
2463}
2464
2465static
2466int charmap_encoding_error(const Py_UNICODE **source,
2467			   char **dest,
2468			   const char *errors,
2469			   const char *details)
2470{
2471    if ((errors == NULL) ||
2472	(strcmp(errors,"strict") == 0)) {
2473	PyErr_Format(PyExc_UnicodeError,
2474		     "charmap encoding error: %.400s",
2475		     details);
2476	return -1;
2477    }
2478    else if (strcmp(errors,"ignore") == 0) {
2479	return 0;
2480    }
2481    else if (strcmp(errors,"replace") == 0) {
2482	**dest = '?';
2483	(*dest)++;
2484	return 0;
2485    }
2486    else {
2487	PyErr_Format(PyExc_ValueError,
2488		     "charmap encoding error; "
2489		     "unknown error handling code: %.400s",
2490		     errors);
2491	return -1;
2492    }
2493}
2494
2495PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2496				  int size,
2497				  PyObject *mapping,
2498				  const char *errors)
2499{
2500    PyObject *v;
2501    char *s;
2502    int extrachars = 0;
2503
2504    /* Default to Latin-1 */
2505    if (mapping == NULL)
2506	return PyUnicode_EncodeLatin1(p, size, errors);
2507
2508    v = PyString_FromStringAndSize(NULL, size);
2509    if (v == NULL)
2510        return NULL;
2511    if (size == 0)
2512	return v;
2513    s = PyString_AS_STRING(v);
2514    while (size-- > 0) {
2515	Py_UNICODE ch = *p++;
2516	PyObject *w, *x;
2517
2518	/* Get mapping (Unicode ordinal -> string char, integer or None) */
2519	w = PyInt_FromLong((long)ch);
2520	if (w == NULL)
2521	    goto onError;
2522	x = PyObject_GetItem(mapping, w);
2523	Py_DECREF(w);
2524	if (x == NULL) {
2525	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2526		/* No mapping found means: mapping is undefined. */
2527		PyErr_Clear();
2528		x = Py_None;
2529		Py_INCREF(x);
2530	    } else
2531		goto onError;
2532	}
2533
2534	/* Apply mapping */
2535	if (PyInt_Check(x)) {
2536	    long value = PyInt_AS_LONG(x);
2537	    if (value < 0 || value > 255) {
2538		PyErr_SetString(PyExc_TypeError,
2539				"character mapping must be in range(256)");
2540		Py_DECREF(x);
2541		goto onError;
2542	    }
2543	    *s++ = (char)value;
2544	}
2545	else if (x == Py_None) {
2546	    /* undefined mapping */
2547	    if (charmap_encoding_error(&p, &s, errors,
2548				       "character maps to <undefined>")) {
2549		Py_DECREF(x);
2550		goto onError;
2551	    }
2552	}
2553	else if (PyString_Check(x)) {
2554	    int targetsize = PyString_GET_SIZE(x);
2555
2556	    if (targetsize == 1)
2557		/* 1-1 mapping */
2558		*s++ = *PyString_AS_STRING(x);
2559
2560	    else if (targetsize > 1) {
2561		/* 1-n mapping */
2562		if (targetsize > extrachars) {
2563		    /* resize first */
2564		    int oldpos = (int)(s - PyString_AS_STRING(v));
2565		    int needed = (targetsize - extrachars) + \
2566			         (targetsize << 2);
2567		    extrachars += needed;
2568		    if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2569			Py_DECREF(x);
2570			goto onError;
2571		    }
2572		    s = PyString_AS_STRING(v) + oldpos;
2573		}
2574		memcpy(s, PyString_AS_STRING(x), targetsize);
2575		s += targetsize;
2576		extrachars -= targetsize;
2577	    }
2578	    /* 1-0 mapping: skip the character */
2579	}
2580	else {
2581	    /* wrong return value */
2582	    PyErr_SetString(PyExc_TypeError,
2583		  "character mapping must return integer, None or unicode");
2584	    Py_DECREF(x);
2585	    goto onError;
2586	}
2587	Py_DECREF(x);
2588    }
2589    if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2590	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2591	    goto onError;
2592    return v;
2593
2594 onError:
2595    Py_DECREF(v);
2596    return NULL;
2597}
2598
2599PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2600				    PyObject *mapping)
2601{
2602    if (!PyUnicode_Check(unicode) || mapping == NULL) {
2603	PyErr_BadArgument();
2604	return NULL;
2605    }
2606    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2607				   PyUnicode_GET_SIZE(unicode),
2608				   mapping,
2609				   NULL);
2610}
2611
2612static
2613int translate_error(const Py_UNICODE **source,
2614		    Py_UNICODE **dest,
2615		    const char *errors,
2616		    const char *details)
2617{
2618    if ((errors == NULL) ||
2619	(strcmp(errors,"strict") == 0)) {
2620	PyErr_Format(PyExc_UnicodeError,
2621		     "translate error: %.400s",
2622		     details);
2623	return -1;
2624    }
2625    else if (strcmp(errors,"ignore") == 0) {
2626	return 0;
2627    }
2628    else if (strcmp(errors,"replace") == 0) {
2629	**dest = '?';
2630	(*dest)++;
2631	return 0;
2632    }
2633    else {
2634	PyErr_Format(PyExc_ValueError,
2635		     "translate error; "
2636		     "unknown error handling code: %.400s",
2637		     errors);
2638	return -1;
2639    }
2640}
2641
2642PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2643				     int size,
2644				     PyObject *mapping,
2645				     const char *errors)
2646{
2647    PyUnicodeObject *v;
2648    Py_UNICODE *p;
2649
2650    if (mapping == NULL) {
2651	PyErr_BadArgument();
2652	return NULL;
2653    }
2654
2655    /* Output will never be longer than input */
2656    v = _PyUnicode_New(size);
2657    if (v == NULL)
2658	goto onError;
2659    if (size == 0)
2660	goto done;
2661    p = PyUnicode_AS_UNICODE(v);
2662    while (size-- > 0) {
2663	Py_UNICODE ch = *s++;
2664	PyObject *w, *x;
2665
2666	/* Get mapping */
2667	w = PyInt_FromLong(ch);
2668	if (w == NULL)
2669	    goto onError;
2670	x = PyObject_GetItem(mapping, w);
2671	Py_DECREF(w);
2672	if (x == NULL) {
2673	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2674		/* No mapping found: default to 1-1 mapping */
2675		PyErr_Clear();
2676		*p++ = ch;
2677		continue;
2678	    }
2679	    goto onError;
2680	}
2681
2682	/* Apply mapping */
2683	if (PyInt_Check(x))
2684	    *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2685	else if (x == Py_None) {
2686	    /* undefined mapping */
2687	    if (translate_error(&s, &p, errors,
2688				"character maps to <undefined>")) {
2689		Py_DECREF(x);
2690		goto onError;
2691	    }
2692	}
2693	else if (PyUnicode_Check(x)) {
2694	    if (PyUnicode_GET_SIZE(x) != 1) {
2695		/* 1-n mapping */
2696		PyErr_SetString(PyExc_NotImplementedError,
2697				"1-n mappings are currently not implemented");
2698		Py_DECREF(x);
2699		goto onError;
2700	    }
2701	    *p++ = *PyUnicode_AS_UNICODE(x);
2702	}
2703	else {
2704	    /* wrong return value */
2705	    PyErr_SetString(PyExc_TypeError,
2706		  "translate mapping must return integer, None or unicode");
2707	    Py_DECREF(x);
2708	    goto onError;
2709	}
2710	Py_DECREF(x);
2711    }
2712    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2713	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2714	    goto onError;
2715
2716 done:
2717    return (PyObject *)v;
2718
2719 onError:
2720    Py_XDECREF(v);
2721    return NULL;
2722}
2723
2724PyObject *PyUnicode_Translate(PyObject *str,
2725			      PyObject *mapping,
2726			      const char *errors)
2727{
2728    PyObject *result;
2729
2730    str = PyUnicode_FromObject(str);
2731    if (str == NULL)
2732	goto onError;
2733    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2734					PyUnicode_GET_SIZE(str),
2735					mapping,
2736					errors);
2737    Py_DECREF(str);
2738    return result;
2739
2740 onError:
2741    Py_XDECREF(str);
2742    return NULL;
2743}
2744
2745/* --- Decimal Encoder ---------------------------------------------------- */
2746
2747int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2748			    int length,
2749			    char *output,
2750			    const char *errors)
2751{
2752    Py_UNICODE *p, *end;
2753
2754    if (output == NULL) {
2755	PyErr_BadArgument();
2756	return -1;
2757    }
2758
2759    p = s;
2760    end = s + length;
2761    while (p < end) {
2762	register Py_UNICODE ch = *p++;
2763	int decimal;
2764
2765	if (Py_UNICODE_ISSPACE(ch)) {
2766	    *output++ = ' ';
2767	    continue;
2768	}
2769	decimal = Py_UNICODE_TODECIMAL(ch);
2770	if (decimal >= 0) {
2771	    *output++ = '0' + decimal;
2772	    continue;
2773	}
2774	if (0 < ch && ch < 256) {
2775	    *output++ = (char)ch;
2776	    continue;
2777	}
2778	/* All other characters are considered invalid */
2779	if (errors == NULL || strcmp(errors, "strict") == 0) {
2780	    PyErr_SetString(PyExc_ValueError,
2781			    "invalid decimal Unicode string");
2782	    goto onError;
2783	}
2784	else if (strcmp(errors, "ignore") == 0)
2785	    continue;
2786	else if (strcmp(errors, "replace") == 0) {
2787	    *output++ = '?';
2788	    continue;
2789	}
2790    }
2791    /* 0-terminate the output string */
2792    *output++ = '\0';
2793    return 0;
2794
2795 onError:
2796    return -1;
2797}
2798
2799/* --- Helpers ------------------------------------------------------------ */
2800
2801static
2802int count(PyUnicodeObject *self,
2803	  int start,
2804	  int end,
2805	  PyUnicodeObject *substring)
2806{
2807    int count = 0;
2808
2809    if (start < 0)
2810        start += self->length;
2811    if (start < 0)
2812        start = 0;
2813    if (end > self->length)
2814        end = self->length;
2815    if (end < 0)
2816        end += self->length;
2817    if (end < 0)
2818        end = 0;
2819
2820    if (substring->length == 0)
2821	return (end - start + 1);
2822
2823    end -= substring->length;
2824
2825    while (start <= end)
2826        if (Py_UNICODE_MATCH(self, start, substring)) {
2827            count++;
2828            start += substring->length;
2829        } else
2830            start++;
2831
2832    return count;
2833}
2834
2835int PyUnicode_Count(PyObject *str,
2836		    PyObject *substr,
2837		    int start,
2838		    int end)
2839{
2840    int result;
2841
2842    str = PyUnicode_FromObject(str);
2843    if (str == NULL)
2844	return -1;
2845    substr = PyUnicode_FromObject(substr);
2846    if (substr == NULL) {
2847	Py_DECREF(str);
2848	return -1;
2849    }
2850
2851    result = count((PyUnicodeObject *)str,
2852		   start, end,
2853		   (PyUnicodeObject *)substr);
2854
2855    Py_DECREF(str);
2856    Py_DECREF(substr);
2857    return result;
2858}
2859
2860static
2861int findstring(PyUnicodeObject *self,
2862	       PyUnicodeObject *substring,
2863	       int start,
2864	       int end,
2865	       int direction)
2866{
2867    if (start < 0)
2868        start += self->length;
2869    if (start < 0)
2870        start = 0;
2871
2872    if (substring->length == 0)
2873        return start;
2874
2875    if (end > self->length)
2876        end = self->length;
2877    if (end < 0)
2878        end += self->length;
2879    if (end < 0)
2880        end = 0;
2881
2882    end -= substring->length;
2883
2884    if (direction < 0) {
2885        for (; end >= start; end--)
2886            if (Py_UNICODE_MATCH(self, end, substring))
2887                return end;
2888    } else {
2889        for (; start <= end; start++)
2890            if (Py_UNICODE_MATCH(self, start, substring))
2891                return start;
2892    }
2893
2894    return -1;
2895}
2896
2897int PyUnicode_Find(PyObject *str,
2898		   PyObject *substr,
2899		   int start,
2900		   int end,
2901		   int direction)
2902{
2903    int result;
2904
2905    str = PyUnicode_FromObject(str);
2906    if (str == NULL)
2907	return -1;
2908    substr = PyUnicode_FromObject(substr);
2909    if (substr == NULL) {
2910	Py_DECREF(substr);
2911	return -1;
2912    }
2913
2914    result = findstring((PyUnicodeObject *)str,
2915			(PyUnicodeObject *)substr,
2916			start, end, direction);
2917    Py_DECREF(str);
2918    Py_DECREF(substr);
2919    return result;
2920}
2921
2922static
2923int tailmatch(PyUnicodeObject *self,
2924	      PyUnicodeObject *substring,
2925	      int start,
2926	      int end,
2927	      int direction)
2928{
2929    if (start < 0)
2930        start += self->length;
2931    if (start < 0)
2932        start = 0;
2933
2934    if (substring->length == 0)
2935        return 1;
2936
2937    if (end > self->length)
2938        end = self->length;
2939    if (end < 0)
2940        end += self->length;
2941    if (end < 0)
2942        end = 0;
2943
2944    end -= substring->length;
2945    if (end < start)
2946	return 0;
2947
2948    if (direction > 0) {
2949	if (Py_UNICODE_MATCH(self, end, substring))
2950	    return 1;
2951    } else {
2952        if (Py_UNICODE_MATCH(self, start, substring))
2953	    return 1;
2954    }
2955
2956    return 0;
2957}
2958
2959int PyUnicode_Tailmatch(PyObject *str,
2960			PyObject *substr,
2961			int start,
2962			int end,
2963			int direction)
2964{
2965    int result;
2966
2967    str = PyUnicode_FromObject(str);
2968    if (str == NULL)
2969	return -1;
2970    substr = PyUnicode_FromObject(substr);
2971    if (substr == NULL) {
2972	Py_DECREF(substr);
2973	return -1;
2974    }
2975
2976    result = tailmatch((PyUnicodeObject *)str,
2977		       (PyUnicodeObject *)substr,
2978		       start, end, direction);
2979    Py_DECREF(str);
2980    Py_DECREF(substr);
2981    return result;
2982}
2983
2984static
2985const Py_UNICODE *findchar(const Py_UNICODE *s,
2986		     int size,
2987		     Py_UNICODE ch)
2988{
2989    /* like wcschr, but doesn't stop at NULL characters */
2990
2991    while (size-- > 0) {
2992        if (*s == ch)
2993            return s;
2994        s++;
2995    }
2996
2997    return NULL;
2998}
2999
3000/* Apply fixfct filter to the Unicode object self and return a
3001   reference to the modified object */
3002
3003static
3004PyObject *fixup(PyUnicodeObject *self,
3005		int (*fixfct)(PyUnicodeObject *s))
3006{
3007
3008    PyUnicodeObject *u;
3009
3010    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3011    if (u == NULL)
3012	return NULL;
3013
3014    Py_UNICODE_COPY(u->str, self->str, self->length);
3015
3016    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3017	/* fixfct should return TRUE if it modified the buffer. If
3018	   FALSE, return a reference to the original buffer instead
3019	   (to save space, not time) */
3020	Py_INCREF(self);
3021	Py_DECREF(u);
3022	return (PyObject*) self;
3023    }
3024    return (PyObject*) u;
3025}
3026
3027static
3028int fixupper(PyUnicodeObject *self)
3029{
3030    int len = self->length;
3031    Py_UNICODE *s = self->str;
3032    int status = 0;
3033
3034    while (len-- > 0) {
3035	register Py_UNICODE ch;
3036
3037	ch = Py_UNICODE_TOUPPER(*s);
3038	if (ch != *s) {
3039            status = 1;
3040	    *s = ch;
3041	}
3042        s++;
3043    }
3044
3045    return status;
3046}
3047
3048static
3049int fixlower(PyUnicodeObject *self)
3050{
3051    int len = self->length;
3052    Py_UNICODE *s = self->str;
3053    int status = 0;
3054
3055    while (len-- > 0) {
3056	register Py_UNICODE ch;
3057
3058	ch = Py_UNICODE_TOLOWER(*s);
3059	if (ch != *s) {
3060            status = 1;
3061	    *s = ch;
3062	}
3063        s++;
3064    }
3065
3066    return status;
3067}
3068
3069static
3070int fixswapcase(PyUnicodeObject *self)
3071{
3072    int len = self->length;
3073    Py_UNICODE *s = self->str;
3074    int status = 0;
3075
3076    while (len-- > 0) {
3077        if (Py_UNICODE_ISUPPER(*s)) {
3078            *s = Py_UNICODE_TOLOWER(*s);
3079            status = 1;
3080        } else if (Py_UNICODE_ISLOWER(*s)) {
3081            *s = Py_UNICODE_TOUPPER(*s);
3082            status = 1;
3083        }
3084        s++;
3085    }
3086
3087    return status;
3088}
3089
3090static
3091int fixcapitalize(PyUnicodeObject *self)
3092{
3093    int len = self->length;
3094    Py_UNICODE *s = self->str;
3095    int status = 0;
3096
3097    if (len == 0)
3098	return 0;
3099    if (Py_UNICODE_ISLOWER(*s)) {
3100	*s = Py_UNICODE_TOUPPER(*s);
3101	status = 1;
3102    }
3103    s++;
3104    while (--len > 0) {
3105        if (Py_UNICODE_ISUPPER(*s)) {
3106            *s = Py_UNICODE_TOLOWER(*s);
3107            status = 1;
3108        }
3109        s++;
3110    }
3111    return status;
3112}
3113
3114static
3115int fixtitle(PyUnicodeObject *self)
3116{
3117    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3118    register Py_UNICODE *e;
3119    int previous_is_cased;
3120
3121    /* Shortcut for single character strings */
3122    if (PyUnicode_GET_SIZE(self) == 1) {
3123	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3124	if (*p != ch) {
3125	    *p = ch;
3126	    return 1;
3127	}
3128	else
3129	    return 0;
3130    }
3131
3132    e = p + PyUnicode_GET_SIZE(self);
3133    previous_is_cased = 0;
3134    for (; p < e; p++) {
3135	register const Py_UNICODE ch = *p;
3136
3137	if (previous_is_cased)
3138	    *p = Py_UNICODE_TOLOWER(ch);
3139	else
3140	    *p = Py_UNICODE_TOTITLE(ch);
3141
3142	if (Py_UNICODE_ISLOWER(ch) ||
3143	    Py_UNICODE_ISUPPER(ch) ||
3144	    Py_UNICODE_ISTITLE(ch))
3145	    previous_is_cased = 1;
3146	else
3147	    previous_is_cased = 0;
3148    }
3149    return 1;
3150}
3151
3152PyObject *PyUnicode_Join(PyObject *separator,
3153			 PyObject *seq)
3154{
3155    Py_UNICODE *sep;
3156    int seplen;
3157    PyUnicodeObject *res = NULL;
3158    int reslen = 0;
3159    Py_UNICODE *p;
3160    int sz = 100;
3161    int i;
3162    PyObject *it;
3163
3164    it = PyObject_GetIter(seq);
3165    if (it == NULL)
3166        return NULL;
3167
3168    if (separator == NULL) {
3169	Py_UNICODE blank = ' ';
3170	sep = &blank;
3171	seplen = 1;
3172    }
3173    else {
3174	separator = PyUnicode_FromObject(separator);
3175	if (separator == NULL)
3176	    goto onError;
3177	sep = PyUnicode_AS_UNICODE(separator);
3178	seplen = PyUnicode_GET_SIZE(separator);
3179    }
3180
3181    res = _PyUnicode_New(sz);
3182    if (res == NULL)
3183	goto onError;
3184    p = PyUnicode_AS_UNICODE(res);
3185    reslen = 0;
3186
3187    for (i = 0; ; ++i) {
3188	int itemlen;
3189	PyObject *item = PyIter_Next(it);
3190	if (item == NULL) {
3191	    if (PyErr_Occurred())
3192		goto onError;
3193	    break;
3194	}
3195	if (!PyUnicode_Check(item)) {
3196	    PyObject *v;
3197	    if (!PyString_Check(item)) {
3198		PyErr_Format(PyExc_TypeError,
3199			     "sequence item %i: expected string or Unicode,"
3200			     " %.80s found",
3201			     i, item->ob_type->tp_name);
3202		Py_DECREF(item);
3203		goto onError;
3204	    }
3205	    v = PyUnicode_FromObject(item);
3206	    Py_DECREF(item);
3207	    item = v;
3208	    if (item == NULL)
3209		goto onError;
3210	}
3211	itemlen = PyUnicode_GET_SIZE(item);
3212	while (reslen + itemlen + seplen >= sz) {
3213	    if (_PyUnicode_Resize(&res, sz*2)) {
3214		Py_DECREF(item);
3215		goto onError;
3216	    }
3217	    sz *= 2;
3218	    p = PyUnicode_AS_UNICODE(res) + reslen;
3219	}
3220	if (i > 0) {
3221	    Py_UNICODE_COPY(p, sep, seplen);
3222	    p += seplen;
3223	    reslen += seplen;
3224	}
3225	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3226	p += itemlen;
3227	reslen += itemlen;
3228	Py_DECREF(item);
3229    }
3230    if (_PyUnicode_Resize(&res, reslen))
3231	goto onError;
3232
3233    Py_XDECREF(separator);
3234    Py_DECREF(it);
3235    return (PyObject *)res;
3236
3237 onError:
3238    Py_XDECREF(separator);
3239    Py_XDECREF(res);
3240    Py_DECREF(it);
3241    return NULL;
3242}
3243
3244static
3245PyUnicodeObject *pad(PyUnicodeObject *self,
3246		     int left,
3247		     int right,
3248		     Py_UNICODE fill)
3249{
3250    PyUnicodeObject *u;
3251
3252    if (left < 0)
3253        left = 0;
3254    if (right < 0)
3255        right = 0;
3256
3257    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3258        Py_INCREF(self);
3259        return self;
3260    }
3261
3262    u = _PyUnicode_New(left + self->length + right);
3263    if (u) {
3264        if (left)
3265            Py_UNICODE_FILL(u->str, fill, left);
3266        Py_UNICODE_COPY(u->str + left, self->str, self->length);
3267        if (right)
3268            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3269    }
3270
3271    return u;
3272}
3273
/* Append the slice data[left:right] to `list` as a new Unicode object.
   The expanding function must provide a `PyObject *str` scratch
   variable, the target `list`, and an `onError` label, which is jumped
   to when either the allocation or the append fails. */
#define SPLIT_APPEND(data, left, right)					\
    do {								\
	str = PyUnicode_FromUnicode((data) + (left),			\
				    (right) - (left));			\
	if (!str)							\
	    goto onError;						\
	if (PyList_Append(list, str)) {					\
	    Py_DECREF(str);						\
	    goto onError;						\
	}								\
	Py_DECREF(str);							\
    } while (0)
3284
3285static
3286PyObject *split_whitespace(PyUnicodeObject *self,
3287			   PyObject *list,
3288			   int maxcount)
3289{
3290    register int i;
3291    register int j;
3292    int len = self->length;
3293    PyObject *str;
3294
3295    for (i = j = 0; i < len; ) {
3296	/* find a token */
3297	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3298	    i++;
3299	j = i;
3300	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3301	    i++;
3302	if (j < i) {
3303	    if (maxcount-- <= 0)
3304		break;
3305	    SPLIT_APPEND(self->str, j, i);
3306	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3307		i++;
3308	    j = i;
3309	}
3310    }
3311    if (j < len) {
3312	SPLIT_APPEND(self->str, j, len);
3313    }
3314    return list;
3315
3316 onError:
3317    Py_DECREF(list);
3318    return NULL;
3319}
3320
3321PyObject *PyUnicode_Splitlines(PyObject *string,
3322			       int keepends)
3323{
3324    register int i;
3325    register int j;
3326    int len;
3327    PyObject *list;
3328    PyObject *str;
3329    Py_UNICODE *data;
3330
3331    string = PyUnicode_FromObject(string);
3332    if (string == NULL)
3333	return NULL;
3334    data = PyUnicode_AS_UNICODE(string);
3335    len = PyUnicode_GET_SIZE(string);
3336
3337    list = PyList_New(0);
3338    if (!list)
3339        goto onError;
3340
3341    for (i = j = 0; i < len; ) {
3342	int eol;
3343
3344	/* Find a line and append it */
3345	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3346	    i++;
3347
3348	/* Skip the line break reading CRLF as one line break */
3349	eol = i;
3350	if (i < len) {
3351	    if (data[i] == '\r' && i + 1 < len &&
3352		data[i+1] == '\n')
3353		i += 2;
3354	    else
3355		i++;
3356	    if (keepends)
3357		eol = i;
3358	}
3359	SPLIT_APPEND(data, j, eol);
3360	j = i;
3361    }
3362    if (j < len) {
3363	SPLIT_APPEND(data, j, len);
3364    }
3365
3366    Py_DECREF(string);
3367    return list;
3368
3369 onError:
3370    Py_DECREF(list);
3371    Py_DECREF(string);
3372    return NULL;
3373}
3374
3375static
3376PyObject *split_char(PyUnicodeObject *self,
3377		     PyObject *list,
3378		     Py_UNICODE ch,
3379		     int maxcount)
3380{
3381    register int i;
3382    register int j;
3383    int len = self->length;
3384    PyObject *str;
3385
3386    for (i = j = 0; i < len; ) {
3387	if (self->str[i] == ch) {
3388	    if (maxcount-- <= 0)
3389		break;
3390	    SPLIT_APPEND(self->str, j, i);
3391	    i = j = i + 1;
3392	} else
3393	    i++;
3394    }
3395    if (j <= len) {
3396	SPLIT_APPEND(self->str, j, len);
3397    }
3398    return list;
3399
3400 onError:
3401    Py_DECREF(list);
3402    return NULL;
3403}
3404
3405static
3406PyObject *split_substring(PyUnicodeObject *self,
3407			  PyObject *list,
3408			  PyUnicodeObject *substring,
3409			  int maxcount)
3410{
3411    register int i;
3412    register int j;
3413    int len = self->length;
3414    int sublen = substring->length;
3415    PyObject *str;
3416
3417    for (i = j = 0; i <= len - sublen; ) {
3418	if (Py_UNICODE_MATCH(self, i, substring)) {
3419	    if (maxcount-- <= 0)
3420		break;
3421	    SPLIT_APPEND(self->str, j, i);
3422	    i = j = i + sublen;
3423	} else
3424	    i++;
3425    }
3426    if (j <= len) {
3427	SPLIT_APPEND(self->str, j, len);
3428    }
3429    return list;
3430
3431 onError:
3432    Py_DECREF(list);
3433    return NULL;
3434}
3435
3436#undef SPLIT_APPEND
3437
3438static
3439PyObject *split(PyUnicodeObject *self,
3440		PyUnicodeObject *substring,
3441		int maxcount)
3442{
3443    PyObject *list;
3444
3445    if (maxcount < 0)
3446        maxcount = INT_MAX;
3447
3448    list = PyList_New(0);
3449    if (!list)
3450        return NULL;
3451
3452    if (substring == NULL)
3453	return split_whitespace(self,list,maxcount);
3454
3455    else if (substring->length == 1)
3456	return split_char(self,list,substring->str[0],maxcount);
3457
3458    else if (substring->length == 0) {
3459	Py_DECREF(list);
3460	PyErr_SetString(PyExc_ValueError, "empty separator");
3461	return NULL;
3462    }
3463    else
3464	return split_substring(self,list,substring,maxcount);
3465}
3466
3467static
3468PyObject *strip(PyUnicodeObject *self,
3469		int left,
3470		int right)
3471{
3472    Py_UNICODE *p = self->str;
3473    int start = 0;
3474    int end = self->length;
3475
3476    if (left)
3477        while (start < end && Py_UNICODE_ISSPACE(p[start]))
3478            start++;
3479
3480    if (right)
3481        while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3482            end--;
3483
3484    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
3485        /* couldn't strip anything off, return original string */
3486        Py_INCREF(self);
3487        return (PyObject*) self;
3488    }
3489
3490    return (PyObject*) PyUnicode_FromUnicode(
3491        self->str + start,
3492        end - start
3493        );
3494}
3495
3496static
3497PyObject *replace(PyUnicodeObject *self,
3498		  PyUnicodeObject *str1,
3499		  PyUnicodeObject *str2,
3500		  int maxcount)
3501{
3502    PyUnicodeObject *u;
3503
3504    if (maxcount < 0)
3505	maxcount = INT_MAX;
3506
3507    if (str1->length == 1 && str2->length == 1) {
3508        int i;
3509
3510        /* replace characters */
3511        if (!findchar(self->str, self->length, str1->str[0]) &&
3512            PyUnicode_CheckExact(self)) {
3513            /* nothing to replace, return original string */
3514            Py_INCREF(self);
3515            u = self;
3516        } else {
3517	    Py_UNICODE u1 = str1->str[0];
3518	    Py_UNICODE u2 = str2->str[0];
3519
3520            u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3521                NULL,
3522                self->length
3523                );
3524            if (u != NULL) {
3525		Py_UNICODE_COPY(u->str, self->str,
3526				self->length);
3527                for (i = 0; i < u->length; i++)
3528                    if (u->str[i] == u1) {
3529                        if (--maxcount < 0)
3530                            break;
3531                        u->str[i] = u2;
3532                    }
3533        }
3534        }
3535
3536    } else {
3537        int n, i;
3538        Py_UNICODE *p;
3539
3540        /* replace strings */
3541        n = count(self, 0, self->length, str1);
3542        if (n > maxcount)
3543            n = maxcount;
3544        if (n == 0 && PyUnicode_CheckExact(self)) {
3545            /* nothing to replace, return original string */
3546            Py_INCREF(self);
3547            u = self;
3548        } else {
3549            u = _PyUnicode_New(
3550                self->length + n * (str2->length - str1->length));
3551            if (u) {
3552                i = 0;
3553                p = u->str;
3554                while (i <= self->length - str1->length)
3555                    if (Py_UNICODE_MATCH(self, i, str1)) {
3556                        /* replace string segment */
3557                        Py_UNICODE_COPY(p, str2->str, str2->length);
3558                        p += str2->length;
3559                        i += str1->length;
3560                        if (--n <= 0) {
3561                            /* copy remaining part */
3562                            Py_UNICODE_COPY(p, self->str+i, self->length-i);
3563                            break;
3564                        }
3565                    } else
3566                        *p++ = self->str[i++];
3567            }
3568        }
3569    }
3570
3571    return (PyObject *) u;
3572}
3573
3574/* --- Unicode Object Methods --------------------------------------------- */
3575
/* Docstring exposed for the title() method. */
static char title__doc__[] =
"S.title() -> unicode\n\
\n\
Return a titlecased version of S, i.e. words start with title case\n\
characters, all remaining cased characters have lower case.";

/* Implementation of S.title(): hands self to fixup() together with the
   fixtitle transformation and returns whatever fixup() returns. */
static PyObject*
unicode_title(PyUnicodeObject *self)
{
    return fixup(self, fixtitle);
}
3587
/* Docstring exposed for the capitalize() method. */
static char capitalize__doc__[] =
"S.capitalize() -> unicode\n\
\n\
Return a capitalized version of S, i.e. make the first character\n\
have upper case.";

/* Implementation of S.capitalize(): hands self to fixup() together with
   the fixcapitalize transformation and returns whatever fixup() returns. */
static PyObject*
unicode_capitalize(PyUnicodeObject *self)
{
    return fixup(self, fixcapitalize);
}
3599
#if 0
/* Disabled: the capwords() method is compiled out by the surrounding
   "#if 0". */
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Split self on whitespace, capitalize every word via fixup() and join
   the words again with PyUnicode_Join().  Returns the joined string, or
   NULL if splitting, capitalizing or joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    int i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
		     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* drop the old word before storing its replacement in the slot */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

    /* Reached on success as well: item is the result, or NULL on error. */
onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
3637
3638static char center__doc__[] =
3639"S.center(width) -> unicode\n\
3640\n\
3641Return S centered in a Unicode string of length width. Padding is done\n\
3642using spaces.";
3643
3644static PyObject *
3645unicode_center(PyUnicodeObject *self, PyObject *args)
3646{
3647    int marg, left;
3648    int width;
3649
3650    if (!PyArg_ParseTuple(args, "i:center", &width))
3651        return NULL;
3652
3653    if (self->length >= width && PyUnicode_CheckExact(self)) {
3654        Py_INCREF(self);
3655        return (PyObject*) self;
3656    }
3657
3658    marg = width - self->length;
3659    left = marg / 2 + (marg & width & 1);
3660
3661    return (PyObject*) pad(self, left, marg - left, ' ');
3662}
3663
3664#if 0
3665
3666/* This code should go into some future Unicode collation support
3667   module. The basic comparison should compare ordinals on a naive
3668   basis (this is what Java does and thus JPython too). */
3669
3670/* speedy UTF-16 code point order comparison */
3671/* gleaned from: */
3672/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3673
/* Disabled code (inside "#if 0").
   Adjustment table indexed by the top bits of a code unit (c >> 11).
   Only entries 27..31 are non-zero: entry 27 adds 0x2000 and entries
   28..31 subtract 0x800. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Disabled variant of unicode_compare(): compares two Unicode objects
   element by element after applying the utf16Fixup adjustment to values
   above (1<<11) * 26.  Returns -1, 0 or 1. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    int len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

	/* only values above (1<<11) * 26 are looked up in the table */
	if (c1 > (1<<11) * 26)
	    c1 += utf16Fixup[c1>>11];
	if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* one string is a prefix of the other: the shorter one is smaller */
    return (len1 < len2) ? -1 : (len1 != len2);
}
3713
3714#else
3715
3716static int
3717unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3718{
3719    register int len1, len2;
3720
3721    Py_UNICODE *s1 = str1->str;
3722    Py_UNICODE *s2 = str2->str;
3723
3724    len1 = str1->length;
3725    len2 = str2->length;
3726
3727    while (len1 > 0 && len2 > 0) {
3728        Py_UNICODE c1, c2;
3729
3730        c1 = *s1++;
3731        c2 = *s2++;
3732
3733        if (c1 != c2)
3734            return (c1 < c2) ? -1 : 1;
3735
3736        len1--; len2--;
3737    }
3738
3739    return (len1 < len2) ? -1 : (len1 != len2);
3740}
3741
3742#endif
3743
3744int PyUnicode_Compare(PyObject *left,
3745		      PyObject *right)
3746{
3747    PyUnicodeObject *u = NULL, *v = NULL;
3748    int result;
3749
3750    /* Coerce the two arguments */
3751    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3752    if (u == NULL)
3753	goto onError;
3754    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3755    if (v == NULL)
3756	goto onError;
3757
3758    /* Shortcut for empty or interned objects */
3759    if (v == u) {
3760	Py_DECREF(u);
3761	Py_DECREF(v);
3762	return 0;
3763    }
3764
3765    result = unicode_compare(u, v);
3766
3767    Py_DECREF(u);
3768    Py_DECREF(v);
3769    return result;
3770
3771onError:
3772    Py_XDECREF(u);
3773    Py_XDECREF(v);
3774    return -1;
3775}
3776
3777int PyUnicode_Contains(PyObject *container,
3778		       PyObject *element)
3779{
3780    PyUnicodeObject *u = NULL, *v = NULL;
3781    int result;
3782    register const Py_UNICODE *p, *e;
3783    register Py_UNICODE ch;
3784
3785    /* Coerce the two arguments */
3786    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3787    if (v == NULL) {
3788	PyErr_SetString(PyExc_TypeError,
3789	    "'in <string>' requires character as left operand");
3790	goto onError;
3791    }
3792    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3793    if (u == NULL) {
3794	Py_DECREF(v);
3795	goto onError;
3796    }
3797
3798    /* Check v in u */
3799    if (PyUnicode_GET_SIZE(v) != 1) {
3800	PyErr_SetString(PyExc_TypeError,
3801	    "'in <string>' requires character as left operand");
3802	goto onError;
3803    }
3804    ch = *PyUnicode_AS_UNICODE(v);
3805    p = PyUnicode_AS_UNICODE(u);
3806    e = p + PyUnicode_GET_SIZE(u);
3807    result = 0;
3808    while (p < e) {
3809	if (*p++ == ch) {
3810	    result = 1;
3811	    break;
3812	}
3813    }
3814
3815    Py_DECREF(u);
3816    Py_DECREF(v);
3817    return result;
3818
3819onError:
3820    Py_XDECREF(u);
3821    Py_XDECREF(v);
3822    return -1;
3823}
3824
3825/* Concat to string or Unicode object giving a new Unicode object. */
3826
3827PyObject *PyUnicode_Concat(PyObject *left,
3828			   PyObject *right)
3829{
3830    PyUnicodeObject *u = NULL, *v = NULL, *w;
3831
3832    /* Coerce the two arguments */
3833    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3834    if (u == NULL)
3835	goto onError;
3836    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3837    if (v == NULL)
3838	goto onError;
3839
3840    /* Shortcuts */
3841    if (v == unicode_empty) {
3842	Py_DECREF(v);
3843	return (PyObject *)u;
3844    }
3845    if (u == unicode_empty) {
3846	Py_DECREF(u);
3847	return (PyObject *)v;
3848    }
3849
3850    /* Concat the two Unicode strings */
3851    w = _PyUnicode_New(u->length + v->length);
3852    if (w == NULL)
3853	goto onError;
3854    Py_UNICODE_COPY(w->str, u->str, u->length);
3855    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3856
3857    Py_DECREF(u);
3858    Py_DECREF(v);
3859    return (PyObject *)w;
3860
3861onError:
3862    Py_XDECREF(u);
3863    Py_XDECREF(v);
3864    return NULL;
3865}
3866
3867static char count__doc__[] =
3868"S.count(sub[, start[, end]]) -> int\n\
3869\n\
3870Return the number of occurrences of substring sub in Unicode string\n\
3871S[start:end].  Optional arguments start and end are\n\
3872interpreted as in slice notation.";
3873
3874static PyObject *
3875unicode_count(PyUnicodeObject *self, PyObject *args)
3876{
3877    PyUnicodeObject *substring;
3878    int start = 0;
3879    int end = INT_MAX;
3880    PyObject *result;
3881
3882    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3883		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3884        return NULL;
3885
3886    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3887						(PyObject *)substring);
3888    if (substring == NULL)
3889	return NULL;
3890
3891    if (start < 0)
3892        start += self->length;
3893    if (start < 0)
3894        start = 0;
3895    if (end > self->length)
3896        end = self->length;
3897    if (end < 0)
3898        end += self->length;
3899    if (end < 0)
3900        end = 0;
3901
3902    result = PyInt_FromLong((long) count(self, start, end, substring));
3903
3904    Py_DECREF(substring);
3905    return result;
3906}
3907
/* Docstring exposed for the encode() method. */
static char encode__doc__[] =
"S.encode([encoding[,errors]]) -> string\n\
\n\
Return an encoded string version of S. Default encoding is the current\n\
default string encoding. errors may be given to set a different error\n\
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
a ValueError. Other possible values are 'ignore' and 'replace'.";

/* Implementation of S.encode([encoding[,errors]]).  Both arguments are
   optional C strings and stay NULL when omitted; they are forwarded
   unchanged to PyUnicode_AsEncodedString().  Returns NULL if argument
   parsing fails. */
static PyObject *
unicode_encode(PyUnicodeObject *self, PyObject *args)
{
    char *encoding = NULL;
    char *errors = NULL;
    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
        return NULL;
    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
}
3925
3926static char expandtabs__doc__[] =
3927"S.expandtabs([tabsize]) -> unicode\n\
3928\n\
3929Return a copy of S where all tab characters are expanded using spaces.\n\
3930If tabsize is not given, a tab size of 8 characters is assumed.";
3931
3932static PyObject*
3933unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3934{
3935    Py_UNICODE *e;
3936    Py_UNICODE *p;
3937    Py_UNICODE *q;
3938    int i, j;
3939    PyUnicodeObject *u;
3940    int tabsize = 8;
3941
3942    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3943	return NULL;
3944
3945    /* First pass: determine size of output string */
3946    i = j = 0;
3947    e = self->str + self->length;
3948    for (p = self->str; p < e; p++)
3949        if (*p == '\t') {
3950	    if (tabsize > 0)
3951		j += tabsize - (j % tabsize);
3952	}
3953        else {
3954            j++;
3955            if (*p == '\n' || *p == '\r') {
3956                i += j;
3957                j = 0;
3958            }
3959        }
3960
3961    /* Second pass: create output string and fill it */
3962    u = _PyUnicode_New(i + j);
3963    if (!u)
3964        return NULL;
3965
3966    j = 0;
3967    q = u->str;
3968
3969    for (p = self->str; p < e; p++)
3970        if (*p == '\t') {
3971	    if (tabsize > 0) {
3972		i = tabsize - (j % tabsize);
3973		j += i;
3974		while (i--)
3975		    *q++ = ' ';
3976	    }
3977	}
3978	else {
3979            j++;
3980	    *q++ = *p;
3981            if (*p == '\n' || *p == '\r')
3982                j = 0;
3983        }
3984
3985    return (PyObject*) u;
3986}
3987
3988static char find__doc__[] =
3989"S.find(sub [,start [,end]]) -> int\n\
3990\n\
3991Return the lowest index in S where substring sub is found,\n\
3992such that sub is contained within s[start,end].  Optional\n\
3993arguments start and end are interpreted as in slice notation.\n\
3994\n\
3995Return -1 on failure.";
3996
3997static PyObject *
3998unicode_find(PyUnicodeObject *self, PyObject *args)
3999{
4000    PyUnicodeObject *substring;
4001    int start = 0;
4002    int end = INT_MAX;
4003    PyObject *result;
4004
4005    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4006		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4007        return NULL;
4008    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4009						(PyObject *)substring);
4010    if (substring == NULL)
4011	return NULL;
4012
4013    result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4014
4015    Py_DECREF(substring);
4016    return result;
4017}
4018
4019static PyObject *
4020unicode_getitem(PyUnicodeObject *self, int index)
4021{
4022    if (index < 0 || index >= self->length) {
4023        PyErr_SetString(PyExc_IndexError, "string index out of range");
4024        return NULL;
4025    }
4026
4027    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4028}
4029
4030static long
4031unicode_hash(PyUnicodeObject *self)
4032{
4033    /* Since Unicode objects compare equal to their ASCII string
4034       counterparts, they should use the individual character values
4035       as basis for their hash value.  This is needed to assure that
4036       strings and Unicode objects behave in the same way as
4037       dictionary keys. */
4038
4039    register int len;
4040    register Py_UNICODE *p;
4041    register long x;
4042
4043    if (self->hash != -1)
4044	return self->hash;
4045    len = PyUnicode_GET_SIZE(self);
4046    p = PyUnicode_AS_UNICODE(self);
4047    x = *p << 7;
4048    while (--len >= 0)
4049	x = (1000003*x) ^ *p++;
4050    x ^= PyUnicode_GET_SIZE(self);
4051    if (x == -1)
4052	x = -2;
4053    self->hash = x;
4054    return x;
4055}
4056
4057static char index__doc__[] =
4058"S.index(sub [,start [,end]]) -> int\n\
4059\n\
4060Like S.find() but raise ValueError when the substring is not found.";
4061
4062static PyObject *
4063unicode_index(PyUnicodeObject *self, PyObject *args)
4064{
4065    int result;
4066    PyUnicodeObject *substring;
4067    int start = 0;
4068    int end = INT_MAX;
4069
4070    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4071		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4072        return NULL;
4073
4074    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4075						(PyObject *)substring);
4076    if (substring == NULL)
4077	return NULL;
4078
4079    result = findstring(self, substring, start, end, 1);
4080
4081    Py_DECREF(substring);
4082    if (result < 0) {
4083        PyErr_SetString(PyExc_ValueError, "substring not found");
4084        return NULL;
4085    }
4086    return PyInt_FromLong(result);
4087}
4088
4089static char islower__doc__[] =
4090"S.islower() -> int\n\
4091\n\
4092Return 1 if  all cased characters in S are lowercase and there is\n\
4093at least one cased character in S, 0 otherwise.";
4094
4095static PyObject*
4096unicode_islower(PyUnicodeObject *self)
4097{
4098    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4099    register const Py_UNICODE *e;
4100    int cased;
4101
4102    /* Shortcut for single character strings */
4103    if (PyUnicode_GET_SIZE(self) == 1)
4104	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4105
4106    /* Special case for empty strings */
4107    if (PyString_GET_SIZE(self) == 0)
4108	return PyInt_FromLong(0);
4109
4110    e = p + PyUnicode_GET_SIZE(self);
4111    cased = 0;
4112    for (; p < e; p++) {
4113	register const Py_UNICODE ch = *p;
4114
4115	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4116	    return PyInt_FromLong(0);
4117	else if (!cased && Py_UNICODE_ISLOWER(ch))
4118	    cased = 1;
4119    }
4120    return PyInt_FromLong(cased);
4121}
4122
4123static char isupper__doc__[] =
4124"S.isupper() -> int\n\
4125\n\
4126Return 1 if  all cased characters in S are uppercase and there is\n\
4127at least one cased character in S, 0 otherwise.";
4128
4129static PyObject*
4130unicode_isupper(PyUnicodeObject *self)
4131{
4132    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4133    register const Py_UNICODE *e;
4134    int cased;
4135
4136    /* Shortcut for single character strings */
4137    if (PyUnicode_GET_SIZE(self) == 1)
4138	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4139
4140    /* Special case for empty strings */
4141    if (PyString_GET_SIZE(self) == 0)
4142	return PyInt_FromLong(0);
4143
4144    e = p + PyUnicode_GET_SIZE(self);
4145    cased = 0;
4146    for (; p < e; p++) {
4147	register const Py_UNICODE ch = *p;
4148
4149	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4150	    return PyInt_FromLong(0);
4151	else if (!cased && Py_UNICODE_ISUPPER(ch))
4152	    cased = 1;
4153    }
4154    return PyInt_FromLong(cased);
4155}
4156
4157static char istitle__doc__[] =
4158"S.istitle() -> int\n\
4159\n\
4160Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4161may only follow uncased characters and lowercase characters only cased\n\
4162ones. Return 0 otherwise.";
4163
4164static PyObject*
4165unicode_istitle(PyUnicodeObject *self)
4166{
4167    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4168    register const Py_UNICODE *e;
4169    int cased, previous_is_cased;
4170
4171    /* Shortcut for single character strings */
4172    if (PyUnicode_GET_SIZE(self) == 1)
4173	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4174			      (Py_UNICODE_ISUPPER(*p) != 0));
4175
4176    /* Special case for empty strings */
4177    if (PyString_GET_SIZE(self) == 0)
4178	return PyInt_FromLong(0);
4179
4180    e = p + PyUnicode_GET_SIZE(self);
4181    cased = 0;
4182    previous_is_cased = 0;
4183    for (; p < e; p++) {
4184	register const Py_UNICODE ch = *p;
4185
4186	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4187	    if (previous_is_cased)
4188		return PyInt_FromLong(0);
4189	    previous_is_cased = 1;
4190	    cased = 1;
4191	}
4192	else if (Py_UNICODE_ISLOWER(ch)) {
4193	    if (!previous_is_cased)
4194		return PyInt_FromLong(0);
4195	    previous_is_cased = 1;
4196	    cased = 1;
4197	}
4198	else
4199	    previous_is_cased = 0;
4200    }
4201    return PyInt_FromLong(cased);
4202}
4203
4204static char isspace__doc__[] =
4205"S.isspace() -> int\n\
4206\n\
4207Return 1 if there are only whitespace characters in S,\n\
42080 otherwise.";
4209
4210static PyObject*
4211unicode_isspace(PyUnicodeObject *self)
4212{
4213    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4214    register const Py_UNICODE *e;
4215
4216    /* Shortcut for single character strings */
4217    if (PyUnicode_GET_SIZE(self) == 1 &&
4218	Py_UNICODE_ISSPACE(*p))
4219	return PyInt_FromLong(1);
4220
4221    /* Special case for empty strings */
4222    if (PyString_GET_SIZE(self) == 0)
4223	return PyInt_FromLong(0);
4224
4225    e = p + PyUnicode_GET_SIZE(self);
4226    for (; p < e; p++) {
4227	if (!Py_UNICODE_ISSPACE(*p))
4228	    return PyInt_FromLong(0);
4229    }
4230    return PyInt_FromLong(1);
4231}
4232
4233static char isalpha__doc__[] =
4234"S.isalpha() -> int\n\
4235\n\
4236Return 1 if  all characters in S are alphabetic\n\
4237and there is at least one character in S, 0 otherwise.";
4238
4239static PyObject*
4240unicode_isalpha(PyUnicodeObject *self)
4241{
4242    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4243    register const Py_UNICODE *e;
4244
4245    /* Shortcut for single character strings */
4246    if (PyUnicode_GET_SIZE(self) == 1 &&
4247	Py_UNICODE_ISALPHA(*p))
4248	return PyInt_FromLong(1);
4249
4250    /* Special case for empty strings */
4251    if (PyString_GET_SIZE(self) == 0)
4252	return PyInt_FromLong(0);
4253
4254    e = p + PyUnicode_GET_SIZE(self);
4255    for (; p < e; p++) {
4256	if (!Py_UNICODE_ISALPHA(*p))
4257	    return PyInt_FromLong(0);
4258    }
4259    return PyInt_FromLong(1);
4260}
4261
4262static char isalnum__doc__[] =
4263"S.isalnum() -> int\n\
4264\n\
4265Return 1 if  all characters in S are alphanumeric\n\
4266and there is at least one character in S, 0 otherwise.";
4267
4268static PyObject*
4269unicode_isalnum(PyUnicodeObject *self)
4270{
4271    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4272    register const Py_UNICODE *e;
4273
4274    /* Shortcut for single character strings */
4275    if (PyUnicode_GET_SIZE(self) == 1 &&
4276	Py_UNICODE_ISALNUM(*p))
4277	return PyInt_FromLong(1);
4278
4279    /* Special case for empty strings */
4280    if (PyString_GET_SIZE(self) == 0)
4281	return PyInt_FromLong(0);
4282
4283    e = p + PyUnicode_GET_SIZE(self);
4284    for (; p < e; p++) {
4285	if (!Py_UNICODE_ISALNUM(*p))
4286	    return PyInt_FromLong(0);
4287    }
4288    return PyInt_FromLong(1);
4289}
4290
4291static char isdecimal__doc__[] =
4292"S.isdecimal() -> int\n\
4293\n\
4294Return 1 if there are only decimal characters in S,\n\
42950 otherwise.";
4296
4297static PyObject*
4298unicode_isdecimal(PyUnicodeObject *self)
4299{
4300    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4301    register const Py_UNICODE *e;
4302
4303    /* Shortcut for single character strings */
4304    if (PyUnicode_GET_SIZE(self) == 1 &&
4305	Py_UNICODE_ISDECIMAL(*p))
4306	return PyInt_FromLong(1);
4307
4308    /* Special case for empty strings */
4309    if (PyString_GET_SIZE(self) == 0)
4310	return PyInt_FromLong(0);
4311
4312    e = p + PyUnicode_GET_SIZE(self);
4313    for (; p < e; p++) {
4314	if (!Py_UNICODE_ISDECIMAL(*p))
4315	    return PyInt_FromLong(0);
4316    }
4317    return PyInt_FromLong(1);
4318}
4319
4320static char isdigit__doc__[] =
4321"S.isdigit() -> int\n\
4322\n\
4323Return 1 if there are only digit characters in S,\n\
43240 otherwise.";
4325
4326static PyObject*
4327unicode_isdigit(PyUnicodeObject *self)
4328{
4329    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4330    register const Py_UNICODE *e;
4331
4332    /* Shortcut for single character strings */
4333    if (PyUnicode_GET_SIZE(self) == 1 &&
4334	Py_UNICODE_ISDIGIT(*p))
4335	return PyInt_FromLong(1);
4336
4337    /* Special case for empty strings */
4338    if (PyString_GET_SIZE(self) == 0)
4339	return PyInt_FromLong(0);
4340
4341    e = p + PyUnicode_GET_SIZE(self);
4342    for (; p < e; p++) {
4343	if (!Py_UNICODE_ISDIGIT(*p))
4344	    return PyInt_FromLong(0);
4345    }
4346    return PyInt_FromLong(1);
4347}
4348
4349static char isnumeric__doc__[] =
4350"S.isnumeric() -> int\n\
4351\n\
4352Return 1 if there are only numeric characters in S,\n\
43530 otherwise.";
4354
4355static PyObject*
4356unicode_isnumeric(PyUnicodeObject *self)
4357{
4358    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4359    register const Py_UNICODE *e;
4360
4361    /* Shortcut for single character strings */
4362    if (PyUnicode_GET_SIZE(self) == 1 &&
4363	Py_UNICODE_ISNUMERIC(*p))
4364	return PyInt_FromLong(1);
4365
4366    /* Special case for empty strings */
4367    if (PyString_GET_SIZE(self) == 0)
4368	return PyInt_FromLong(0);
4369
4370    e = p + PyUnicode_GET_SIZE(self);
4371    for (; p < e; p++) {
4372	if (!Py_UNICODE_ISNUMERIC(*p))
4373	    return PyInt_FromLong(0);
4374    }
4375    return PyInt_FromLong(1);
4376}
4377
/* Docstring exposed for the join() method. */
static char join__doc__[] =
"S.join(sequence) -> unicode\n\
\n\
Return a string which is the concatenation of the strings in the\n\
sequence.  The separator between elements is S.";

/* Implementation of S.join(sequence): forwards self (the separator) and
   data (the sequence) to PyUnicode_Join() and returns its result. */
static PyObject*
unicode_join(PyObject *self, PyObject *data)
{
    return PyUnicode_Join(self, data);
}
4389
/* Return the value of the object's length field. */
static int
unicode_length(PyUnicodeObject *self)
{
    return self->length;
}
4395
4396static char ljust__doc__[] =
4397"S.ljust(width) -> unicode\n\
4398\n\
4399Return S left justified in a Unicode string of length width. Padding is\n\
4400done using spaces.";
4401
4402static PyObject *
4403unicode_ljust(PyUnicodeObject *self, PyObject *args)
4404{
4405    int width;
4406    if (!PyArg_ParseTuple(args, "i:ljust", &width))
4407        return NULL;
4408
4409    if (self->length >= width && PyUnicode_CheckExact(self)) {
4410        Py_INCREF(self);
4411        return (PyObject*) self;
4412    }
4413
4414    return (PyObject*) pad(self, 0, width - self->length, ' ');
4415}
4416
/* Docstring exposed for the lower() method. */
static char lower__doc__[] =
"S.lower() -> unicode\n\
\n\
Return a copy of the string S converted to lowercase.";

/* Implementation of S.lower(): hands self to fixup() together with the
   fixlower transformation and returns whatever fixup() returns. */
static PyObject*
unicode_lower(PyUnicodeObject *self)
{
    return fixup(self, fixlower);
}
4427
/* Docstring exposed for the lstrip() method. */
static char lstrip__doc__[] =
"S.lstrip() -> unicode\n\
\n\
Return a copy of the string S with leading whitespace removed.";

/* Implementation of S.lstrip(): calls strip() with left=1 and right=0,
   i.e. only the left end is stripped. */
static PyObject *
unicode_lstrip(PyUnicodeObject *self)
{
    return strip(self, 1, 0);
}
4438
4439static PyObject*
4440unicode_repeat(PyUnicodeObject *str, int len)
4441{
4442    PyUnicodeObject *u;
4443    Py_UNICODE *p;
4444    int nchars;
4445    size_t nbytes;
4446
4447    if (len < 0)
4448        len = 0;
4449
4450    if (len == 1 && PyUnicode_CheckExact(str)) {
4451        /* no repeat, return original string */
4452        Py_INCREF(str);
4453        return (PyObject*) str;
4454    }
4455
4456    /* ensure # of chars needed doesn't overflow int and # of bytes
4457     * needed doesn't overflow size_t
4458     */
4459    nchars = len * str->length;
4460    if (len && nchars / len != str->length) {
4461        PyErr_SetString(PyExc_OverflowError,
4462                        "repeated string is too long");
4463        return NULL;
4464    }
4465    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4466    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4467        PyErr_SetString(PyExc_OverflowError,
4468                        "repeated string is too long");
4469        return NULL;
4470    }
4471    u = _PyUnicode_New(nchars);
4472    if (!u)
4473        return NULL;
4474
4475    p = u->str;
4476
4477    while (len-- > 0) {
4478        Py_UNICODE_COPY(p, str->str, str->length);
4479        p += str->length;
4480    }
4481
4482    return (PyObject*) u;
4483}
4484
4485PyObject *PyUnicode_Replace(PyObject *obj,
4486			    PyObject *subobj,
4487			    PyObject *replobj,
4488			    int maxcount)
4489{
4490    PyObject *self;
4491    PyObject *str1;
4492    PyObject *str2;
4493    PyObject *result;
4494
4495    self = PyUnicode_FromObject(obj);
4496    if (self == NULL)
4497	return NULL;
4498    str1 = PyUnicode_FromObject(subobj);
4499    if (str1 == NULL) {
4500	Py_DECREF(self);
4501	return NULL;
4502    }
4503    str2 = PyUnicode_FromObject(replobj);
4504    if (str2 == NULL) {
4505	Py_DECREF(self);
4506	Py_DECREF(str1);
4507	return NULL;
4508    }
4509    result = replace((PyUnicodeObject *)self,
4510		     (PyUnicodeObject *)str1,
4511		     (PyUnicodeObject *)str2,
4512		     maxcount);
4513    Py_DECREF(self);
4514    Py_DECREF(str1);
4515    Py_DECREF(str2);
4516    return result;
4517}
4518
4519static char replace__doc__[] =
4520"S.replace (old, new[, maxsplit]) -> unicode\n\
4521\n\
4522Return a copy of S with all occurrences of substring\n\
4523old replaced by new.  If the optional argument maxsplit is\n\
4524given, only the first maxsplit occurrences are replaced.";
4525
4526static PyObject*
4527unicode_replace(PyUnicodeObject *self, PyObject *args)
4528{
4529    PyUnicodeObject *str1;
4530    PyUnicodeObject *str2;
4531    int maxcount = -1;
4532    PyObject *result;
4533
4534    if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4535        return NULL;
4536    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4537    if (str1 == NULL)
4538	return NULL;
4539    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4540    if (str2 == NULL)
4541	return NULL;
4542
4543    result = replace(self, str1, str2, maxcount);
4544
4545    Py_DECREF(str1);
4546    Py_DECREF(str2);
4547    return result;
4548}
4549
/* repr() implementation: passes the object's character buffer and size
   to unicodeescape_string() with 1 as the third argument and returns its
   result.
   NOTE(review): the third argument presumably requests quoted output;
   confirm against unicodeescape_string(). */
static
PyObject *unicode_repr(PyObject *unicode)
{
    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				PyUnicode_GET_SIZE(unicode),
				1);
}
4557
4558static char rfind__doc__[] =
4559"S.rfind(sub [,start [,end]]) -> int\n\
4560\n\
4561Return the highest index in S where substring sub is found,\n\
4562such that sub is contained within s[start,end].  Optional\n\
4563arguments start and end are interpreted as in slice notation.\n\
4564\n\
4565Return -1 on failure.";
4566
4567static PyObject *
4568unicode_rfind(PyUnicodeObject *self, PyObject *args)
4569{
4570    PyUnicodeObject *substring;
4571    int start = 0;
4572    int end = INT_MAX;
4573    PyObject *result;
4574
4575    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4576		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4577        return NULL;
4578    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4579						(PyObject *)substring);
4580    if (substring == NULL)
4581	return NULL;
4582
4583    result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4584
4585    Py_DECREF(substring);
4586    return result;
4587}
4588
4589static char rindex__doc__[] =
4590"S.rindex(sub [,start [,end]]) -> int\n\
4591\n\
4592Like S.rfind() but raise ValueError when the substring is not found.";
4593
4594static PyObject *
4595unicode_rindex(PyUnicodeObject *self, PyObject *args)
4596{
4597    int result;
4598    PyUnicodeObject *substring;
4599    int start = 0;
4600    int end = INT_MAX;
4601
4602    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4603		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4604        return NULL;
4605    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4606						(PyObject *)substring);
4607    if (substring == NULL)
4608	return NULL;
4609
4610    result = findstring(self, substring, start, end, -1);
4611
4612    Py_DECREF(substring);
4613    if (result < 0) {
4614        PyErr_SetString(PyExc_ValueError, "substring not found");
4615        return NULL;
4616    }
4617    return PyInt_FromLong(result);
4618}
4619
4620static char rjust__doc__[] =
4621"S.rjust(width) -> unicode\n\
4622\n\
4623Return S right justified in a Unicode string of length width. Padding is\n\
4624done using spaces.";
4625
4626static PyObject *
4627unicode_rjust(PyUnicodeObject *self, PyObject *args)
4628{
4629    int width;
4630    if (!PyArg_ParseTuple(args, "i:rjust", &width))
4631        return NULL;
4632
4633    if (self->length >= width && PyUnicode_CheckExact(self)) {
4634        Py_INCREF(self);
4635        return (PyObject*) self;
4636    }
4637
4638    return (PyObject*) pad(self, width - self->length, 0, ' ');
4639}
4640
4641static char rstrip__doc__[] =
4642"S.rstrip() -> unicode\n\
4643\n\
4644Return a copy of the string S with trailing whitespace removed.";
4645
4646static PyObject *
4647unicode_rstrip(PyUnicodeObject *self)
4648{
4649    return strip(self, 0, 1);
4650}
4651
4652static PyObject*
4653unicode_slice(PyUnicodeObject *self, int start, int end)
4654{
4655    /* standard clamping */
4656    if (start < 0)
4657        start = 0;
4658    if (end < 0)
4659        end = 0;
4660    if (end > self->length)
4661        end = self->length;
4662    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
4663        /* full slice, return original string */
4664        Py_INCREF(self);
4665        return (PyObject*) self;
4666    }
4667    if (start > end)
4668        start = end;
4669    /* copy slice */
4670    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4671					     end - start);
4672}
4673
4674PyObject *PyUnicode_Split(PyObject *s,
4675			  PyObject *sep,
4676			  int maxsplit)
4677{
4678    PyObject *result;
4679
4680    s = PyUnicode_FromObject(s);
4681    if (s == NULL)
4682	return NULL;
4683    if (sep != NULL) {
4684	sep = PyUnicode_FromObject(sep);
4685	if (sep == NULL) {
4686	    Py_DECREF(s);
4687	    return NULL;
4688	}
4689    }
4690
4691    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4692
4693    Py_DECREF(s);
4694    Py_XDECREF(sep);
4695    return result;
4696}
4697
4698static char split__doc__[] =
4699"S.split([sep [,maxsplit]]) -> list of strings\n\
4700\n\
4701Return a list of the words in S, using sep as the\n\
4702delimiter string.  If maxsplit is given, at most maxsplit\n\
4703splits are done. If sep is not specified, any whitespace string\n\
4704is a separator.";
4705
4706static PyObject*
4707unicode_split(PyUnicodeObject *self, PyObject *args)
4708{
4709    PyObject *substring = Py_None;
4710    int maxcount = -1;
4711
4712    if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4713        return NULL;
4714
4715    if (substring == Py_None)
4716	return split(self, NULL, maxcount);
4717    else if (PyUnicode_Check(substring))
4718	return split(self, (PyUnicodeObject *)substring, maxcount);
4719    else
4720	return PyUnicode_Split((PyObject *)self, substring, maxcount);
4721}
4722
4723static char splitlines__doc__[] =
4724"S.splitlines([keepends]]) -> list of strings\n\
4725\n\
4726Return a list of the lines in S, breaking at line boundaries.\n\
4727Line breaks are not included in the resulting list unless keepends\n\
4728is given and true.";
4729
4730static PyObject*
4731unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4732{
4733    int keepends = 0;
4734
4735    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4736        return NULL;
4737
4738    return PyUnicode_Splitlines((PyObject *)self, keepends);
4739}
4740
4741static
4742PyObject *unicode_str(PyUnicodeObject *self)
4743{
4744    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4745}
4746
4747static char strip__doc__[] =
4748"S.strip() -> unicode\n\
4749\n\
4750Return a copy of S with leading and trailing whitespace removed.";
4751
4752static PyObject *
4753unicode_strip(PyUnicodeObject *self)
4754{
4755    return strip(self, 1, 1);
4756}
4757
4758static char swapcase__doc__[] =
4759"S.swapcase() -> unicode\n\
4760\n\
4761Return a copy of S with uppercase characters converted to lowercase\n\
4762and vice versa.";
4763
4764static PyObject*
4765unicode_swapcase(PyUnicodeObject *self)
4766{
4767    return fixup(self, fixswapcase);
4768}
4769
4770static char translate__doc__[] =
4771"S.translate(table) -> unicode\n\
4772\n\
4773Return a copy of the string S, where all characters have been mapped\n\
4774through the given translation table, which must be a mapping of\n\
4775Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4776are left untouched. Characters mapped to None are deleted.";
4777
4778static PyObject*
4779unicode_translate(PyUnicodeObject *self, PyObject *table)
4780{
4781    return PyUnicode_TranslateCharmap(self->str,
4782				      self->length,
4783				      table,
4784				      "ignore");
4785}
4786
4787static char upper__doc__[] =
4788"S.upper() -> unicode\n\
4789\n\
4790Return a copy of S converted to uppercase.";
4791
4792static PyObject*
4793unicode_upper(PyUnicodeObject *self)
4794{
4795    return fixup(self, fixupper);
4796}
4797
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method implementation of S.zfill(width) (currently compiled out).

   A string that is already at least `width` long is returned as-is.
   Otherwise it is left-padded with '0' via pad(); if the original
   string started with a sign character, the sign is moved in front of
   the padding. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() builds a new string and presumably returns NULL when that
       fails; do not dereference the result in that case. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4833
#if 0
/* Debugging aid (compiled out): report the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size;

    size = PyInt_FromLong(unicode_freelist_size);
    return size;
}
#endif
4841
4842static char startswith__doc__[] =
4843"S.startswith(prefix[, start[, end]]) -> int\n\
4844\n\
4845Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4846optional start, test S beginning at that position.  With optional end, stop\n\
4847comparing S at that position.";
4848
4849static PyObject *
4850unicode_startswith(PyUnicodeObject *self,
4851		   PyObject *args)
4852{
4853    PyUnicodeObject *substring;
4854    int start = 0;
4855    int end = INT_MAX;
4856    PyObject *result;
4857
4858    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4859		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4860	return NULL;
4861    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4862						(PyObject *)substring);
4863    if (substring == NULL)
4864	return NULL;
4865
4866    result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4867
4868    Py_DECREF(substring);
4869    return result;
4870}
4871
4872
4873static char endswith__doc__[] =
4874"S.endswith(suffix[, start[, end]]) -> int\n\
4875\n\
4876Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4877optional start, test S beginning at that position.  With optional end, stop\n\
4878comparing S at that position.";
4879
4880static PyObject *
4881unicode_endswith(PyUnicodeObject *self,
4882		 PyObject *args)
4883{
4884    PyUnicodeObject *substring;
4885    int start = 0;
4886    int end = INT_MAX;
4887    PyObject *result;
4888
4889    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4890		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4891	return NULL;
4892    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4893						(PyObject *)substring);
4894    if (substring == NULL)
4895	return NULL;
4896
4897    result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4898
4899    Py_DECREF(substring);
4900    return result;
4901}
4902
4903
/* Method table for unicode objects.  Each entry lists the Python-level
   method name, the C function implementing it, the calling-convention
   flag (METH_VARARGS, METH_O or METH_NOARGS) and the docstring array.
   The table ends with a {NULL, NULL} sentinel entry. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    /* Disabled together with their (compiled-out) implementations. */
#if 0
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    {NULL, NULL}
};
4956
/* Sequence protocol slots for unicode objects.  The two assignment
   slots are 0: no item or slice assignment handler is provided. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, 		/* sq_length */
    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
    (intargfunc) unicode_repeat, 	/* sq_repeat */
    (intargfunc) unicode_getitem, 	/* sq_item */
    (intintargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
};
4967
4968static int
4969unicode_buffer_getreadbuf(PyUnicodeObject *self,
4970			  int index,
4971			  const void **ptr)
4972{
4973    if (index != 0) {
4974        PyErr_SetString(PyExc_SystemError,
4975			"accessing non-existent unicode segment");
4976        return -1;
4977    }
4978    *ptr = (void *) self->str;
4979    return PyUnicode_GET_DATA_SIZE(self);
4980}
4981
4982static int
4983unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4984			   const void **ptr)
4985{
4986    PyErr_SetString(PyExc_TypeError,
4987		    "cannot use unicode as modifyable buffer");
4988    return -1;
4989}
4990
4991static int
4992unicode_buffer_getsegcount(PyUnicodeObject *self,
4993			   int *lenp)
4994{
4995    if (lenp)
4996        *lenp = PyUnicode_GET_DATA_SIZE(self);
4997    return 1;
4998}
4999
5000static int
5001unicode_buffer_getcharbuf(PyUnicodeObject *self,
5002			  int index,
5003			  const void **ptr)
5004{
5005    PyObject *str;
5006
5007    if (index != 0) {
5008        PyErr_SetString(PyExc_SystemError,
5009			"accessing non-existent unicode segment");
5010        return -1;
5011    }
5012    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5013    if (str == NULL)
5014	return -1;
5015    *ptr = (void *) PyString_AS_STRING(str);
5016    return PyString_GET_SIZE(str);
5017}
5018
5019/* Helpers for PyUnicode_Format() */
5020
5021static PyObject *
5022getnextarg(PyObject *args, int arglen, int *p_argidx)
5023{
5024    int argidx = *p_argidx;
5025    if (argidx < arglen) {
5026	(*p_argidx)++;
5027	if (arglen < 0)
5028	    return args;
5029	else
5030	    return PyTuple_GetItem(args, argidx);
5031    }
5032    PyErr_SetString(PyExc_TypeError,
5033		    "not enough arguments for format string");
5034    return NULL;
5035}
5036
/* Bit flags collected while PyUnicode_Format() parses a conversion
   specifier; each one is set by the flag character noted. */
#define F_LJUST (1<<0)	/* '-' (also set for a negative '*' width) */
#define F_SIGN	(1<<1)	/* '+' */
#define F_BLANK (1<<2)	/* ' ' */
#define F_ALT	(1<<3)	/* '#' */
#define F_ZERO	(1<<4)	/* '0' */
5042
5043static
5044int usprintf(register Py_UNICODE *buffer, char *format, ...)
5045{
5046    register int i;
5047    int len;
5048    va_list va;
5049    char *charbuffer;
5050    va_start(va, format);
5051
5052    /* First, format the string as char array, then expand to Py_UNICODE
5053       array. */
5054    charbuffer = (char *)buffer;
5055    len = vsprintf(charbuffer, format, va);
5056    for (i = len - 1; i >= 0; i--)
5057	buffer[i] = (Py_UNICODE) charbuffer[i];
5058
5059    va_end(va);
5060    return len;
5061}
5062
5063static int
5064formatfloat(Py_UNICODE *buf,
5065	    size_t buflen,
5066	    int flags,
5067	    int prec,
5068	    int type,
5069	    PyObject *v)
5070{
5071    /* fmt = '%#.' + `prec` + `type`
5072       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5073    char fmt[20];
5074    double x;
5075
5076    x = PyFloat_AsDouble(v);
5077    if (x == -1.0 && PyErr_Occurred())
5078	return -1;
5079    if (prec < 0)
5080	prec = 6;
5081    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5082	type = 'g';
5083    sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
5084    /* worst case length calc to ensure no buffer overrun:
5085         fmt = %#.<prec>g
5086         buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5087            for any double rep.)
5088         len = 1 + prec + 1 + 2 + 5 = 9 + prec
5089       If prec=0 the effective precision is 1 (the leading digit is
5090       always given), therefore increase by one to 10+prec. */
5091    if (buflen <= (size_t)10 + (size_t)prec) {
5092	PyErr_SetString(PyExc_OverflowError,
5093	    "formatted float is too long (precision too long?)");
5094	return -1;
5095    }
5096    return usprintf(buf, fmt, x);
5097}
5098
5099static PyObject*
5100formatlong(PyObject *val, int flags, int prec, int type)
5101{
5102	char *buf;
5103	int i, len;
5104	PyObject *str; /* temporary string object. */
5105	PyUnicodeObject *result;
5106
5107	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5108	if (!str)
5109		return NULL;
5110	result = _PyUnicode_New(len);
5111	for (i = 0; i < len; i++)
5112		result->str[i] = buf[i];
5113	result->str[len] = 0;
5114	Py_DECREF(str);
5115	return (PyObject*)result;
5116}
5117
5118static int
5119formatint(Py_UNICODE *buf,
5120	  size_t buflen,
5121	  int flags,
5122	  int prec,
5123	  int type,
5124	  PyObject *v)
5125{
5126    /* fmt = '%#.' + `prec` + 'l' + `type`
5127       worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5128       + 1 + 1 = 24*/
5129    char fmt[64]; /* plenty big enough! */
5130    long x;
5131    int use_native_c_format = 1;
5132
5133    x = PyInt_AsLong(v);
5134    if (x == -1 && PyErr_Occurred())
5135	return -1;
5136    if (prec < 0)
5137	prec = 1;
5138    /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5139       worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5140    if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5141        PyErr_SetString(PyExc_OverflowError,
5142            "formatted integer is too long (precision too long?)");
5143        return -1;
5144    }
5145    /* When converting 0 under %#x or %#X, C leaves off the base marker,
5146     * but we want it (for consistency with other %#x conversions, and
5147     * for consistency with Python's hex() function).
5148     * BUG 28-Apr-2001 tim:  At least two platform Cs (Metrowerks &
5149     * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5150     * So add it only if the platform doesn't already.
5151     */
5152    if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5153        /* Only way to know what the platform does is to try it. */
5154        sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
5155        if (fmt[1] != (char)type) {
5156            /* Supply our own leading 0x/0X -- needed under std C */
5157            use_native_c_format = 0;
5158            sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
5159        }
5160    }
5161    if (use_native_c_format)
5162         sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
5163    return usprintf(buf, fmt, x);
5164}
5165
5166static int
5167formatchar(Py_UNICODE *buf,
5168           size_t buflen,
5169           PyObject *v)
5170{
5171    /* presume that the buffer is at least 2 characters long */
5172    if (PyUnicode_Check(v)) {
5173	if (PyUnicode_GET_SIZE(v) != 1)
5174	    goto onError;
5175	buf[0] = PyUnicode_AS_UNICODE(v)[0];
5176    }
5177
5178    else if (PyString_Check(v)) {
5179	if (PyString_GET_SIZE(v) != 1)
5180	    goto onError;
5181	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5182    }
5183
5184    else {
5185	/* Integer input truncated to a character */
5186        long x;
5187	x = PyInt_AsLong(v);
5188	if (x == -1 && PyErr_Occurred())
5189	    goto onError;
5190	buf[0] = (char) x;
5191    }
5192    buf[1] = '\0';
5193    return 1;
5194
5195 onError:
5196    PyErr_SetString(PyExc_TypeError,
5197		    "%c requires int or char");
5198    return -1;
5199}
5200
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted; it is used as the element count of a Py_UNICODE
   array. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
5210
5211PyObject *PyUnicode_Format(PyObject *format,
5212			   PyObject *args)
5213{
5214    Py_UNICODE *fmt, *res;
5215    int fmtcnt, rescnt, reslen, arglen, argidx;
5216    int args_owned = 0;
5217    PyUnicodeObject *result = NULL;
5218    PyObject *dict = NULL;
5219    PyObject *uformat;
5220
5221    if (format == NULL || args == NULL) {
5222	PyErr_BadInternalCall();
5223	return NULL;
5224    }
5225    uformat = PyUnicode_FromObject(format);
5226    if (uformat == NULL)
5227	return NULL;
5228    fmt = PyUnicode_AS_UNICODE(uformat);
5229    fmtcnt = PyUnicode_GET_SIZE(uformat);
5230
5231    reslen = rescnt = fmtcnt + 100;
5232    result = _PyUnicode_New(reslen);
5233    if (result == NULL)
5234	goto onError;
5235    res = PyUnicode_AS_UNICODE(result);
5236
5237    if (PyTuple_Check(args)) {
5238	arglen = PyTuple_Size(args);
5239	argidx = 0;
5240    }
5241    else {
5242	arglen = -1;
5243	argidx = -2;
5244    }
5245    if (args->ob_type->tp_as_mapping)
5246	dict = args;
5247
5248    while (--fmtcnt >= 0) {
5249	if (*fmt != '%') {
5250	    if (--rescnt < 0) {
5251		rescnt = fmtcnt + 100;
5252		reslen += rescnt;
5253		if (_PyUnicode_Resize(&result, reslen) < 0)
5254		    return NULL;
5255		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5256		--rescnt;
5257	    }
5258	    *res++ = *fmt++;
5259	}
5260	else {
5261	    /* Got a format specifier */
5262	    int flags = 0;
5263	    int width = -1;
5264	    int prec = -1;
5265	    Py_UNICODE c = '\0';
5266	    Py_UNICODE fill;
5267	    PyObject *v = NULL;
5268	    PyObject *temp = NULL;
5269	    Py_UNICODE *pbuf;
5270	    Py_UNICODE sign;
5271	    int len;
5272	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
5273
5274	    fmt++;
5275	    if (*fmt == '(') {
5276		Py_UNICODE *keystart;
5277		int keylen;
5278		PyObject *key;
5279		int pcount = 1;
5280
5281		if (dict == NULL) {
5282		    PyErr_SetString(PyExc_TypeError,
5283				    "format requires a mapping");
5284		    goto onError;
5285		}
5286		++fmt;
5287		--fmtcnt;
5288		keystart = fmt;
5289		/* Skip over balanced parentheses */
5290		while (pcount > 0 && --fmtcnt >= 0) {
5291		    if (*fmt == ')')
5292			--pcount;
5293		    else if (*fmt == '(')
5294			++pcount;
5295		    fmt++;
5296		}
5297		keylen = fmt - keystart - 1;
5298		if (fmtcnt < 0 || pcount > 0) {
5299		    PyErr_SetString(PyExc_ValueError,
5300				    "incomplete format key");
5301		    goto onError;
5302		}
5303		/* keys are converted to strings using UTF-8 and
5304		   then looked up since Python uses strings to hold
5305		   variables names etc. in its namespaces and we
5306		   wouldn't want to break common idioms. */
5307		key = PyUnicode_EncodeUTF8(keystart,
5308					   keylen,
5309					   NULL);
5310		if (key == NULL)
5311		    goto onError;
5312		if (args_owned) {
5313		    Py_DECREF(args);
5314		    args_owned = 0;
5315		}
5316		args = PyObject_GetItem(dict, key);
5317		Py_DECREF(key);
5318		if (args == NULL) {
5319		    goto onError;
5320		}
5321		args_owned = 1;
5322		arglen = -1;
5323		argidx = -2;
5324	    }
5325	    while (--fmtcnt >= 0) {
5326		switch (c = *fmt++) {
5327		case '-': flags |= F_LJUST; continue;
5328		case '+': flags |= F_SIGN; continue;
5329		case ' ': flags |= F_BLANK; continue;
5330		case '#': flags |= F_ALT; continue;
5331		case '0': flags |= F_ZERO; continue;
5332		}
5333		break;
5334	    }
5335	    if (c == '*') {
5336		v = getnextarg(args, arglen, &argidx);
5337		if (v == NULL)
5338		    goto onError;
5339		if (!PyInt_Check(v)) {
5340		    PyErr_SetString(PyExc_TypeError,
5341				    "* wants int");
5342		    goto onError;
5343		}
5344		width = PyInt_AsLong(v);
5345		if (width < 0) {
5346		    flags |= F_LJUST;
5347		    width = -width;
5348		}
5349		if (--fmtcnt >= 0)
5350		    c = *fmt++;
5351	    }
5352	    else if (c >= '0' && c <= '9') {
5353		width = c - '0';
5354		while (--fmtcnt >= 0) {
5355		    c = *fmt++;
5356		    if (c < '0' || c > '9')
5357			break;
5358		    if ((width*10) / 10 != width) {
5359			PyErr_SetString(PyExc_ValueError,
5360					"width too big");
5361			goto onError;
5362		    }
5363		    width = width*10 + (c - '0');
5364		}
5365	    }
5366	    if (c == '.') {
5367		prec = 0;
5368		if (--fmtcnt >= 0)
5369		    c = *fmt++;
5370		if (c == '*') {
5371		    v = getnextarg(args, arglen, &argidx);
5372		    if (v == NULL)
5373			goto onError;
5374		    if (!PyInt_Check(v)) {
5375			PyErr_SetString(PyExc_TypeError,
5376					"* wants int");
5377			goto onError;
5378		    }
5379		    prec = PyInt_AsLong(v);
5380		    if (prec < 0)
5381			prec = 0;
5382		    if (--fmtcnt >= 0)
5383			c = *fmt++;
5384		}
5385		else if (c >= '0' && c <= '9') {
5386		    prec = c - '0';
5387		    while (--fmtcnt >= 0) {
5388			c = Py_CHARMASK(*fmt++);
5389			if (c < '0' || c > '9')
5390			    break;
5391			if ((prec*10) / 10 != prec) {
5392			    PyErr_SetString(PyExc_ValueError,
5393					    "prec too big");
5394			    goto onError;
5395			}
5396			prec = prec*10 + (c - '0');
5397		    }
5398		}
5399	    } /* prec */
5400	    if (fmtcnt >= 0) {
5401		if (c == 'h' || c == 'l' || c == 'L') {
5402		    if (--fmtcnt >= 0)
5403			c = *fmt++;
5404		}
5405	    }
5406	    if (fmtcnt < 0) {
5407		PyErr_SetString(PyExc_ValueError,
5408				"incomplete format");
5409		goto onError;
5410	    }
5411	    if (c != '%') {
5412		v = getnextarg(args, arglen, &argidx);
5413		if (v == NULL)
5414		    goto onError;
5415	    }
5416	    sign = 0;
5417	    fill = ' ';
5418	    switch (c) {
5419
5420	    case '%':
5421		pbuf = formatbuf;
5422		/* presume that buffer length is at least 1 */
5423		pbuf[0] = '%';
5424		len = 1;
5425		break;
5426
5427	    case 's':
5428	    case 'r':
5429		if (PyUnicode_Check(v) && c == 's') {
5430		    temp = v;
5431		    Py_INCREF(temp);
5432		}
5433		else {
5434		    PyObject *unicode;
5435		    if (c == 's')
5436			temp = PyObject_Str(v);
5437		    else
5438			temp = PyObject_Repr(v);
5439		    if (temp == NULL)
5440			goto onError;
5441		    if (!PyString_Check(temp)) {
5442			/* XXX Note: this should never happen, since
5443   			       PyObject_Repr() and PyObject_Str() assure
5444			       this */
5445			Py_DECREF(temp);
5446			PyErr_SetString(PyExc_TypeError,
5447					"%s argument has non-string str()");
5448			goto onError;
5449		    }
5450		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5451						   PyString_GET_SIZE(temp),
5452					       NULL,
5453						   "strict");
5454		    Py_DECREF(temp);
5455		    temp = unicode;
5456		    if (temp == NULL)
5457			goto onError;
5458		}
5459		pbuf = PyUnicode_AS_UNICODE(temp);
5460		len = PyUnicode_GET_SIZE(temp);
5461		if (prec >= 0 && len > prec)
5462		    len = prec;
5463		break;
5464
5465	    case 'i':
5466	    case 'd':
5467	    case 'u':
5468	    case 'o':
5469	    case 'x':
5470	    case 'X':
5471		if (c == 'i')
5472		    c = 'd';
5473		if (PyLong_Check(v)) {
5474		    temp = formatlong(v, flags, prec, c);
5475		    if (!temp)
5476			goto onError;
5477		    pbuf = PyUnicode_AS_UNICODE(temp);
5478		    len = PyUnicode_GET_SIZE(temp);
5479		    /* unbounded ints can always produce
5480		       a sign character! */
5481		    sign = 1;
5482		}
5483		else {
5484		    pbuf = formatbuf;
5485		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5486				    flags, prec, c, v);
5487		    if (len < 0)
5488			goto onError;
5489		    /* only d conversion is signed */
5490		    sign = c == 'd';
5491		}
5492		if (flags & F_ZERO)
5493		    fill = '0';
5494		break;
5495
5496	    case 'e':
5497	    case 'E':
5498	    case 'f':
5499	    case 'g':
5500	    case 'G':
5501		pbuf = formatbuf;
5502		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5503			flags, prec, c, v);
5504		if (len < 0)
5505		    goto onError;
5506		sign = 1;
5507		if (flags & F_ZERO)
5508		    fill = '0';
5509		break;
5510
5511	    case 'c':
5512		pbuf = formatbuf;
5513		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5514		if (len < 0)
5515		    goto onError;
5516		break;
5517
5518	    default:
5519		PyErr_Format(PyExc_ValueError,
5520			     "unsupported format character '%c' (0x%x) "
5521			     "at index %i",
5522			     (31<=c && c<=126) ? c : '?',
5523                             c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5524		goto onError;
5525	    }
5526	    if (sign) {
5527		if (*pbuf == '-' || *pbuf == '+') {
5528		    sign = *pbuf++;
5529		    len--;
5530		}
5531		else if (flags & F_SIGN)
5532		    sign = '+';
5533		else if (flags & F_BLANK)
5534		    sign = ' ';
5535		else
5536		    sign = 0;
5537	    }
5538	    if (width < len)
5539		width = len;
5540	    if (rescnt < width + (sign != 0)) {
5541		reslen -= rescnt;
5542		rescnt = width + fmtcnt + 100;
5543		reslen += rescnt;
5544		if (_PyUnicode_Resize(&result, reslen) < 0)
5545		    return NULL;
5546		res = PyUnicode_AS_UNICODE(result)
5547		    + reslen - rescnt;
5548	    }
5549	    if (sign) {
5550		if (fill != ' ')
5551		    *res++ = sign;
5552		rescnt--;
5553		if (width > len)
5554		    width--;
5555	    }
5556	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5557		assert(pbuf[0] == '0');
5558		assert(pbuf[1] == c);
5559		if (fill != ' ') {
5560		    *res++ = *pbuf++;
5561		    *res++ = *pbuf++;
5562		}
5563		rescnt -= 2;
5564		width -= 2;
5565		if (width < 0)
5566		    width = 0;
5567		len -= 2;
5568	    }
5569	    if (width > len && !(flags & F_LJUST)) {
5570		do {
5571		    --rescnt;
5572		    *res++ = fill;
5573		} while (--width > len);
5574	    }
5575	    if (fill == ' ') {
5576		if (sign)
5577		    *res++ = sign;
5578		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5579		    assert(pbuf[0] == '0');
5580		    assert(pbuf[1] == c);
5581		    *res++ = *pbuf++;
5582		    *res++ = *pbuf++;
5583		}
5584	    }
5585	    Py_UNICODE_COPY(res, pbuf, len);
5586	    res += len;
5587	    rescnt -= len;
5588	    while (--width >= len) {
5589		--rescnt;
5590		*res++ = ' ';
5591	    }
5592	    if (dict && (argidx < arglen) && c != '%') {
5593		PyErr_SetString(PyExc_TypeError,
5594				"not all arguments converted");
5595		goto onError;
5596	    }
5597	    Py_XDECREF(temp);
5598	} /* '%' */
5599    } /* until end */
5600    if (argidx < arglen && !dict) {
5601	PyErr_SetString(PyExc_TypeError,
5602			"not all arguments converted");
5603	goto onError;
5604    }
5605
5606    if (args_owned) {
5607	Py_DECREF(args);
5608    }
5609    Py_DECREF(uformat);
5610    if (_PyUnicode_Resize(&result, reslen - rescnt))
5611	goto onError;
5612    return (PyObject *)result;
5613
5614 onError:
5615    Py_XDECREF(result);
5616    Py_DECREF(uformat);
5617    if (args_owned) {
5618	Py_DECREF(args);
5619    }
5620    return NULL;
5621}
5622
/* Buffer protocol slots for unicode objects, in the order read
   buffer, write buffer, segment count, char buffer. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
5629
5630staticforward PyObject *
5631unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5632
5633static PyObject *
5634unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5635{
5636        PyObject *x = NULL;
5637	static char *kwlist[] = {"string", "encoding", "errors", 0};
5638	char *encoding = NULL;
5639	char *errors = NULL;
5640
5641	if (type != &PyUnicode_Type)
5642		return unicode_subtype_new(type, args, kwds);
5643	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5644					  kwlist, &x, &encoding, &errors))
5645	    return NULL;
5646	if (x == NULL)
5647		return (PyObject *)_PyUnicode_New(0);
5648	if (encoding == NULL && errors == NULL)
5649	    return PyObject_Unicode(x);
5650	else
5651	return PyUnicode_FromEncodedObject(x, encoding, errors);
5652}
5653
5654static PyObject *
5655unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5656{
5657	PyUnicodeObject *tmp, *pnew;
5658	int n;
5659
5660	assert(PyType_IsSubtype(type, &PyUnicode_Type));
5661	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5662	if (tmp == NULL)
5663		return NULL;
5664	assert(PyUnicode_Check(tmp));
5665	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5666	if (pnew == NULL)
5667		return NULL;
5668	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5669	if (pnew->str == NULL) {
5670		_Py_ForgetReference((PyObject *)pnew);
5671		PyObject_DEL(pnew);
5672		return NULL;
5673	}
5674	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5675	pnew->length = n;
5676	pnew->hash = tmp->hash;
5677	Py_DECREF(tmp);
5678	return (PyObject *)pnew;
5679}
5680
/* Docstring for the unicode type (installed as tp_doc). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5687
/* Type object for "unicode".

   The initializer is positional: every entry must stay in the order of
   the PyTypeObject fields, and the trailing comments name the field
   being filled.  Entries set to 0 are not provided by this table.
   Instances are created through unicode_new (tp_new) and destroyed
   through unicode_dealloc (tp_dealloc). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0, 					/* ob_size */
    "unicode", 				/* tp_name */
    sizeof(PyUnicodeObject), 		/* tp_basicsize */
    0, 					/* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc, 	/* tp_dealloc */
    0, 					/* tp_print */
    0,				 	/* tp_getattr */
    0, 					/* tp_setattr */
    (cmpfunc) unicode_compare, 		/* tp_compare */
    (reprfunc) unicode_repr, 		/* tp_repr */
    0, 					/* tp_as_number */
    &unicode_as_sequence, 		/* tp_as_sequence */
    0, 					/* tp_as_mapping */
    (hashfunc) unicode_hash, 		/* tp_hash*/
    0, 					/* tp_call*/
    (reprfunc) unicode_str,	 	/* tp_str */
    PyObject_GenericGetAttr, 		/* tp_getattro */
    0,			 		/* tp_setattro */
    &unicode_as_buffer,			/* tp_as_buffer */
    /* BASETYPE: the type may be subclassed; subtype instances are built
       by unicode_subtype_new(). */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
    unicode_doc,			/* tp_doc */
    0,					/* tp_traverse */
    0,					/* tp_clear */
    0,					/* tp_richcompare */
    0,					/* tp_weaklistoffset */
    0,					/* tp_iter */
    0,					/* tp_iternext */
    unicode_methods,			/* tp_methods */
    0,					/* tp_members */
    0,					/* tp_getset */
    0,					/* tp_base */
    0,					/* tp_dict */
    0,					/* tp_descr_get */
    0,					/* tp_descr_set */
    0,					/* tp_dictoffset */
    0,					/* tp_init */
    /* NOTE(review): tp_alloc is left 0 here although
       unicode_subtype_new() calls type->tp_alloc; presumably the slot
       is filled in when the type is readied -- confirm. */
    0,					/* tp_alloc */
    unicode_new,			/* tp_new */
    _PyObject_Del,			/* tp_free */
};
5731
5732/* Initialize the Unicode implementation */
5733
5734void _PyUnicode_Init(void)
5735{
5736    int i;
5737
5738    /* Init the implementation */
5739    unicode_freelist = NULL;
5740    unicode_freelist_size = 0;
5741    unicode_empty = _PyUnicode_New(0);
5742    strcpy(unicode_default_encoding, "ascii");
5743    for (i = 0; i < 256; i++)
5744	unicode_latin1[i] = NULL;
5745}
5746
5747/* Finalize the Unicode implementation */
5748
5749void
5750_PyUnicode_Fini(void)
5751{
5752    PyUnicodeObject *u;
5753    int i;
5754
5755    Py_XDECREF(unicode_empty);
5756    unicode_empty = NULL;
5757
5758    for (i = 0; i < 256; i++) {
5759	if (unicode_latin1[i]) {
5760	    Py_DECREF(unicode_latin1[i]);
5761	    unicode_latin1[i] = NULL;
5762	}
5763    }
5764
5765    for (u = unicode_freelist; u != NULL;) {
5766	PyUnicodeObject *v = u;
5767	u = *(PyUnicodeObject **)u;
5768	if (v->str)
5769	    PyMem_DEL(v->str);
5770	Py_XDECREF(v->defenc);
5771	PyObject_DEL(v);
5772    }
5773    unicode_freelist = NULL;
5774    unicode_freelist_size = 0;
5775}
5776