unicodeobject.c revision dcc819a5c9e3cb60eba05a3c0b2547bc1fb28b80
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9--------------------------------------------------------------------
10The original string type implementation is:
11
12    Copyright (c) 1999 by Secret Labs AB
13    Copyright (c) 1999 by Fredrik Lundh
14
15By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
38
39#include "Python.h"
40
41#include "unicodeobject.h"
42#include "ucnhash.h"
43
44#ifdef MS_WIN32
45#include <windows.h>
46#endif
47
48/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE       1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54   The implementation will keep allocated Unicode memory intact for
55   all objects on the free list having a size less than this
56   limit. This reduces malloc() overhead for small Unicode objects.
57
58   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60   malloc()-overhead) bytes of unused garbage.
61
62   Setting the limit to 0 effectively turns the feature off.
63
64   Note: This is an experimental feature ! If you get core dumps when
65   using Unicode objects, turn this feature off.
66
67*/
68
69#define KEEPALIVE_SIZE_LIMIT       9
70
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
79/* --- Globals ------------------------------------------------------------
80
81   The globals are initialized by the _PyUnicode_Init() API and should
82   not be used before calling that API.
83
84*/
85
/* Free list for Unicode objects: head of a singly linked list of recycled
   objects (the link is stored in the first pointer-sized slot of each
   object) and the number of objects currently on that list. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance.  It may
   still be NULL before initialization, which is why the allocators test
   it before handing it out. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255); entries
   are filled in lazily by PyUnicode_FromUnicode(). */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

   The buffer holds a NUL-terminated encoding name of at most 99
   characters.

*/
static char unicode_default_encoding[100];
105
106Py_UNICODE
107PyUnicode_GetMax(void)
108{
109#ifdef Py_UNICODE_WIDE
110	return 0x10FFFF;
111#else
112	/* This is actually an illegal character, so it should
113	   not be passed to unichr. */
114	return 0xFFFF;
115#endif
116}
117
118/* --- Unicode Object ----------------------------------------------------- */
119
120static
121int unicode_resize(register PyUnicodeObject *unicode,
122                      int length)
123{
124    void *oldstr;
125
126    /* Shortcut if there's nothing much to do. */
127    if (unicode->length == length)
128	goto reset;
129
130    /* Resizing shared object (unicode_empty or single character
131       objects) in-place is not allowed. Use PyUnicode_Resize()
132       instead ! */
133    if (unicode == unicode_empty ||
134	(unicode->length == 1 &&
135	 unicode->str[0] < 256 &&
136	 unicode_latin1[unicode->str[0]] == unicode)) {
137        PyErr_SetString(PyExc_SystemError,
138                        "can't resize shared unicode objects");
139        return -1;
140    }
141
142    /* We allocate one more byte to make sure the string is
143       Ux0000 terminated -- XXX is this needed ? */
144    oldstr = unicode->str;
145    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146    if (!unicode->str) {
147	unicode->str = oldstr;
148        PyErr_NoMemory();
149        return -1;
150    }
151    unicode->str[length] = 0;
152    unicode->length = length;
153
154 reset:
155    /* Reset the object caches */
156    if (unicode->defenc) {
157        Py_DECREF(unicode->defenc);
158        unicode->defenc = NULL;
159    }
160    unicode->hash = -1;
161
162    return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166   Ux0000 terminated -- XXX is this needed ?
167
168   XXX This allocator could further be enhanced by assuring that the
169       free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176    register PyUnicodeObject *unicode;
177
178    /* Optimization for empty strings */
179    if (length == 0 && unicode_empty != NULL) {
180        Py_INCREF(unicode_empty);
181        return unicode_empty;
182    }
183
184    /* Unicode freelist & memory allocation */
185    if (unicode_freelist) {
186        unicode = unicode_freelist;
187        unicode_freelist = *(PyUnicodeObject **)unicode;
188        unicode_freelist_size--;
189	if (unicode->str) {
190	    /* Keep-Alive optimization: we only upsize the buffer,
191	       never downsize it. */
192	    if ((unicode->length < length) &&
193		unicode_resize(unicode, length)) {
194		PyMem_DEL(unicode->str);
195		goto onError;
196	    }
197	}
198        else {
199	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
200        }
201        PyObject_INIT(unicode, &PyUnicode_Type);
202    }
203    else {
204        unicode = PyMalloc_New(PyUnicodeObject, &PyUnicode_Type);
205        if (unicode == NULL)
206            return NULL;
207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208    }
209
210    if (!unicode->str) {
211	PyErr_NoMemory();
212	goto onError;
213    }
214    unicode->str[length] = 0;
215    unicode->length = length;
216    unicode->hash = -1;
217    unicode->defenc = NULL;
218    return unicode;
219
220 onError:
221    _Py_ForgetReference((PyObject *)unicode);
222    PyMalloc_Del(unicode);
223    return NULL;
224}
225
226static
227void unicode_dealloc(register PyUnicodeObject *unicode)
228{
229    if (PyUnicode_CheckExact(unicode) &&
230	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
231        /* Keep-Alive optimization */
232	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
233	    PyMem_DEL(unicode->str);
234	    unicode->str = NULL;
235	    unicode->length = 0;
236	}
237	if (unicode->defenc) {
238	    Py_DECREF(unicode->defenc);
239	    unicode->defenc = NULL;
240	}
241	/* Add to free list */
242        *(PyUnicodeObject **)unicode = unicode_freelist;
243        unicode_freelist = unicode;
244        unicode_freelist_size++;
245    }
246    else {
247	PyMem_DEL(unicode->str);
248	Py_XDECREF(unicode->defenc);
249	unicode->ob_type->tp_free((PyObject *)unicode);
250    }
251}
252
253int PyUnicode_Resize(PyObject **unicode,
254		     int length)
255{
256    register PyUnicodeObject *v;
257
258    /* Argument checks */
259    if (unicode == NULL) {
260	PyErr_BadInternalCall();
261	return -1;
262    }
263    v = (PyUnicodeObject *)*unicode;
264    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265	PyErr_BadInternalCall();
266	return -1;
267    }
268
269    /* Resizing unicode_empty and single character objects is not
270       possible since these are being shared. We simply return a fresh
271       copy with the same Unicode content. */
272    if (v->length != length &&
273	(v == unicode_empty || v->length == 1)) {
274	PyUnicodeObject *w = _PyUnicode_New(length);
275	if (w == NULL)
276	    return -1;
277	Py_UNICODE_COPY(w->str, v->str,
278			length < v->length ? length : v->length);
279	*unicode = (PyObject *)w;
280	return 0;
281    }
282
283    /* Note that we don't have to modify *unicode for unshared Unicode
284       objects, since we can modify them in-place. */
285    return unicode_resize(v, length);
286}
287
/* Internal API for use in unicodeobject.c only !

   Convenience wrapper around PyUnicode_Resize(): `unicodevar` is the
   address of a Unicode object pointer variable, which is cast to
   PyObject ** before the call. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293				int size)
294{
295    PyUnicodeObject *unicode;
296
297    /* If the Unicode data is known at construction time, we can apply
298       some optimizations which share commonly used objects. */
299    if (u != NULL) {
300
301	/* Optimization for empty strings */
302	if (size == 0 && unicode_empty != NULL) {
303	    Py_INCREF(unicode_empty);
304	    return (PyObject *)unicode_empty;
305	}
306
307	/* Single character Unicode objects in the Latin-1 range are
308	   shared when using this constructor */
309	if (size == 1 && *u < 256) {
310	    unicode = unicode_latin1[*u];
311	    if (!unicode) {
312		unicode = _PyUnicode_New(1);
313		if (!unicode)
314		    return NULL;
315		unicode->str[0] = *u;
316		unicode_latin1[*u] = unicode;
317	    }
318	    Py_INCREF(unicode);
319	    return (PyObject *)unicode;
320	}
321    }
322
323    unicode = _PyUnicode_New(size);
324    if (!unicode)
325        return NULL;
326
327    /* Copy the Unicode data into the new object */
328    if (u != NULL)
329	Py_UNICODE_COPY(unicode->str, u, size);
330
331    return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337				 int size)
338{
339    PyUnicodeObject *unicode;
340
341    if (w == NULL) {
342	PyErr_BadInternalCall();
343	return NULL;
344    }
345
346    unicode = _PyUnicode_New(size);
347    if (!unicode)
348        return NULL;
349
350    /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352    memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354    {
355	register Py_UNICODE *u;
356	register int i;
357	u = PyUnicode_AS_UNICODE(unicode);
358	for (i = size; i >= 0; i--)
359	    *u++ = *w++;
360    }
361#endif
362
363    return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367			 register wchar_t *w,
368			 int size)
369{
370    if (unicode == NULL) {
371	PyErr_BadInternalCall();
372	return -1;
373    }
374    if (size > PyUnicode_GET_SIZE(unicode))
375	size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377    memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379    {
380	register Py_UNICODE *u;
381	register int i;
382	u = PyUnicode_AS_UNICODE(unicode);
383	for (i = size; i >= 0; i--)
384	    *w++ = *u++;
385    }
386#endif
387
388    return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
395    /* XXX Perhaps we should make this API an alias of
396           PyObject_Unicode() instead ?! */
397    if (PyUnicode_CheckExact(obj)) {
398	Py_INCREF(obj);
399	return obj;
400    }
401    if (PyUnicode_Check(obj)) {
402	/* For a Unicode subtype that's not a Unicode object,
403	   return a true Unicode object with the same data. */
404	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405				     PyUnicode_GET_SIZE(obj));
406    }
407    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411				      const char *encoding,
412				      const char *errors)
413{
414    const char *s = NULL;
415    int len;
416    int owned = 0;
417    PyObject *v;
418
419    if (obj == NULL) {
420	PyErr_BadInternalCall();
421	return NULL;
422    }
423
424#if 0
425    /* For b/w compatibility we also accept Unicode objects provided
426       that no encodings is given and then redirect to
427       PyObject_Unicode() which then applies the additional logic for
428       Unicode subclasses.
429
430       NOTE: This API should really only be used for object which
431             represent *encoded* Unicode !
432
433    */
434	if (PyUnicode_Check(obj)) {
435	    if (encoding) {
436		PyErr_SetString(PyExc_TypeError,
437				"decoding Unicode is not supported");
438	    return NULL;
439	    }
440	return PyObject_Unicode(obj);
441	    }
442#else
443    if (PyUnicode_Check(obj)) {
444	PyErr_SetString(PyExc_TypeError,
445			"decoding Unicode is not supported");
446	return NULL;
447	}
448#endif
449
450    /* Coerce object */
451    if (PyString_Check(obj)) {
452	    s = PyString_AS_STRING(obj);
453	    len = PyString_GET_SIZE(obj);
454	    }
455    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456	/* Overwrite the error message with something more useful in
457	   case of a TypeError. */
458	if (PyErr_ExceptionMatches(PyExc_TypeError))
459	PyErr_Format(PyExc_TypeError,
460			 "coercing to Unicode: need string or buffer, "
461			 "%.80s found",
462		     obj->ob_type->tp_name);
463	goto onError;
464    }
465
466    /* Convert to Unicode */
467    if (len == 0) {
468	Py_INCREF(unicode_empty);
469	v = (PyObject *)unicode_empty;
470    }
471    else
472	v = PyUnicode_Decode(s, len, encoding, errors);
473
474    if (owned) {
475	Py_DECREF(obj);
476    }
477    return v;
478
479 onError:
480    if (owned) {
481	Py_DECREF(obj);
482    }
483    return NULL;
484}
485
486PyObject *PyUnicode_Decode(const char *s,
487			   int size,
488			   const char *encoding,
489			   const char *errors)
490{
491    PyObject *buffer = NULL, *unicode;
492
493    if (encoding == NULL)
494	encoding = PyUnicode_GetDefaultEncoding();
495
496    /* Shortcuts for common default encodings */
497    if (strcmp(encoding, "utf-8") == 0)
498        return PyUnicode_DecodeUTF8(s, size, errors);
499    else if (strcmp(encoding, "latin-1") == 0)
500        return PyUnicode_DecodeLatin1(s, size, errors);
501    else if (strcmp(encoding, "ascii") == 0)
502        return PyUnicode_DecodeASCII(s, size, errors);
503
504    /* Decode via the codec registry */
505    buffer = PyBuffer_FromMemory((void *)s, size);
506    if (buffer == NULL)
507        goto onError;
508    unicode = PyCodec_Decode(buffer, encoding, errors);
509    if (unicode == NULL)
510        goto onError;
511    if (!PyUnicode_Check(unicode)) {
512        PyErr_Format(PyExc_TypeError,
513                     "decoder did not return an unicode object (type=%.400s)",
514                     unicode->ob_type->tp_name);
515        Py_DECREF(unicode);
516        goto onError;
517    }
518    Py_DECREF(buffer);
519    return unicode;
520
521 onError:
522    Py_XDECREF(buffer);
523    return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527			   int size,
528			   const char *encoding,
529			   const char *errors)
530{
531    PyObject *v, *unicode;
532
533    unicode = PyUnicode_FromUnicode(s, size);
534    if (unicode == NULL)
535	return NULL;
536    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537    Py_DECREF(unicode);
538    return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542                                    const char *encoding,
543                                    const char *errors)
544{
545    PyObject *v;
546
547    if (!PyUnicode_Check(unicode)) {
548        PyErr_BadArgument();
549        goto onError;
550    }
551
552    if (encoding == NULL)
553	encoding = PyUnicode_GetDefaultEncoding();
554
555    /* Shortcuts for common default encodings */
556    if (errors == NULL) {
557	if (strcmp(encoding, "utf-8") == 0)
558	    return PyUnicode_AsUTF8String(unicode);
559	else if (strcmp(encoding, "latin-1") == 0)
560	    return PyUnicode_AsLatin1String(unicode);
561	else if (strcmp(encoding, "ascii") == 0)
562	    return PyUnicode_AsASCIIString(unicode);
563    }
564
565    /* Encode via the codec registry */
566    v = PyCodec_Encode(unicode, encoding, errors);
567    if (v == NULL)
568        goto onError;
569    /* XXX Should we really enforce this ? */
570    if (!PyString_Check(v)) {
571        PyErr_Format(PyExc_TypeError,
572                     "encoder did not return a string object (type=%.400s)",
573                     v->ob_type->tp_name);
574        Py_DECREF(v);
575        goto onError;
576    }
577    return v;
578
579 onError:
580    return NULL;
581}
582
583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584					    const char *errors)
585{
586    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588    if (v)
589        return v;
590    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591    if (v && errors == NULL)
592        ((PyUnicodeObject *)unicode)->defenc = v;
593    return v;
594}
595
596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598    if (!PyUnicode_Check(unicode)) {
599        PyErr_BadArgument();
600        goto onError;
601    }
602    return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605    return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610    if (!PyUnicode_Check(unicode)) {
611        PyErr_BadArgument();
612        goto onError;
613    }
614    return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617    return -1;
618}
619
620const char *PyUnicode_GetDefaultEncoding(void)
621{
622    return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627    PyObject *v;
628
629    /* Make sure the encoding is valid. As side effect, this also
630       loads the encoding into the codec registry cache. */
631    v = _PyCodec_Lookup(encoding);
632    if (v == NULL)
633	goto onError;
634    Py_DECREF(v);
635    strncpy(unicode_default_encoding,
636	    encoding,
637	    sizeof(unicode_default_encoding));
638    return 0;
639
640 onError:
641    return -1;
642}
643
644/* --- UTF-7 Codec -------------------------------------------------------- */
645
646/* see RFC2152 for details */
647
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
	   0 - not special
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional)

       The table is indexed by character code; each row below covers 16
       consecutive codes. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,   /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,   /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,   /* 0x30 - 0x3F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,   /* 0x50 - 0x5F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,   /* 0x70 - 0x7F */

};
666
/* True if character c must be base64-encoded in UTF-7: anything above
   127 or marked 1 in utf7_special, plus whitespace (class 2) when
   encodeWS is set and RFC2152 Set O (class 3) when encodeO is set.  The
   table is only consulted for c <= 127 thanks to short-circuiting. */
#define SPECIAL(c, encodeO, encodeWS) \
	(((c)>127 || utf7_special[(c)] == 1) || \
	 (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 alphabet character. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet.  Explicit ASCII range
   checks are used instead of isalnum(): the argument is a Py_UNICODE,
   which may lie outside the domain of the <ctype.h> functions, and the
   result must not depend on the current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* Inverse of B64(): map a base64 alphabet character to its 6-bit value.
   The argument must satisfy B64CHAR(). */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 characters to `out` for as long as the bit buffer `ch`
   holds at least 6 pending bits; `bits` is the number of pending bits. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Extract complete 16-bit units from the bit buffer `ch` (holding `bits`
   pending bits) and append them to `out`.  Units in the range
   0xDC00..0xDFFF are reported through the enclosing function's `errmsg`
   variable and `utf7Error` label; `surrogate` then suppresses output of
   the unit that follows. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct \
               or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
701
702static
703int utf7_decoding_error(Py_UNICODE **dest,
704                        const char *errors,
705                        const char *details)
706{
707    if ((errors == NULL) ||
708        (strcmp(errors,"strict") == 0)) {
709        PyErr_Format(PyExc_UnicodeError,
710                     "UTF-7 decoding error: %.400s",
711                     details);
712        return -1;
713    }
714    else if (strcmp(errors,"ignore") == 0) {
715        return 0;
716    }
717    else if (strcmp(errors,"replace") == 0) {
718        if (dest != NULL) {
719            **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720            (*dest)++;
721        }
722        return 0;
723    }
724    else {
725        PyErr_Format(PyExc_ValueError,
726                     "UTF-7 decoding error; unknown error handling code: %.400s",
727                     errors);
728        return -1;
729    }
730}
731
732PyObject *PyUnicode_DecodeUTF7(const char *s,
733			       int size,
734			       const char *errors)
735{
736    const char *e;
737    PyUnicodeObject *unicode;
738    Py_UNICODE *p;
739    const char *errmsg = "";
740    int inShift = 0;
741    unsigned int bitsleft = 0;
742    unsigned long charsleft = 0;
743	int surrogate = 0;
744
745    unicode = _PyUnicode_New(size);
746    if (!unicode)
747        return NULL;
748    if (size == 0)
749        return (PyObject *)unicode;
750
751    p = unicode->str;
752    e = s + size;
753
754    while (s < e) {
755        Py_UNICODE ch = *s;
756
757        if (inShift) {
758            if ((ch == '-') || !B64CHAR(ch)) {
759                inShift = 0;
760                s++;
761
762                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763                if (bitsleft >= 6) {
764                    /* The shift sequence has a partial character in it. If
765                       bitsleft < 6 then we could just classify it as padding
766                       but that is not the case here */
767
768                    errmsg = "partial character in shift sequence";
769                    goto utf7Error;
770                }
771                /* According to RFC2152 the remaining bits should be zero. We
772                   choose to signal an error/insert a replacement character
773                   here so indicate the potential of a misencoded character. */
774
775                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777                    errmsg = "non-zero padding bits in shift sequence";
778                    goto utf7Error;
779                }
780
781                if (ch == '-') {
782                    if ((s < e) && (*(s) == '-')) {
783                        *p++ = '-';
784                        inShift = 1;
785                    }
786                } else if (SPECIAL(ch,0,0)) {
787                    errmsg = "unexpected special character";
788	                goto utf7Error;
789                } else  {
790                    *p++ = ch;
791                }
792            } else {
793                charsleft = (charsleft << 6) | UB64(ch);
794                bitsleft += 6;
795                s++;
796                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
797            }
798        }
799        else if ( ch == '+' ) {
800            s++;
801            if (s < e && *s == '-') {
802                s++;
803                *p++ = '+';
804            } else
805            {
806                inShift = 1;
807                bitsleft = 0;
808            }
809        }
810        else if (SPECIAL(ch,0,0)) {
811            errmsg = "unexpected special character";
812            s++;
813	        goto utf7Error;
814        }
815        else {
816            *p++ = ch;
817            s++;
818        }
819        continue;
820    utf7Error:
821      if (utf7_decoding_error(&p, errors, errmsg))
822          goto onError;
823    }
824
825    if (inShift) {
826        if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827            goto onError;
828    }
829
830    if (_PyUnicode_Resize(&unicode, p - unicode->str))
831        goto onError;
832
833    return (PyObject *)unicode;
834
835onError:
836    Py_DECREF(unicode);
837    return NULL;
838}
839
840
/* Encode the `size` code units at `s` as UTF-7 (see RFC2152) and return
   the result as a new string object, or NULL on failure.

   encodeSetO and encodeWhiteSpace select whether the optional character
   classes of utf7_special (Set O, whitespace) are base64-encoded or
   written directly; they are passed straight to SPECIAL().  The `errors`
   argument is not referenced by this function.

   The encoder keeps a bit buffer (charsleft / bitsleft) while inside a
   shift sequence and flushes it six bits at a time through ENCODE(). */
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                   int size,
                   int encodeSetO,
                   int encodeWhiteSpace,
                   const char *errors)
{
    PyObject *v;
    /* It might be possible to tighten this worst case */
    unsigned int cbAllocated = 5 * size;
    int inShift = 0;
    int i = 0;
    unsigned int bitsleft = 0;
    unsigned long charsleft = 0;
    char * out;
    char * start;

    if (size == 0)
		return PyString_FromStringAndSize(NULL, 0);

    /* Allocate the worst-case output size up front; the string is shrunk
       to the bytes actually written at the end. */
    v = PyString_FromStringAndSize(NULL, cbAllocated);
    if (v == NULL)
        return NULL;

    start = out = PyString_AS_STRING(v);
    for (;i < size; ++i) {
        Py_UNICODE ch = s[i];

        if (!inShift) {
			if (ch == '+') {
				/* a literal '+' is written as "+-" */
				*out++ = '+';
                *out++ = '-';
            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* open a shift sequence and start encoding this unit */
                charsleft = ch;
                bitsleft = 16;
                *out++ = '+';
				/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
                inShift = bitsleft > 0;
			} else {
				*out++ = (char) ch;
			}
		} else {
            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* A directly encodable character ends the shift sequence:
                   flush the pending bits, padded to a full base64
                   character. */
                *out++ = B64(charsleft << (6-bitsleft));
                charsleft = 0;
                bitsleft = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (B64CHAR(ch) || ch == '-') {
                    *out++ = '-';
                }
                inShift = 0;
                *out++ = (char) ch;
            } else {
                /* Another special character: append its 16 bits to the
                   bit buffer and emit all complete 6-bit groups. */
                bitsleft += 16;
                charsleft = (charsleft << 16) | ch;
                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);

                /* If the next character is special then we don't need to terminate
                   the shift sequence. If the next character is not a BASE64 character
                   or '-' then the shift sequence will be terminated implicitly and we
                   don't have to insert a '-'. */

                if (bitsleft == 0) {
                    if (i + 1 < size) {
                        Py_UNICODE ch2 = s[i+1];

                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
                            /* stay in the shift sequence */

                        } else if (B64CHAR(ch2) || ch2 == '-') {
                            *out++ = '-';
                            inShift = 0;
                        } else {
                            inShift = 0;
                        }

                    }
                    else {
                        /* end of input: close the sequence explicitly */
                        *out++ = '-';
                        inShift = 0;
                    }
                }
            }
        }
	}
    /* Flush bits still pending at the end of the input and close the
       shift sequence. */
    if (bitsleft) {
        *out++= B64(charsleft << (6-bitsleft) );
        *out++ = '-';
    }

    /* Shrink the string to the number of bytes actually produced. */
    if (_PyString_Resize(&v, out - start)) {
        Py_DECREF(v);
        return NULL;
    }
    return v;
}
936
937#undef SPECIAL
938#undef B64
939#undef B64CHAR
940#undef UB64
941#undef ENCODE
942#undef DECODE
943
944/* --- UTF-8 Codec -------------------------------------------------------- */
945
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details

       The table is indexed by the first byte of a sequence; each row
       below covers 16 consecutive byte values. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 - 0x2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 - 0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 - 0x4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 - 0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 - 0x6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 - 0x8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 - 0x9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 - 0xAF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 - 0xBF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 - 0xCF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 - 0xDF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 - 0xEF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 - 0xFF */
};
967
968static
969int utf8_decoding_error(const char **source,
970                        Py_UNICODE **dest,
971                        const char *errors,
972                        const char *details)
973{
974    if ((errors == NULL) ||
975        (strcmp(errors,"strict") == 0)) {
976        PyErr_Format(PyExc_UnicodeError,
977                     "UTF-8 decoding error: %.400s",
978                     details);
979        return -1;
980    }
981    else if (strcmp(errors,"ignore") == 0) {
982        (*source)++;
983        return 0;
984    }
985    else if (strcmp(errors,"replace") == 0) {
986        (*source)++;
987        **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
988        (*dest)++;
989        return 0;
990    }
991    else {
992        PyErr_Format(PyExc_ValueError,
993                     "UTF-8 decoding error; unknown error handling code: %.400s",
994                     errors);
995        return -1;
996    }
997}
998
999PyObject *PyUnicode_DecodeUTF8(const char *s,
1000			       int size,
1001			       const char *errors)
1002{
1003    int n;
1004    const char *e;
1005    PyUnicodeObject *unicode;
1006    Py_UNICODE *p;
1007    const char *errmsg = "";
1008
1009    /* Note: size will always be longer than the resulting Unicode
1010       character count */
1011    unicode = _PyUnicode_New(size);
1012    if (!unicode)
1013        return NULL;
1014    if (size == 0)
1015        return (PyObject *)unicode;
1016
1017    /* Unpack UTF-8 encoded data */
1018    p = unicode->str;
1019    e = s + size;
1020
1021    while (s < e) {
1022        Py_UCS4 ch = (unsigned char)*s;
1023
1024        if (ch < 0x80) {
1025            *p++ = (Py_UNICODE)ch;
1026            s++;
1027            continue;
1028        }
1029
1030        n = utf8_code_length[ch];
1031
1032        if (s + n > e) {
1033	    errmsg = "unexpected end of data";
1034	    goto utf8Error;
1035	}
1036
1037        switch (n) {
1038
1039        case 0:
1040            errmsg = "unexpected code byte";
1041	    goto utf8Error;
1042
1043        case 1:
1044            errmsg = "internal error";
1045	    goto utf8Error;
1046
1047        case 2:
1048            if ((s[1] & 0xc0) != 0x80) {
1049                errmsg = "invalid data";
1050		goto utf8Error;
1051	    }
1052            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1053            if (ch < 0x80) {
1054                errmsg = "illegal encoding";
1055		goto utf8Error;
1056	    }
1057	    else
1058		*p++ = (Py_UNICODE)ch;
1059            break;
1060
1061        case 3:
1062            if ((s[1] & 0xc0) != 0x80 ||
1063                (s[2] & 0xc0) != 0x80) {
1064                errmsg = "invalid data";
1065		goto utf8Error;
1066	    }
1067            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1068            if (ch < 0x0800) {
1069		/* Note: UTF-8 encodings of surrogates are considered
1070		   legal UTF-8 sequences;
1071
1072		   XXX For wide builds (UCS-4) we should probably try
1073		       to recombine the surrogates into a single code
1074		       unit.
1075		*/
1076                errmsg = "illegal encoding";
1077		goto utf8Error;
1078	    }
1079	    else
1080		*p++ = (Py_UNICODE)ch;
1081            break;
1082
1083        case 4:
1084            if ((s[1] & 0xc0) != 0x80 ||
1085                (s[2] & 0xc0) != 0x80 ||
1086                (s[3] & 0xc0) != 0x80) {
1087                errmsg = "invalid data";
1088		goto utf8Error;
1089	    }
1090            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1091                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1092            /* validate and convert to UTF-16 */
1093            if ((ch < 0x10000)        /* minimum value allowed for 4
1094					 byte encoding */
1095                || (ch > 0x10ffff))   /* maximum value allowed for
1096					 UTF-16 */
1097	    {
1098                errmsg = "illegal encoding";
1099		goto utf8Error;
1100	    }
1101#ifdef Py_UNICODE_WIDE
1102	    *p++ = (Py_UNICODE)ch;
1103#else
1104            /*  compute and append the two surrogates: */
1105
1106            /*  translate from 10000..10FFFF to 0..FFFF */
1107            ch -= 0x10000;
1108
1109            /*  high surrogate = top 10 bits added to D800 */
1110            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1111
1112            /*  low surrogate = bottom 10 bits added to DC00 */
1113            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1114#endif
1115            break;
1116
1117        default:
1118            /* Other sizes are only needed for UCS-4 */
1119            errmsg = "unsupported Unicode code range";
1120	    goto utf8Error;
1121        }
1122        s += n;
1123	continue;
1124
1125    utf8Error:
1126      if (utf8_decoding_error(&s, &p, errors, errmsg))
1127          goto onError;
1128    }
1129
1130    /* Adjust length */
1131    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1132        goto onError;
1133
1134    return (PyObject *)unicode;
1135
1136onError:
1137    Py_DECREF(unicode);
1138    return NULL;
1139}
1140
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error handling scheme named by errors to a UTF-8 encoding
   problem described by details.

   NULL or "strict" raises UnicodeError and returns -1; "ignore"
   returns 0; "replace" stores '?' through *dest, advances *dest and
   returns 0; any other name raises ValueError and returns -1.
   source is not used. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
1174
1175PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1176			       int size,
1177			       const char *errors)
1178{
1179    PyObject *v;
1180    char *p;
1181    unsigned int cbAllocated = 2 * size;
1182    unsigned int cbWritten = 0;
1183    int i = 0;
1184
1185    /* Short-cut for emtpy strings */
1186    if (size == 0)
1187	return PyString_FromStringAndSize(NULL, 0);
1188
1189    /* We allocate 4 more bytes to have room for at least one full
1190       UTF-8 sequence; saves a few cycles in the loop below */
1191    v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
1192    if (v == NULL)
1193        return NULL;
1194
1195    p = PyString_AS_STRING(v);
1196    while (i < size) {
1197        Py_UCS4 ch = s[i++];
1198
1199        if (ch < 0x80) {
1200            *p++ = (char) ch;
1201            cbWritten++;
1202        }
1203
1204        else if (ch < 0x0800) {
1205            *p++ = (char)(0xc0 | (ch >> 6));
1206            *p++ = (char)(0x80 | (ch & 0x3f));
1207            cbWritten += 2;
1208        }
1209
1210        else {
1211
1212	    /* Assure that we have enough room for high order Unicode
1213	       ordinals */
1214	    if (cbWritten >= cbAllocated) {
1215		cbAllocated += 4 * 10;
1216		if (_PyString_Resize(&v, cbAllocated + 4))
1217		    goto onError;
1218		p = PyString_AS_STRING(v) + cbWritten;
1219	    }
1220
1221	    if (ch < 0x10000) {
1222		/* Check for high surrogate */
1223		if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1224		    Py_UCS4 ch2 = s[i];
1225		    /* Check for low surrogate */
1226		    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1227                        ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1228                        *p++ = (char)((ch >> 18) | 0xf0);
1229                        *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1230			*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1231			*p++ = (char)(0x80 | (ch & 0x3f));
1232                        i++;
1233                        cbWritten += 4;
1234			continue;
1235                    }
1236		    /* Fall through: handles isolated high surrogates */
1237                }
1238                *p++ = (char)(0xe0 | (ch >> 12));
1239		*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1240		*p++ = (char)(0x80 | (ch & 0x3f));
1241		cbWritten += 3;
1242
1243	    } else {
1244		*p++ = (char)(0xf0 | (ch>>18));
1245		*p++ = (char)(0x80 | ((ch>>12) & 0x3f));
1246		*p++ = (char)(0x80 | ((ch>>6) & 0x3f));
1247		*p++ = (char)(0x80 | (ch & 0x3f));
1248		cbWritten += 4;
1249	    }
1250	}
1251    }
1252    *p = '\0';
1253    if (_PyString_Resize(&v, cbWritten))
1254	goto onError;
1255    return v;
1256
1257 onError:
1258    Py_DECREF(v);
1259    return NULL;
1260}
1261
1262PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1263{
1264    if (!PyUnicode_Check(unicode)) {
1265        PyErr_BadArgument();
1266        return NULL;
1267    }
1268    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1269				PyUnicode_GET_SIZE(unicode),
1270				NULL);
1271}
1272
1273/* --- UTF-16 Codec ------------------------------------------------------- */
1274
1275static
1276int utf16_decoding_error(Py_UNICODE **dest,
1277			 const char *errors,
1278			 const char *details)
1279{
1280    if ((errors == NULL) ||
1281        (strcmp(errors,"strict") == 0)) {
1282        PyErr_Format(PyExc_UnicodeError,
1283                     "UTF-16 decoding error: %.400s",
1284                     details);
1285        return -1;
1286    }
1287    else if (strcmp(errors,"ignore") == 0) {
1288        return 0;
1289    }
1290    else if (strcmp(errors,"replace") == 0) {
1291	if (dest) {
1292	    **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1293	    (*dest)++;
1294	}
1295        return 0;
1296    }
1297    else {
1298        PyErr_Format(PyExc_ValueError,
1299                     "UTF-16 decoding error; "
1300		     "unknown error handling code: %.400s",
1301                     errors);
1302        return -1;
1303    }
1304}
1305
1306PyObject *
1307PyUnicode_DecodeUTF16(const char *s,
1308		      int size,
1309		      const char *errors,
1310		      int *byteorder)
1311{
1312    PyUnicodeObject *unicode;
1313    Py_UNICODE *p;
1314    const unsigned char *q, *e;
1315    int bo = 0;       /* assume native ordering by default */
1316    const char *errmsg = "";
1317    /* Offsets from q for retrieving byte pairs in the right order. */
1318#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1319    int ihi = 1, ilo = 0;
1320#else
1321    int ihi = 0, ilo = 1;
1322#endif
1323
1324    /* size should be an even number */
1325    if (size & 1) {
1326        if (utf16_decoding_error(NULL, errors, "truncated data"))
1327            return NULL;
1328        --size;  /* else ignore the oddball byte */
1329    }
1330
1331    /* Note: size will always be longer than the resulting Unicode
1332       character count */
1333    unicode = _PyUnicode_New(size);
1334    if (!unicode)
1335        return NULL;
1336    if (size == 0)
1337        return (PyObject *)unicode;
1338
1339    /* Unpack UTF-16 encoded data */
1340    p = unicode->str;
1341    q = (unsigned char *)s;
1342    e = q + size;
1343
1344    if (byteorder)
1345        bo = *byteorder;
1346
1347    /* Check for BOM marks (U+FEFF) in the input and adjust current
1348       byte order setting accordingly. In native mode, the leading BOM
1349       mark is skipped, in all other modes, it is copied to the output
1350       stream as-is (giving a ZWNBSP character). */
1351    if (bo == 0) {
1352        const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1353#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1354	if (bom == 0xFEFF) {
1355	    q += 2;
1356	    bo = -1;
1357	}
1358        else if (bom == 0xFFFE) {
1359	    q += 2;
1360	    bo = 1;
1361	}
1362#else
1363	if (bom == 0xFEFF) {
1364	    q += 2;
1365	    bo = 1;
1366	}
1367        else if (bom == 0xFFFE) {
1368	    q += 2;
1369	    bo = -1;
1370	}
1371#endif
1372    }
1373
1374    if (bo == -1) {
1375        /* force LE */
1376        ihi = 1;
1377        ilo = 0;
1378    }
1379    else if (bo == 1) {
1380        /* force BE */
1381        ihi = 0;
1382        ilo = 1;
1383    }
1384
1385    while (q < e) {
1386	Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1387	q += 2;
1388
1389	if (ch < 0xD800 || ch > 0xDFFF) {
1390	    *p++ = ch;
1391	    continue;
1392	}
1393
1394	/* UTF-16 code pair: */
1395	if (q >= e) {
1396	    errmsg = "unexpected end of data";
1397	    goto utf16Error;
1398	}
1399	if (0xD800 <= ch && ch <= 0xDBFF) {
1400	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1401	    q += 2;
1402	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1403#ifndef Py_UNICODE_WIDE
1404		*p++ = ch;
1405		*p++ = ch2;
1406#else
1407		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1408#endif
1409		continue;
1410	    }
1411	    else {
1412                errmsg = "illegal UTF-16 surrogate";
1413		goto utf16Error;
1414	    }
1415
1416	}
1417	errmsg = "illegal encoding";
1418	/* Fall through to report the error */
1419
1420    utf16Error:
1421	if (utf16_decoding_error(&p, errors, errmsg))
1422	    goto onError;
1423    }
1424
1425    if (byteorder)
1426        *byteorder = bo;
1427
1428    /* Adjust length */
1429    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1430        goto onError;
1431
1432    return (PyObject *)unicode;
1433
1434onError:
1435    Py_DECREF(unicode);
1436    return NULL;
1437}
1438
1439PyObject *
1440PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1441		      int size,
1442		      const char *errors,
1443		      int byteorder)
1444{
1445    PyObject *v;
1446    unsigned char *p;
1447    int i, pairs;
1448    /* Offsets from p for storing byte pairs in the right order. */
1449#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1450    int ihi = 1, ilo = 0;
1451#else
1452    int ihi = 0, ilo = 1;
1453#endif
1454
1455#define STORECHAR(CH)                   \
1456    do {                                \
1457        p[ihi] = ((CH) >> 8) & 0xff;    \
1458        p[ilo] = (CH) & 0xff;           \
1459        p += 2;                         \
1460    } while(0)
1461
1462    for (i = pairs = 0; i < size; i++)
1463	if (s[i] >= 0x10000)
1464	    pairs++;
1465    v = PyString_FromStringAndSize(NULL,
1466		  2 * (size + pairs + (byteorder == 0)));
1467    if (v == NULL)
1468        return NULL;
1469
1470    p = (unsigned char *)PyString_AS_STRING(v);
1471    if (byteorder == 0)
1472	STORECHAR(0xFEFF);
1473    if (size == 0)
1474        return v;
1475
1476    if (byteorder == -1) {
1477        /* force LE */
1478        ihi = 1;
1479        ilo = 0;
1480    }
1481    else if (byteorder == 1) {
1482        /* force BE */
1483        ihi = 0;
1484        ilo = 1;
1485    }
1486
1487    while (size-- > 0) {
1488	Py_UNICODE ch = *s++;
1489	Py_UNICODE ch2 = 0;
1490	if (ch >= 0x10000) {
1491	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1492	    ch  = 0xD800 | ((ch-0x10000) >> 10);
1493	}
1494        STORECHAR(ch);
1495        if (ch2)
1496            STORECHAR(ch2);
1497    }
1498    return v;
1499#undef STORECHAR
1500}
1501
1502PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1503{
1504    if (!PyUnicode_Check(unicode)) {
1505        PyErr_BadArgument();
1506        return NULL;
1507    }
1508    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1509				 PyUnicode_GET_SIZE(unicode),
1510				 NULL,
1511				 0);
1512}
1513
1514/* --- Unicode Escape Codec ----------------------------------------------- */
1515
1516static
1517int unicodeescape_decoding_error(Py_UNICODE **x,
1518                                 const char *errors,
1519                                 const char *details)
1520{
1521    if ((errors == NULL) ||
1522        (strcmp(errors,"strict") == 0)) {
1523        PyErr_Format(PyExc_UnicodeError,
1524                     "Unicode-Escape decoding error: %.400s",
1525                     details);
1526        return -1;
1527    }
1528    else if (strcmp(errors,"ignore") == 0) {
1529        return 0;
1530    }
1531    else if (strcmp(errors,"replace") == 0) {
1532        **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1533	(*x)++;
1534        return 0;
1535    }
1536    else {
1537        PyErr_Format(PyExc_ValueError,
1538                     "Unicode-Escape decoding error; "
1539                     "unknown error handling code: %.400s",
1540                     errors);
1541        return -1;
1542    }
1543}
1544
1545static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1546
1547PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1548					int size,
1549					const char *errors)
1550{
1551    PyUnicodeObject *v;
1552    Py_UNICODE *p, *buf;
1553    const char *end;
1554    char* message;
1555    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1556
1557    /* Escaped strings will always be longer than the resulting
1558       Unicode string, so we start with size here and then reduce the
1559       length after conversion to the true value. */
1560    v = _PyUnicode_New(size);
1561    if (v == NULL)
1562        goto onError;
1563    if (size == 0)
1564        return (PyObject *)v;
1565
1566    p = buf = PyUnicode_AS_UNICODE(v);
1567    end = s + size;
1568
1569    while (s < end) {
1570        unsigned char c;
1571        Py_UNICODE x;
1572        int i, digits;
1573
1574        /* Non-escape characters are interpreted as Unicode ordinals */
1575        if (*s != '\\') {
1576            *p++ = (unsigned char) *s++;
1577            continue;
1578        }
1579
1580        /* \ - Escapes */
1581        s++;
1582        switch (*s++) {
1583
1584        /* \x escapes */
1585        case '\n': break;
1586        case '\\': *p++ = '\\'; break;
1587        case '\'': *p++ = '\''; break;
1588        case '\"': *p++ = '\"'; break;
1589        case 'b': *p++ = '\b'; break;
1590        case 'f': *p++ = '\014'; break; /* FF */
1591        case 't': *p++ = '\t'; break;
1592        case 'n': *p++ = '\n'; break;
1593        case 'r': *p++ = '\r'; break;
1594        case 'v': *p++ = '\013'; break; /* VT */
1595        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1596
1597        /* \OOO (octal) escapes */
1598        case '0': case '1': case '2': case '3':
1599        case '4': case '5': case '6': case '7':
1600            x = s[-1] - '0';
1601            if ('0' <= *s && *s <= '7') {
1602                x = (x<<3) + *s++ - '0';
1603                if ('0' <= *s && *s <= '7')
1604                    x = (x<<3) + *s++ - '0';
1605            }
1606            *p++ = x;
1607            break;
1608
1609        /* hex escapes */
1610        /* \xXX */
1611        case 'x':
1612            digits = 2;
1613            message = "truncated \\xXX escape";
1614            goto hexescape;
1615
1616        /* \uXXXX */
1617        case 'u':
1618            digits = 4;
1619            message = "truncated \\uXXXX escape";
1620            goto hexescape;
1621
1622        /* \UXXXXXXXX */
1623        case 'U':
1624            digits = 8;
1625            message = "truncated \\UXXXXXXXX escape";
1626        hexescape:
1627            chr = 0;
1628            for (i = 0; i < digits; i++) {
1629                c = (unsigned char) s[i];
1630                if (!isxdigit(c)) {
1631                    if (unicodeescape_decoding_error(&p, errors, message))
1632                        goto onError;
1633                    chr = 0xffffffff;
1634                    i++;
1635                    break;
1636                }
1637                chr = (chr<<4) & ~0xF;
1638                if (c >= '0' && c <= '9')
1639                    chr += c - '0';
1640                else if (c >= 'a' && c <= 'f')
1641                    chr += 10 + c - 'a';
1642                else
1643                    chr += 10 + c - 'A';
1644            }
1645            s += i;
1646	    if (chr == 0xffffffff)
1647		    /* _decoding_error will have already written into the
1648		       target buffer. */
1649		    break;
1650        store:
1651            /* when we get here, chr is a 32-bit unicode character */
1652            if (chr <= 0xffff)
1653                /* UCS-2 character */
1654                *p++ = (Py_UNICODE) chr;
1655            else if (chr <= 0x10ffff) {
1656                /* UCS-4 character. Either store directly, or as
1657		   surrogate pair. */
1658#ifdef Py_UNICODE_WIDE
1659                *p++ = chr;
1660#else
1661                chr -= 0x10000L;
1662                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1663                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1664#endif
1665            } else {
1666                if (unicodeescape_decoding_error(
1667                    &p, errors,
1668                    "illegal Unicode character")
1669                    )
1670                    goto onError;
1671            }
1672            break;
1673
1674        /* \N{name} */
1675        case 'N':
1676            message = "malformed \\N character escape";
1677            if (ucnhash_CAPI == NULL) {
1678                /* load the unicode data module */
1679                PyObject *m, *v;
1680                m = PyImport_ImportModule("unicodedata");
1681                if (m == NULL)
1682                    goto ucnhashError;
1683                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1684                Py_DECREF(m);
1685                if (v == NULL)
1686                    goto ucnhashError;
1687                ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1688                Py_DECREF(v);
1689                if (ucnhash_CAPI == NULL)
1690                    goto ucnhashError;
1691            }
1692            if (*s == '{') {
1693                const char *start = s+1;
1694                /* look for the closing brace */
1695                while (*s != '}' && s < end)
1696                    s++;
1697                if (s > start && s < end && *s == '}') {
1698                    /* found a name.  look it up in the unicode database */
1699                    message = "unknown Unicode character name";
1700                    s++;
1701                    if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1702                        goto store;
1703                }
1704            }
1705            if (unicodeescape_decoding_error(&p, errors, message))
1706                goto onError;
1707            break;
1708
1709        default:
1710	    if (s > end) {
1711		if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1712		    goto onError;
1713	    }
1714	    else {
1715		*p++ = '\\';
1716		*p++ = (unsigned char)s[-1];
1717	    }
1718            break;
1719        }
1720    }
1721    if (_PyUnicode_Resize(&v, (int)(p - buf)))
1722		goto onError;
1723    return (PyObject *)v;
1724
1725ucnhashError:
1726    PyErr_SetString(
1727        PyExc_UnicodeError,
1728        "\\N escapes not supported (can't load unicodedata module)"
1729        );
1730    return NULL;
1731
1732onError:
1733    Py_XDECREF(v);
1734    return NULL;
1735}
1736
1737/* Return a Unicode-Escape string version of the Unicode object.
1738
1739   If quotes is true, the string is enclosed in u"" or u'' quotes as
1740   appropriate.
1741
1742*/
1743
1744static const Py_UNICODE *findchar(const Py_UNICODE *s,
1745				  int size,
1746				  Py_UNICODE ch);
1747
1748static
1749PyObject *unicodeescape_string(const Py_UNICODE *s,
1750                               int size,
1751                               int quotes)
1752{
1753    PyObject *repr;
1754    char *p;
1755
1756    static const char *hexdigit = "0123456789abcdef";
1757
1758    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1759    if (repr == NULL)
1760        return NULL;
1761
1762    p = PyString_AS_STRING(repr);
1763
1764    if (quotes) {
1765        *p++ = 'u';
1766        *p++ = (findchar(s, size, '\'') &&
1767                !findchar(s, size, '"')) ? '"' : '\'';
1768    }
1769    while (size-- > 0) {
1770        Py_UNICODE ch = *s++;
1771
1772        /* Escape quotes */
1773        if (quotes &&
1774	    (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1775            *p++ = '\\';
1776            *p++ = (char) ch;
1777	    continue;
1778        }
1779
1780#ifdef Py_UNICODE_WIDE
1781        /* Map 21-bit characters to '\U00xxxxxx' */
1782        else if (ch >= 0x10000) {
1783	    int offset = p - PyString_AS_STRING(repr);
1784
1785	    /* Resize the string if necessary */
1786	    if (offset + 12 > PyString_GET_SIZE(repr)) {
1787		if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1788		    goto onError;
1789		p = PyString_AS_STRING(repr) + offset;
1790	    }
1791
1792            *p++ = '\\';
1793            *p++ = 'U';
1794            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1795            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1796            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1797            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1798            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1799            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1800            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1801            *p++ = hexdigit[ch & 0x0000000F];
1802	    continue;
1803        }
1804#endif
1805	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1806	else if (ch >= 0xD800 && ch < 0xDC00) {
1807	    Py_UNICODE ch2;
1808	    Py_UCS4 ucs;
1809
1810	    ch2 = *s++;
1811	    size--;
1812	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1813		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1814		*p++ = '\\';
1815		*p++ = 'U';
1816		*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1817		*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1818		*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1819		*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1820		*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1821		*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1822		*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1823		*p++ = hexdigit[ucs & 0x0000000F];
1824		continue;
1825	    }
1826	    /* Fall through: isolated surrogates are copied as-is */
1827	    s--;
1828	    size++;
1829	}
1830
1831        /* Map 16-bit characters to '\uxxxx' */
1832        if (ch >= 256) {
1833            *p++ = '\\';
1834            *p++ = 'u';
1835            *p++ = hexdigit[(ch >> 12) & 0x000F];
1836            *p++ = hexdigit[(ch >> 8) & 0x000F];
1837            *p++ = hexdigit[(ch >> 4) & 0x000F];
1838            *p++ = hexdigit[ch & 0x000F];
1839        }
1840
1841        /* Map special whitespace to '\t', \n', '\r' */
1842        else if (ch == '\t') {
1843            *p++ = '\\';
1844            *p++ = 't';
1845        }
1846        else if (ch == '\n') {
1847            *p++ = '\\';
1848            *p++ = 'n';
1849        }
1850        else if (ch == '\r') {
1851            *p++ = '\\';
1852            *p++ = 'r';
1853        }
1854
1855        /* Map non-printable US ASCII to '\xhh' */
1856        else if (ch < ' ' || ch >= 0x7F) {
1857            *p++ = '\\';
1858            *p++ = 'x';
1859            *p++ = hexdigit[(ch >> 4) & 0x000F];
1860            *p++ = hexdigit[ch & 0x000F];
1861        }
1862
1863        /* Copy everything else as-is */
1864        else
1865            *p++ = (char) ch;
1866    }
1867    if (quotes)
1868        *p++ = PyString_AS_STRING(repr)[1];
1869
1870    *p = '\0';
1871    if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
1872	goto onError;
1873
1874    return repr;
1875
1876 onError:
1877    Py_DECREF(repr);
1878    return NULL;
1879}
1880
1881PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1882					int size)
1883{
1884    return unicodeescape_string(s, size, 0);
1885}
1886
1887PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1888{
1889    if (!PyUnicode_Check(unicode)) {
1890        PyErr_BadArgument();
1891        return NULL;
1892    }
1893    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1894					 PyUnicode_GET_SIZE(unicode));
1895}
1896
1897/* --- Raw Unicode Escape Codec ------------------------------------------- */
1898
1899PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1900					   int size,
1901					   const char *errors)
1902{
1903    PyUnicodeObject *v;
1904    Py_UNICODE *p, *buf;
1905    const char *end;
1906    const char *bs;
1907
1908    /* Escaped strings will always be longer than the resulting
1909       Unicode string, so we start with size here and then reduce the
1910       length after conversion to the true value. */
1911    v = _PyUnicode_New(size);
1912    if (v == NULL)
1913	goto onError;
1914    if (size == 0)
1915	return (PyObject *)v;
1916    p = buf = PyUnicode_AS_UNICODE(v);
1917    end = s + size;
1918    while (s < end) {
1919	unsigned char c;
1920	Py_UCS4 x;
1921	int i;
1922
1923	/* Non-escape characters are interpreted as Unicode ordinals */
1924	if (*s != '\\') {
1925	    *p++ = (unsigned char)*s++;
1926	    continue;
1927	}
1928
1929	/* \u-escapes are only interpreted iff the number of leading
1930	   backslashes if odd */
1931	bs = s;
1932	for (;s < end;) {
1933	    if (*s != '\\')
1934		break;
1935	    *p++ = (unsigned char)*s++;
1936	}
1937	if (((s - bs) & 1) == 0 ||
1938	    s >= end ||
1939	    *s != 'u') {
1940	    continue;
1941	}
1942	p--;
1943	s++;
1944
1945	/* \uXXXX with 4 hex digits */
1946	for (x = 0, i = 0; i < 4; i++) {
1947	    c = (unsigned char)s[i];
1948	    if (!isxdigit(c)) {
1949		if (unicodeescape_decoding_error(&p, errors,
1950						 "truncated \\uXXXX"))
1951		    goto onError;
1952		x = 0xffffffff;
1953		i++;
1954		break;
1955	    }
1956	    x = (x<<4) & ~0xF;
1957	    if (c >= '0' && c <= '9')
1958		x += c - '0';
1959	    else if (c >= 'a' && c <= 'f')
1960		x += 10 + c - 'a';
1961	    else
1962		x += 10 + c - 'A';
1963	}
1964	s += i;
1965	if (x != 0xffffffff)
1966		*p++ = x;
1967    }
1968    if (_PyUnicode_Resize(&v, (int)(p - buf)))
1969	goto onError;
1970    return (PyObject *)v;
1971
1972 onError:
1973    Py_XDECREF(v);
1974    return NULL;
1975}
1976
1977PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1978					   int size)
1979{
1980    PyObject *repr;
1981    char *p;
1982    char *q;
1983
1984    static const char *hexdigit = "0123456789abcdef";
1985
1986    repr = PyString_FromStringAndSize(NULL, 6 * size);
1987    if (repr == NULL)
1988        return NULL;
1989    if (size == 0)
1990	return repr;
1991
1992    p = q = PyString_AS_STRING(repr);
1993    while (size-- > 0) {
1994        Py_UNICODE ch = *s++;
1995	/* Map 16-bit characters to '\uxxxx' */
1996	if (ch >= 256) {
1997            *p++ = '\\';
1998            *p++ = 'u';
1999            *p++ = hexdigit[(ch >> 12) & 0xf];
2000            *p++ = hexdigit[(ch >> 8) & 0xf];
2001            *p++ = hexdigit[(ch >> 4) & 0xf];
2002            *p++ = hexdigit[ch & 15];
2003        }
2004	/* Copy everything else as-is */
2005	else
2006            *p++ = (char) ch;
2007    }
2008    *p = '\0';
2009    if (_PyString_Resize(&repr, p - q))
2010	goto onError;
2011
2012    return repr;
2013
2014 onError:
2015    Py_DECREF(repr);
2016    return NULL;
2017}
2018
2019PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2020{
2021    if (!PyUnicode_Check(unicode)) {
2022	PyErr_BadArgument();
2023	return NULL;
2024    }
2025    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2026					    PyUnicode_GET_SIZE(unicode));
2027}
2028
2029/* --- Latin-1 Codec ------------------------------------------------------ */
2030
2031PyObject *PyUnicode_DecodeLatin1(const char *s,
2032				 int size,
2033				 const char *errors)
2034{
2035    PyUnicodeObject *v;
2036    Py_UNICODE *p;
2037
2038    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2039    if (size == 1 && *(unsigned char*)s < 256) {
2040	Py_UNICODE r = *(unsigned char*)s;
2041	return PyUnicode_FromUnicode(&r, 1);
2042    }
2043
2044    v = _PyUnicode_New(size);
2045    if (v == NULL)
2046	goto onError;
2047    if (size == 0)
2048	return (PyObject *)v;
2049    p = PyUnicode_AS_UNICODE(v);
2050    while (size-- > 0)
2051	*p++ = (unsigned char)*s++;
2052    return (PyObject *)v;
2053
2054 onError:
2055    Py_XDECREF(v);
2056    return NULL;
2057}
2058
2059static
2060int latin1_encoding_error(const Py_UNICODE **source,
2061			  char **dest,
2062			  const char *errors,
2063			  const char *details)
2064{
2065    if ((errors == NULL) ||
2066	(strcmp(errors,"strict") == 0)) {
2067	PyErr_Format(PyExc_UnicodeError,
2068		     "Latin-1 encoding error: %.400s",
2069		     details);
2070	return -1;
2071    }
2072    else if (strcmp(errors,"ignore") == 0) {
2073	return 0;
2074    }
2075    else if (strcmp(errors,"replace") == 0) {
2076	**dest = '?';
2077	(*dest)++;
2078	return 0;
2079    }
2080    else {
2081	PyErr_Format(PyExc_ValueError,
2082		     "Latin-1 encoding error; "
2083		     "unknown error handling code: %.400s",
2084		     errors);
2085	return -1;
2086    }
2087}
2088
2089PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2090				 int size,
2091				 const char *errors)
2092{
2093    PyObject *repr;
2094    char *s, *start;
2095
2096    repr = PyString_FromStringAndSize(NULL, size);
2097    if (repr == NULL)
2098        return NULL;
2099    if (size == 0)
2100	return repr;
2101
2102    s = PyString_AS_STRING(repr);
2103    start = s;
2104    while (size-- > 0) {
2105        Py_UNICODE ch = *p++;
2106	if (ch >= 256) {
2107	    if (latin1_encoding_error(&p, &s, errors,
2108				      "ordinal not in range(256)"))
2109		goto onError;
2110	}
2111	else
2112            *s++ = (char)ch;
2113    }
2114    /* Resize if error handling skipped some characters */
2115    if (s - start < PyString_GET_SIZE(repr))
2116	if (_PyString_Resize(&repr, s - start))
2117	    goto onError;
2118    return repr;
2119
2120 onError:
2121    Py_DECREF(repr);
2122    return NULL;
2123}
2124
2125PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2126{
2127    if (!PyUnicode_Check(unicode)) {
2128	PyErr_BadArgument();
2129	return NULL;
2130    }
2131    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2132				  PyUnicode_GET_SIZE(unicode),
2133				  NULL);
2134}
2135
2136/* --- 7-bit ASCII Codec -------------------------------------------------- */
2137
2138static
2139int ascii_decoding_error(const char **source,
2140			 Py_UNICODE **dest,
2141			 const char *errors,
2142			 const char *details)
2143{
2144    if ((errors == NULL) ||
2145	(strcmp(errors,"strict") == 0)) {
2146	PyErr_Format(PyExc_UnicodeError,
2147		     "ASCII decoding error: %.400s",
2148		     details);
2149	return -1;
2150    }
2151    else if (strcmp(errors,"ignore") == 0) {
2152	return 0;
2153    }
2154    else if (strcmp(errors,"replace") == 0) {
2155	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2156	(*dest)++;
2157	return 0;
2158    }
2159    else {
2160	PyErr_Format(PyExc_ValueError,
2161		     "ASCII decoding error; "
2162		     "unknown error handling code: %.400s",
2163		     errors);
2164	return -1;
2165    }
2166}
2167
2168PyObject *PyUnicode_DecodeASCII(const char *s,
2169				int size,
2170				const char *errors)
2171{
2172    PyUnicodeObject *v;
2173    Py_UNICODE *p;
2174
2175    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2176    if (size == 1 && *(unsigned char*)s < 128) {
2177	Py_UNICODE r = *(unsigned char*)s;
2178	return PyUnicode_FromUnicode(&r, 1);
2179    }
2180
2181    v = _PyUnicode_New(size);
2182    if (v == NULL)
2183	goto onError;
2184    if (size == 0)
2185	return (PyObject *)v;
2186    p = PyUnicode_AS_UNICODE(v);
2187    while (size-- > 0) {
2188	register unsigned char c;
2189
2190	c = (unsigned char)*s++;
2191	if (c < 128)
2192	    *p++ = c;
2193	else if (ascii_decoding_error(&s, &p, errors,
2194				      "ordinal not in range(128)"))
2195		goto onError;
2196    }
2197    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2198	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2199	    goto onError;
2200    return (PyObject *)v;
2201
2202 onError:
2203    Py_XDECREF(v);
2204    return NULL;
2205}
2206
2207static
2208int ascii_encoding_error(const Py_UNICODE **source,
2209			 char **dest,
2210			 const char *errors,
2211			 const char *details)
2212{
2213    if ((errors == NULL) ||
2214	(strcmp(errors,"strict") == 0)) {
2215	PyErr_Format(PyExc_UnicodeError,
2216		     "ASCII encoding error: %.400s",
2217		     details);
2218	return -1;
2219    }
2220    else if (strcmp(errors,"ignore") == 0) {
2221	return 0;
2222    }
2223    else if (strcmp(errors,"replace") == 0) {
2224	**dest = '?';
2225	(*dest)++;
2226	return 0;
2227    }
2228    else {
2229	PyErr_Format(PyExc_ValueError,
2230		     "ASCII encoding error; "
2231		     "unknown error handling code: %.400s",
2232		     errors);
2233	return -1;
2234    }
2235}
2236
2237PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2238				int size,
2239				const char *errors)
2240{
2241    PyObject *repr;
2242    char *s, *start;
2243
2244    repr = PyString_FromStringAndSize(NULL, size);
2245    if (repr == NULL)
2246        return NULL;
2247    if (size == 0)
2248	return repr;
2249
2250    s = PyString_AS_STRING(repr);
2251    start = s;
2252    while (size-- > 0) {
2253        Py_UNICODE ch = *p++;
2254	if (ch >= 128) {
2255	    if (ascii_encoding_error(&p, &s, errors,
2256				      "ordinal not in range(128)"))
2257		goto onError;
2258	}
2259	else
2260            *s++ = (char)ch;
2261    }
2262    /* Resize if error handling skipped some characters */
2263    if (s - start < PyString_GET_SIZE(repr))
2264	if (_PyString_Resize(&repr, s - start))
2265	    goto onError;
2266    return repr;
2267
2268 onError:
2269    Py_DECREF(repr);
2270    return NULL;
2271}
2272
2273PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2274{
2275    if (!PyUnicode_Check(unicode)) {
2276	PyErr_BadArgument();
2277	return NULL;
2278    }
2279    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2280				 PyUnicode_GET_SIZE(unicode),
2281				 NULL);
2282}
2283
2284#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
2285
2286/* --- MBCS codecs for Windows -------------------------------------------- */
2287
2288PyObject *PyUnicode_DecodeMBCS(const char *s,
2289				int size,
2290				const char *errors)
2291{
2292    PyUnicodeObject *v;
2293    Py_UNICODE *p;
2294
2295    /* First get the size of the result */
2296    DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2297    if (size > 0 && usize==0)
2298        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2299
2300    v = _PyUnicode_New(usize);
2301    if (v == NULL)
2302        return NULL;
2303    if (usize == 0)
2304	return (PyObject *)v;
2305    p = PyUnicode_AS_UNICODE(v);
2306    if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2307        Py_DECREF(v);
2308        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2309    }
2310
2311    return (PyObject *)v;
2312}
2313
2314PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2315				int size,
2316				const char *errors)
2317{
2318    PyObject *repr;
2319    char *s;
2320    DWORD mbcssize;
2321
2322    /* If there are no characters, bail now! */
2323    if (size==0)
2324	    return PyString_FromString("");
2325
2326    /* First get the size of the result */
2327    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2328    if (mbcssize==0)
2329        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2330
2331    repr = PyString_FromStringAndSize(NULL, mbcssize);
2332    if (repr == NULL)
2333        return NULL;
2334    if (mbcssize == 0)
2335        return repr;
2336
2337    /* Do the conversion */
2338    s = PyString_AS_STRING(repr);
2339    if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2340        Py_DECREF(repr);
2341        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2342    }
2343    return repr;
2344}
2345
2346#endif /* MS_WIN32 */
2347
2348/* --- Character Mapping Codec -------------------------------------------- */
2349
2350static
2351int charmap_decoding_error(const char **source,
2352			 Py_UNICODE **dest,
2353			 const char *errors,
2354			 const char *details)
2355{
2356    if ((errors == NULL) ||
2357	(strcmp(errors,"strict") == 0)) {
2358	PyErr_Format(PyExc_UnicodeError,
2359		     "charmap decoding error: %.400s",
2360		     details);
2361	return -1;
2362    }
2363    else if (strcmp(errors,"ignore") == 0) {
2364	return 0;
2365    }
2366    else if (strcmp(errors,"replace") == 0) {
2367	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2368	(*dest)++;
2369	return 0;
2370    }
2371    else {
2372	PyErr_Format(PyExc_ValueError,
2373		     "charmap decoding error; "
2374		     "unknown error handling code: %.400s",
2375		     errors);
2376	return -1;
2377    }
2378}
2379
2380PyObject *PyUnicode_DecodeCharmap(const char *s,
2381				  int size,
2382				  PyObject *mapping,
2383				  const char *errors)
2384{
2385    PyUnicodeObject *v;
2386    Py_UNICODE *p;
2387    int extrachars = 0;
2388
2389    /* Default to Latin-1 */
2390    if (mapping == NULL)
2391	return PyUnicode_DecodeLatin1(s, size, errors);
2392
2393    v = _PyUnicode_New(size);
2394    if (v == NULL)
2395	goto onError;
2396    if (size == 0)
2397	return (PyObject *)v;
2398    p = PyUnicode_AS_UNICODE(v);
2399    while (size-- > 0) {
2400	unsigned char ch = *s++;
2401	PyObject *w, *x;
2402
2403	/* Get mapping (char ordinal -> integer, Unicode char or None) */
2404	w = PyInt_FromLong((long)ch);
2405	if (w == NULL)
2406	    goto onError;
2407	x = PyObject_GetItem(mapping, w);
2408	Py_DECREF(w);
2409	if (x == NULL) {
2410	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2411		/* No mapping found means: mapping is undefined. */
2412		PyErr_Clear();
2413		x = Py_None;
2414		Py_INCREF(x);
2415	    } else
2416		goto onError;
2417	}
2418
2419	/* Apply mapping */
2420	if (PyInt_Check(x)) {
2421	    long value = PyInt_AS_LONG(x);
2422	    if (value < 0 || value > 65535) {
2423		PyErr_SetString(PyExc_TypeError,
2424				"character mapping must be in range(65536)");
2425		Py_DECREF(x);
2426		goto onError;
2427	    }
2428	    *p++ = (Py_UNICODE)value;
2429	}
2430	else if (x == Py_None) {
2431	    /* undefined mapping */
2432	    if (charmap_decoding_error(&s, &p, errors,
2433				       "character maps to <undefined>")) {
2434		Py_DECREF(x);
2435		goto onError;
2436	    }
2437	}
2438	else if (PyUnicode_Check(x)) {
2439	    int targetsize = PyUnicode_GET_SIZE(x);
2440
2441	    if (targetsize == 1)
2442		/* 1-1 mapping */
2443		*p++ = *PyUnicode_AS_UNICODE(x);
2444
2445	    else if (targetsize > 1) {
2446		/* 1-n mapping */
2447		if (targetsize > extrachars) {
2448		    /* resize first */
2449		    int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2450		    int needed = (targetsize - extrachars) + \
2451			         (targetsize << 2);
2452		    extrachars += needed;
2453		    if (_PyUnicode_Resize(&v,
2454					 PyUnicode_GET_SIZE(v) + needed)) {
2455			Py_DECREF(x);
2456			goto onError;
2457		    }
2458		    p = PyUnicode_AS_UNICODE(v) + oldpos;
2459		}
2460		Py_UNICODE_COPY(p,
2461				PyUnicode_AS_UNICODE(x),
2462				targetsize);
2463		p += targetsize;
2464		extrachars -= targetsize;
2465	    }
2466	    /* 1-0 mapping: skip the character */
2467	}
2468	else {
2469	    /* wrong return value */
2470	    PyErr_SetString(PyExc_TypeError,
2471		  "character mapping must return integer, None or unicode");
2472	    Py_DECREF(x);
2473	    goto onError;
2474	}
2475	Py_DECREF(x);
2476    }
2477    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2478	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2479	    goto onError;
2480    return (PyObject *)v;
2481
2482 onError:
2483    Py_XDECREF(v);
2484    return NULL;
2485}
2486
2487static
2488int charmap_encoding_error(const Py_UNICODE **source,
2489			   char **dest,
2490			   const char *errors,
2491			   const char *details)
2492{
2493    if ((errors == NULL) ||
2494	(strcmp(errors,"strict") == 0)) {
2495	PyErr_Format(PyExc_UnicodeError,
2496		     "charmap encoding error: %.400s",
2497		     details);
2498	return -1;
2499    }
2500    else if (strcmp(errors,"ignore") == 0) {
2501	return 0;
2502    }
2503    else if (strcmp(errors,"replace") == 0) {
2504	**dest = '?';
2505	(*dest)++;
2506	return 0;
2507    }
2508    else {
2509	PyErr_Format(PyExc_ValueError,
2510		     "charmap encoding error; "
2511		     "unknown error handling code: %.400s",
2512		     errors);
2513	return -1;
2514    }
2515}
2516
2517PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2518				  int size,
2519				  PyObject *mapping,
2520				  const char *errors)
2521{
2522    PyObject *v;
2523    char *s;
2524    int extrachars = 0;
2525
2526    /* Default to Latin-1 */
2527    if (mapping == NULL)
2528	return PyUnicode_EncodeLatin1(p, size, errors);
2529
2530    v = PyString_FromStringAndSize(NULL, size);
2531    if (v == NULL)
2532        return NULL;
2533    if (size == 0)
2534	return v;
2535    s = PyString_AS_STRING(v);
2536    while (size-- > 0) {
2537	Py_UNICODE ch = *p++;
2538	PyObject *w, *x;
2539
2540	/* Get mapping (Unicode ordinal -> string char, integer or None) */
2541	w = PyInt_FromLong((long)ch);
2542	if (w == NULL)
2543	    goto onError;
2544	x = PyObject_GetItem(mapping, w);
2545	Py_DECREF(w);
2546	if (x == NULL) {
2547	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2548		/* No mapping found means: mapping is undefined. */
2549		PyErr_Clear();
2550		x = Py_None;
2551		Py_INCREF(x);
2552	    } else
2553		goto onError;
2554	}
2555
2556	/* Apply mapping */
2557	if (PyInt_Check(x)) {
2558	    long value = PyInt_AS_LONG(x);
2559	    if (value < 0 || value > 255) {
2560		PyErr_SetString(PyExc_TypeError,
2561				"character mapping must be in range(256)");
2562		Py_DECREF(x);
2563		goto onError;
2564	    }
2565	    *s++ = (char)value;
2566	}
2567	else if (x == Py_None) {
2568	    /* undefined mapping */
2569	    if (charmap_encoding_error(&p, &s, errors,
2570				       "character maps to <undefined>")) {
2571		Py_DECREF(x);
2572		goto onError;
2573	    }
2574	}
2575	else if (PyString_Check(x)) {
2576	    int targetsize = PyString_GET_SIZE(x);
2577
2578	    if (targetsize == 1)
2579		/* 1-1 mapping */
2580		*s++ = *PyString_AS_STRING(x);
2581
2582	    else if (targetsize > 1) {
2583		/* 1-n mapping */
2584		if (targetsize > extrachars) {
2585		    /* resize first */
2586		    int oldpos = (int)(s - PyString_AS_STRING(v));
2587		    int needed = (targetsize - extrachars) + \
2588			         (targetsize << 2);
2589		    extrachars += needed;
2590		    if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2591			Py_DECREF(x);
2592			goto onError;
2593		    }
2594		    s = PyString_AS_STRING(v) + oldpos;
2595		}
2596		memcpy(s, PyString_AS_STRING(x), targetsize);
2597		s += targetsize;
2598		extrachars -= targetsize;
2599	    }
2600	    /* 1-0 mapping: skip the character */
2601	}
2602	else {
2603	    /* wrong return value */
2604	    PyErr_SetString(PyExc_TypeError,
2605		  "character mapping must return integer, None or unicode");
2606	    Py_DECREF(x);
2607	    goto onError;
2608	}
2609	Py_DECREF(x);
2610    }
2611    if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2612	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2613	    goto onError;
2614    return v;
2615
2616 onError:
2617    Py_DECREF(v);
2618    return NULL;
2619}
2620
2621PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2622				    PyObject *mapping)
2623{
2624    if (!PyUnicode_Check(unicode) || mapping == NULL) {
2625	PyErr_BadArgument();
2626	return NULL;
2627    }
2628    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2629				   PyUnicode_GET_SIZE(unicode),
2630				   mapping,
2631				   NULL);
2632}
2633
2634static
2635int translate_error(const Py_UNICODE **source,
2636		    Py_UNICODE **dest,
2637		    const char *errors,
2638		    const char *details)
2639{
2640    if ((errors == NULL) ||
2641	(strcmp(errors,"strict") == 0)) {
2642	PyErr_Format(PyExc_UnicodeError,
2643		     "translate error: %.400s",
2644		     details);
2645	return -1;
2646    }
2647    else if (strcmp(errors,"ignore") == 0) {
2648	return 0;
2649    }
2650    else if (strcmp(errors,"replace") == 0) {
2651	**dest = '?';
2652	(*dest)++;
2653	return 0;
2654    }
2655    else {
2656	PyErr_Format(PyExc_ValueError,
2657		     "translate error; "
2658		     "unknown error handling code: %.400s",
2659		     errors);
2660	return -1;
2661    }
2662}
2663
2664PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2665				     int size,
2666				     PyObject *mapping,
2667				     const char *errors)
2668{
2669    PyUnicodeObject *v;
2670    Py_UNICODE *p;
2671
2672    if (mapping == NULL) {
2673	PyErr_BadArgument();
2674	return NULL;
2675    }
2676
2677    /* Output will never be longer than input */
2678    v = _PyUnicode_New(size);
2679    if (v == NULL)
2680	goto onError;
2681    if (size == 0)
2682	goto done;
2683    p = PyUnicode_AS_UNICODE(v);
2684    while (size-- > 0) {
2685	Py_UNICODE ch = *s++;
2686	PyObject *w, *x;
2687
2688	/* Get mapping */
2689	w = PyInt_FromLong(ch);
2690	if (w == NULL)
2691	    goto onError;
2692	x = PyObject_GetItem(mapping, w);
2693	Py_DECREF(w);
2694	if (x == NULL) {
2695	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2696		/* No mapping found: default to 1-1 mapping */
2697		PyErr_Clear();
2698		*p++ = ch;
2699		continue;
2700	    }
2701	    goto onError;
2702	}
2703
2704	/* Apply mapping */
2705	if (PyInt_Check(x))
2706	    *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2707	else if (x == Py_None) {
2708	    /* undefined mapping */
2709	    if (translate_error(&s, &p, errors,
2710				"character maps to <undefined>")) {
2711		Py_DECREF(x);
2712		goto onError;
2713	    }
2714	}
2715	else if (PyUnicode_Check(x)) {
2716	    if (PyUnicode_GET_SIZE(x) != 1) {
2717		/* 1-n mapping */
2718		PyErr_SetString(PyExc_NotImplementedError,
2719				"1-n mappings are currently not implemented");
2720		Py_DECREF(x);
2721		goto onError;
2722	    }
2723	    *p++ = *PyUnicode_AS_UNICODE(x);
2724	}
2725	else {
2726	    /* wrong return value */
2727	    PyErr_SetString(PyExc_TypeError,
2728		  "translate mapping must return integer, None or unicode");
2729	    Py_DECREF(x);
2730	    goto onError;
2731	}
2732	Py_DECREF(x);
2733    }
2734    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2735	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2736	    goto onError;
2737
2738 done:
2739    return (PyObject *)v;
2740
2741 onError:
2742    Py_XDECREF(v);
2743    return NULL;
2744}
2745
2746PyObject *PyUnicode_Translate(PyObject *str,
2747			      PyObject *mapping,
2748			      const char *errors)
2749{
2750    PyObject *result;
2751
2752    str = PyUnicode_FromObject(str);
2753    if (str == NULL)
2754	goto onError;
2755    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2756					PyUnicode_GET_SIZE(str),
2757					mapping,
2758					errors);
2759    Py_DECREF(str);
2760    return result;
2761
2762 onError:
2763    Py_XDECREF(str);
2764    return NULL;
2765}
2766
2767/* --- Decimal Encoder ---------------------------------------------------- */
2768
2769int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2770			    int length,
2771			    char *output,
2772			    const char *errors)
2773{
2774    Py_UNICODE *p, *end;
2775
2776    if (output == NULL) {
2777	PyErr_BadArgument();
2778	return -1;
2779    }
2780
2781    p = s;
2782    end = s + length;
2783    while (p < end) {
2784	register Py_UNICODE ch = *p++;
2785	int decimal;
2786
2787	if (Py_UNICODE_ISSPACE(ch)) {
2788	    *output++ = ' ';
2789	    continue;
2790	}
2791	decimal = Py_UNICODE_TODECIMAL(ch);
2792	if (decimal >= 0) {
2793	    *output++ = '0' + decimal;
2794	    continue;
2795	}
2796	if (0 < ch && ch < 256) {
2797	    *output++ = (char)ch;
2798	    continue;
2799	}
2800	/* All other characters are considered invalid */
2801	if (errors == NULL || strcmp(errors, "strict") == 0) {
2802	    PyErr_SetString(PyExc_ValueError,
2803			    "invalid decimal Unicode string");
2804	    goto onError;
2805	}
2806	else if (strcmp(errors, "ignore") == 0)
2807	    continue;
2808	else if (strcmp(errors, "replace") == 0) {
2809	    *output++ = '?';
2810	    continue;
2811	}
2812    }
2813    /* 0-terminate the output string */
2814    *output++ = '\0';
2815    return 0;
2816
2817 onError:
2818    return -1;
2819}
2820
2821/* --- Helpers ------------------------------------------------------------ */
2822
2823static
2824int count(PyUnicodeObject *self,
2825	  int start,
2826	  int end,
2827	  PyUnicodeObject *substring)
2828{
2829    int count = 0;
2830
2831    if (start < 0)
2832        start += self->length;
2833    if (start < 0)
2834        start = 0;
2835    if (end > self->length)
2836        end = self->length;
2837    if (end < 0)
2838        end += self->length;
2839    if (end < 0)
2840        end = 0;
2841
2842    if (substring->length == 0)
2843	return (end - start + 1);
2844
2845    end -= substring->length;
2846
2847    while (start <= end)
2848        if (Py_UNICODE_MATCH(self, start, substring)) {
2849            count++;
2850            start += substring->length;
2851        } else
2852            start++;
2853
2854    return count;
2855}
2856
2857int PyUnicode_Count(PyObject *str,
2858		    PyObject *substr,
2859		    int start,
2860		    int end)
2861{
2862    int result;
2863
2864    str = PyUnicode_FromObject(str);
2865    if (str == NULL)
2866	return -1;
2867    substr = PyUnicode_FromObject(substr);
2868    if (substr == NULL) {
2869	Py_DECREF(str);
2870	return -1;
2871    }
2872
2873    result = count((PyUnicodeObject *)str,
2874		   start, end,
2875		   (PyUnicodeObject *)substr);
2876
2877    Py_DECREF(str);
2878    Py_DECREF(substr);
2879    return result;
2880}
2881
2882static
2883int findstring(PyUnicodeObject *self,
2884	       PyUnicodeObject *substring,
2885	       int start,
2886	       int end,
2887	       int direction)
2888{
2889    if (start < 0)
2890        start += self->length;
2891    if (start < 0)
2892        start = 0;
2893
2894    if (substring->length == 0)
2895        return start;
2896
2897    if (end > self->length)
2898        end = self->length;
2899    if (end < 0)
2900        end += self->length;
2901    if (end < 0)
2902        end = 0;
2903
2904    end -= substring->length;
2905
2906    if (direction < 0) {
2907        for (; end >= start; end--)
2908            if (Py_UNICODE_MATCH(self, end, substring))
2909                return end;
2910    } else {
2911        for (; start <= end; start++)
2912            if (Py_UNICODE_MATCH(self, start, substring))
2913                return start;
2914    }
2915
2916    return -1;
2917}
2918
2919int PyUnicode_Find(PyObject *str,
2920		   PyObject *substr,
2921		   int start,
2922		   int end,
2923		   int direction)
2924{
2925    int result;
2926
2927    str = PyUnicode_FromObject(str);
2928    if (str == NULL)
2929	return -1;
2930    substr = PyUnicode_FromObject(substr);
2931    if (substr == NULL) {
2932	Py_DECREF(substr);
2933	return -1;
2934    }
2935
2936    result = findstring((PyUnicodeObject *)str,
2937			(PyUnicodeObject *)substr,
2938			start, end, direction);
2939    Py_DECREF(str);
2940    Py_DECREF(substr);
2941    return result;
2942}
2943
2944static
2945int tailmatch(PyUnicodeObject *self,
2946	      PyUnicodeObject *substring,
2947	      int start,
2948	      int end,
2949	      int direction)
2950{
2951    if (start < 0)
2952        start += self->length;
2953    if (start < 0)
2954        start = 0;
2955
2956    if (substring->length == 0)
2957        return 1;
2958
2959    if (end > self->length)
2960        end = self->length;
2961    if (end < 0)
2962        end += self->length;
2963    if (end < 0)
2964        end = 0;
2965
2966    end -= substring->length;
2967    if (end < start)
2968	return 0;
2969
2970    if (direction > 0) {
2971	if (Py_UNICODE_MATCH(self, end, substring))
2972	    return 1;
2973    } else {
2974        if (Py_UNICODE_MATCH(self, start, substring))
2975	    return 1;
2976    }
2977
2978    return 0;
2979}
2980
2981int PyUnicode_Tailmatch(PyObject *str,
2982			PyObject *substr,
2983			int start,
2984			int end,
2985			int direction)
2986{
2987    int result;
2988
2989    str = PyUnicode_FromObject(str);
2990    if (str == NULL)
2991	return -1;
2992    substr = PyUnicode_FromObject(substr);
2993    if (substr == NULL) {
2994	Py_DECREF(substr);
2995	return -1;
2996    }
2997
2998    result = tailmatch((PyUnicodeObject *)str,
2999		       (PyUnicodeObject *)substr,
3000		       start, end, direction);
3001    Py_DECREF(str);
3002    Py_DECREF(substr);
3003    return result;
3004}
3005
3006static
3007const Py_UNICODE *findchar(const Py_UNICODE *s,
3008		     int size,
3009		     Py_UNICODE ch)
3010{
3011    /* like wcschr, but doesn't stop at NULL characters */
3012
3013    while (size-- > 0) {
3014        if (*s == ch)
3015            return s;
3016        s++;
3017    }
3018
3019    return NULL;
3020}
3021
3022/* Apply fixfct filter to the Unicode object self and return a
3023   reference to the modified object */
3024
3025static
3026PyObject *fixup(PyUnicodeObject *self,
3027		int (*fixfct)(PyUnicodeObject *s))
3028{
3029
3030    PyUnicodeObject *u;
3031
3032    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3033    if (u == NULL)
3034	return NULL;
3035
3036    Py_UNICODE_COPY(u->str, self->str, self->length);
3037
3038    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3039	/* fixfct should return TRUE if it modified the buffer. If
3040	   FALSE, return a reference to the original buffer instead
3041	   (to save space, not time) */
3042	Py_INCREF(self);
3043	Py_DECREF(u);
3044	return (PyObject*) self;
3045    }
3046    return (PyObject*) u;
3047}
3048
3049static
3050int fixupper(PyUnicodeObject *self)
3051{
3052    int len = self->length;
3053    Py_UNICODE *s = self->str;
3054    int status = 0;
3055
3056    while (len-- > 0) {
3057	register Py_UNICODE ch;
3058
3059	ch = Py_UNICODE_TOUPPER(*s);
3060	if (ch != *s) {
3061            status = 1;
3062	    *s = ch;
3063	}
3064        s++;
3065    }
3066
3067    return status;
3068}
3069
3070static
3071int fixlower(PyUnicodeObject *self)
3072{
3073    int len = self->length;
3074    Py_UNICODE *s = self->str;
3075    int status = 0;
3076
3077    while (len-- > 0) {
3078	register Py_UNICODE ch;
3079
3080	ch = Py_UNICODE_TOLOWER(*s);
3081	if (ch != *s) {
3082            status = 1;
3083	    *s = ch;
3084	}
3085        s++;
3086    }
3087
3088    return status;
3089}
3090
3091static
3092int fixswapcase(PyUnicodeObject *self)
3093{
3094    int len = self->length;
3095    Py_UNICODE *s = self->str;
3096    int status = 0;
3097
3098    while (len-- > 0) {
3099        if (Py_UNICODE_ISUPPER(*s)) {
3100            *s = Py_UNICODE_TOLOWER(*s);
3101            status = 1;
3102        } else if (Py_UNICODE_ISLOWER(*s)) {
3103            *s = Py_UNICODE_TOUPPER(*s);
3104            status = 1;
3105        }
3106        s++;
3107    }
3108
3109    return status;
3110}
3111
3112static
3113int fixcapitalize(PyUnicodeObject *self)
3114{
3115    int len = self->length;
3116    Py_UNICODE *s = self->str;
3117    int status = 0;
3118
3119    if (len == 0)
3120	return 0;
3121    if (Py_UNICODE_ISLOWER(*s)) {
3122	*s = Py_UNICODE_TOUPPER(*s);
3123	status = 1;
3124    }
3125    s++;
3126    while (--len > 0) {
3127        if (Py_UNICODE_ISUPPER(*s)) {
3128            *s = Py_UNICODE_TOLOWER(*s);
3129            status = 1;
3130        }
3131        s++;
3132    }
3133    return status;
3134}
3135
3136static
3137int fixtitle(PyUnicodeObject *self)
3138{
3139    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3140    register Py_UNICODE *e;
3141    int previous_is_cased;
3142
3143    /* Shortcut for single character strings */
3144    if (PyUnicode_GET_SIZE(self) == 1) {
3145	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3146	if (*p != ch) {
3147	    *p = ch;
3148	    return 1;
3149	}
3150	else
3151	    return 0;
3152    }
3153
3154    e = p + PyUnicode_GET_SIZE(self);
3155    previous_is_cased = 0;
3156    for (; p < e; p++) {
3157	register const Py_UNICODE ch = *p;
3158
3159	if (previous_is_cased)
3160	    *p = Py_UNICODE_TOLOWER(ch);
3161	else
3162	    *p = Py_UNICODE_TOTITLE(ch);
3163
3164	if (Py_UNICODE_ISLOWER(ch) ||
3165	    Py_UNICODE_ISUPPER(ch) ||
3166	    Py_UNICODE_ISTITLE(ch))
3167	    previous_is_cased = 1;
3168	else
3169	    previous_is_cased = 0;
3170    }
3171    return 1;
3172}
3173
3174PyObject *PyUnicode_Join(PyObject *separator,
3175			 PyObject *seq)
3176{
3177    Py_UNICODE *sep;
3178    int seplen;
3179    PyUnicodeObject *res = NULL;
3180    int reslen = 0;
3181    Py_UNICODE *p;
3182    int sz = 100;
3183    int i;
3184    PyObject *it;
3185
3186    it = PyObject_GetIter(seq);
3187    if (it == NULL)
3188        return NULL;
3189
3190    if (separator == NULL) {
3191	Py_UNICODE blank = ' ';
3192	sep = &blank;
3193	seplen = 1;
3194    }
3195    else {
3196	separator = PyUnicode_FromObject(separator);
3197	if (separator == NULL)
3198	    goto onError;
3199	sep = PyUnicode_AS_UNICODE(separator);
3200	seplen = PyUnicode_GET_SIZE(separator);
3201    }
3202
3203    res = _PyUnicode_New(sz);
3204    if (res == NULL)
3205	goto onError;
3206    p = PyUnicode_AS_UNICODE(res);
3207    reslen = 0;
3208
3209    for (i = 0; ; ++i) {
3210	int itemlen;
3211	PyObject *item = PyIter_Next(it);
3212	if (item == NULL) {
3213	    if (PyErr_Occurred())
3214		goto onError;
3215	    break;
3216	}
3217	if (!PyUnicode_Check(item)) {
3218	    PyObject *v;
3219	    if (!PyString_Check(item)) {
3220		PyErr_Format(PyExc_TypeError,
3221			     "sequence item %i: expected string or Unicode,"
3222			     " %.80s found",
3223			     i, item->ob_type->tp_name);
3224		Py_DECREF(item);
3225		goto onError;
3226	    }
3227	    v = PyUnicode_FromObject(item);
3228	    Py_DECREF(item);
3229	    item = v;
3230	    if (item == NULL)
3231		goto onError;
3232	}
3233	itemlen = PyUnicode_GET_SIZE(item);
3234	while (reslen + itemlen + seplen >= sz) {
3235	    if (_PyUnicode_Resize(&res, sz*2)) {
3236		Py_DECREF(item);
3237		goto onError;
3238	    }
3239	    sz *= 2;
3240	    p = PyUnicode_AS_UNICODE(res) + reslen;
3241	}
3242	if (i > 0) {
3243	    Py_UNICODE_COPY(p, sep, seplen);
3244	    p += seplen;
3245	    reslen += seplen;
3246	}
3247	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3248	p += itemlen;
3249	reslen += itemlen;
3250	Py_DECREF(item);
3251    }
3252    if (_PyUnicode_Resize(&res, reslen))
3253	goto onError;
3254
3255    Py_XDECREF(separator);
3256    Py_DECREF(it);
3257    return (PyObject *)res;
3258
3259 onError:
3260    Py_XDECREF(separator);
3261    Py_XDECREF(res);
3262    Py_DECREF(it);
3263    return NULL;
3264}
3265
3266static
3267PyUnicodeObject *pad(PyUnicodeObject *self,
3268		     int left,
3269		     int right,
3270		     Py_UNICODE fill)
3271{
3272    PyUnicodeObject *u;
3273
3274    if (left < 0)
3275        left = 0;
3276    if (right < 0)
3277        right = 0;
3278
3279    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3280        Py_INCREF(self);
3281        return self;
3282    }
3283
3284    u = _PyUnicode_New(left + self->length + right);
3285    if (u) {
3286        if (left)
3287            Py_UNICODE_FILL(u->str, fill, left);
3288        Py_UNICODE_COPY(u->str + left, self->str, self->length);
3289        if (right)
3290            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3291    }
3292
3293    return u;
3294}
3295
/* Append data[left:right] to `list` as a new Unicode object.

   Relies on the expanding function to provide a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label.  Wrapped in
   do { } while (0) so that it behaves as a single statement, with the
   arguments parenthesized against operator-precedence surprises. */
#define SPLIT_APPEND(data, left, right)					\
    do {								\
	str = PyUnicode_FromUnicode((data) + (left),			\
				    (right) - (left));			\
	if (!str)							\
	    goto onError;						\
	if (PyList_Append(list, str)) {					\
	    Py_DECREF(str);						\
	    goto onError;						\
	}								\
	Py_DECREF(str);							\
    } while (0)
3306
3307static
3308PyObject *split_whitespace(PyUnicodeObject *self,
3309			   PyObject *list,
3310			   int maxcount)
3311{
3312    register int i;
3313    register int j;
3314    int len = self->length;
3315    PyObject *str;
3316
3317    for (i = j = 0; i < len; ) {
3318	/* find a token */
3319	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3320	    i++;
3321	j = i;
3322	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3323	    i++;
3324	if (j < i) {
3325	    if (maxcount-- <= 0)
3326		break;
3327	    SPLIT_APPEND(self->str, j, i);
3328	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3329		i++;
3330	    j = i;
3331	}
3332    }
3333    if (j < len) {
3334	SPLIT_APPEND(self->str, j, len);
3335    }
3336    return list;
3337
3338 onError:
3339    Py_DECREF(list);
3340    return NULL;
3341}
3342
3343PyObject *PyUnicode_Splitlines(PyObject *string,
3344			       int keepends)
3345{
3346    register int i;
3347    register int j;
3348    int len;
3349    PyObject *list;
3350    PyObject *str;
3351    Py_UNICODE *data;
3352
3353    string = PyUnicode_FromObject(string);
3354    if (string == NULL)
3355	return NULL;
3356    data = PyUnicode_AS_UNICODE(string);
3357    len = PyUnicode_GET_SIZE(string);
3358
3359    list = PyList_New(0);
3360    if (!list)
3361        goto onError;
3362
3363    for (i = j = 0; i < len; ) {
3364	int eol;
3365
3366	/* Find a line and append it */
3367	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3368	    i++;
3369
3370	/* Skip the line break reading CRLF as one line break */
3371	eol = i;
3372	if (i < len) {
3373	    if (data[i] == '\r' && i + 1 < len &&
3374		data[i+1] == '\n')
3375		i += 2;
3376	    else
3377		i++;
3378	    if (keepends)
3379		eol = i;
3380	}
3381	SPLIT_APPEND(data, j, eol);
3382	j = i;
3383    }
3384    if (j < len) {
3385	SPLIT_APPEND(data, j, len);
3386    }
3387
3388    Py_DECREF(string);
3389    return list;
3390
3391 onError:
3392    Py_DECREF(list);
3393    Py_DECREF(string);
3394    return NULL;
3395}
3396
3397static
3398PyObject *split_char(PyUnicodeObject *self,
3399		     PyObject *list,
3400		     Py_UNICODE ch,
3401		     int maxcount)
3402{
3403    register int i;
3404    register int j;
3405    int len = self->length;
3406    PyObject *str;
3407
3408    for (i = j = 0; i < len; ) {
3409	if (self->str[i] == ch) {
3410	    if (maxcount-- <= 0)
3411		break;
3412	    SPLIT_APPEND(self->str, j, i);
3413	    i = j = i + 1;
3414	} else
3415	    i++;
3416    }
3417    if (j <= len) {
3418	SPLIT_APPEND(self->str, j, len);
3419    }
3420    return list;
3421
3422 onError:
3423    Py_DECREF(list);
3424    return NULL;
3425}
3426
3427static
3428PyObject *split_substring(PyUnicodeObject *self,
3429			  PyObject *list,
3430			  PyUnicodeObject *substring,
3431			  int maxcount)
3432{
3433    register int i;
3434    register int j;
3435    int len = self->length;
3436    int sublen = substring->length;
3437    PyObject *str;
3438
3439    for (i = j = 0; i <= len - sublen; ) {
3440	if (Py_UNICODE_MATCH(self, i, substring)) {
3441	    if (maxcount-- <= 0)
3442		break;
3443	    SPLIT_APPEND(self->str, j, i);
3444	    i = j = i + sublen;
3445	} else
3446	    i++;
3447    }
3448    if (j <= len) {
3449	SPLIT_APPEND(self->str, j, len);
3450    }
3451    return list;
3452
3453 onError:
3454    Py_DECREF(list);
3455    return NULL;
3456}
3457
3458#undef SPLIT_APPEND
3459
3460static
3461PyObject *split(PyUnicodeObject *self,
3462		PyUnicodeObject *substring,
3463		int maxcount)
3464{
3465    PyObject *list;
3466
3467    if (maxcount < 0)
3468        maxcount = INT_MAX;
3469
3470    list = PyList_New(0);
3471    if (!list)
3472        return NULL;
3473
3474    if (substring == NULL)
3475	return split_whitespace(self,list,maxcount);
3476
3477    else if (substring->length == 1)
3478	return split_char(self,list,substring->str[0],maxcount);
3479
3480    else if (substring->length == 0) {
3481	Py_DECREF(list);
3482	PyErr_SetString(PyExc_ValueError, "empty separator");
3483	return NULL;
3484    }
3485    else
3486	return split_substring(self,list,substring,maxcount);
3487}
3488
3489static
3490PyObject *strip(PyUnicodeObject *self,
3491		int left,
3492		int right)
3493{
3494    Py_UNICODE *p = self->str;
3495    int start = 0;
3496    int end = self->length;
3497
3498    if (left)
3499        while (start < end && Py_UNICODE_ISSPACE(p[start]))
3500            start++;
3501
3502    if (right)
3503        while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3504            end--;
3505
3506    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
3507        /* couldn't strip anything off, return original string */
3508        Py_INCREF(self);
3509        return (PyObject*) self;
3510    }
3511
3512    return (PyObject*) PyUnicode_FromUnicode(
3513        self->str + start,
3514        end - start
3515        );
3516}
3517
3518static
3519PyObject *replace(PyUnicodeObject *self,
3520		  PyUnicodeObject *str1,
3521		  PyUnicodeObject *str2,
3522		  int maxcount)
3523{
3524    PyUnicodeObject *u;
3525
3526    if (maxcount < 0)
3527	maxcount = INT_MAX;
3528
3529    if (str1->length == 1 && str2->length == 1) {
3530        int i;
3531
3532        /* replace characters */
3533        if (!findchar(self->str, self->length, str1->str[0]) &&
3534            PyUnicode_CheckExact(self)) {
3535            /* nothing to replace, return original string */
3536            Py_INCREF(self);
3537            u = self;
3538        } else {
3539	    Py_UNICODE u1 = str1->str[0];
3540	    Py_UNICODE u2 = str2->str[0];
3541
3542            u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3543                NULL,
3544                self->length
3545                );
3546            if (u != NULL) {
3547		Py_UNICODE_COPY(u->str, self->str,
3548				self->length);
3549                for (i = 0; i < u->length; i++)
3550                    if (u->str[i] == u1) {
3551                        if (--maxcount < 0)
3552                            break;
3553                        u->str[i] = u2;
3554                    }
3555        }
3556        }
3557
3558    } else {
3559        int n, i;
3560        Py_UNICODE *p;
3561
3562        /* replace strings */
3563        n = count(self, 0, self->length, str1);
3564        if (n > maxcount)
3565            n = maxcount;
3566        if (n == 0 && PyUnicode_CheckExact(self)) {
3567            /* nothing to replace, return original string */
3568            Py_INCREF(self);
3569            u = self;
3570        } else {
3571            u = _PyUnicode_New(
3572                self->length + n * (str2->length - str1->length));
3573            if (u) {
3574                i = 0;
3575                p = u->str;
3576                while (i <= self->length - str1->length)
3577                    if (Py_UNICODE_MATCH(self, i, str1)) {
3578                        /* replace string segment */
3579                        Py_UNICODE_COPY(p, str2->str, str2->length);
3580                        p += str2->length;
3581                        i += str1->length;
3582                        if (--n <= 0) {
3583                            /* copy remaining part */
3584                            Py_UNICODE_COPY(p, self->str+i, self->length-i);
3585                            break;
3586                        }
3587                    } else
3588                        *p++ = self->str[i++];
3589            }
3590        }
3591    }
3592
3593    return (PyObject *) u;
3594}
3595
3596/* --- Unicode Object Methods --------------------------------------------- */
3597
3598static char title__doc__[] =
3599"S.title() -> unicode\n\
3600\n\
3601Return a titlecased version of S, i.e. words start with title case\n\
3602characters, all remaining cased characters have lower case.";
3603
3604static PyObject*
3605unicode_title(PyUnicodeObject *self)
3606{
3607    return fixup(self, fixtitle);
3608}
3609
3610static char capitalize__doc__[] =
3611"S.capitalize() -> unicode\n\
3612\n\
3613Return a capitalized version of S, i.e. make the first character\n\
3614have upper case.";
3615
3616static PyObject*
3617unicode_capitalize(PyUnicodeObject *self)
3618{
3619    return fixup(self, fixcapitalize);
3620}
3621
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled (compiled out by the surrounding #if 0).

   Splits self with split(self, NULL, -1), runs fixup(..., fixcapitalize)
   on every list item, stores the result back into the list and finally
   joins the list with PyUnicode_Join(NULL, list).  Returns NULL if the
   split or any fixup() call fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int n;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (n = 0; n < PyList_GET_SIZE(words); n++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, n),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        /* drop the old item before overwriting its slot */
        Py_DECREF(PyList_GET_ITEM(words, n));
        PyList_SET_ITEM(words, n, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3659
3660static char center__doc__[] =
3661"S.center(width) -> unicode\n\
3662\n\
3663Return S centered in a Unicode string of length width. Padding is done\n\
3664using spaces.";
3665
3666static PyObject *
3667unicode_center(PyUnicodeObject *self, PyObject *args)
3668{
3669    int marg, left;
3670    int width;
3671
3672    if (!PyArg_ParseTuple(args, "i:center", &width))
3673        return NULL;
3674
3675    if (self->length >= width && PyUnicode_CheckExact(self)) {
3676        Py_INCREF(self);
3677        return (PyObject*) self;
3678    }
3679
3680    marg = width - self->length;
3681    left = marg / 2 + (marg & width & 1);
3682
3683    return (PyObject*) pad(self, left, marg - left, ' ');
3684}
3685
3686#if 0
3687
3688/* This code should go into some future Unicode collation support
3689   module. The basic comparison should compare ordinals on a naive
3690   basis (this is what Java does and thus JPython too). */
3691
3692/* speedy UTF-16 code point order comparison */
3693/* gleaned from: */
3694/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3695
3696static short utf16Fixup[32] =
3697{
3698    0, 0, 0, 0, 0, 0, 0, 0,
3699    0, 0, 0, 0, 0, 0, 0, 0,
3700    0, 0, 0, 0, 0, 0, 0, 0,
3701    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3702};
3703
3704static int
3705unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3706{
3707    int len1, len2;
3708
3709    Py_UNICODE *s1 = str1->str;
3710    Py_UNICODE *s2 = str2->str;
3711
3712    len1 = str1->length;
3713    len2 = str2->length;
3714
3715    while (len1 > 0 && len2 > 0) {
3716        Py_UNICODE c1, c2;
3717
3718        c1 = *s1++;
3719        c2 = *s2++;
3720
3721	if (c1 > (1<<11) * 26)
3722	    c1 += utf16Fixup[c1>>11];
3723	if (c2 > (1<<11) * 26)
3724            c2 += utf16Fixup[c2>>11];
3725        /* now c1 and c2 are in UTF-32-compatible order */
3726
3727        if (c1 != c2)
3728            return (c1 < c2) ? -1 : 1;
3729
3730        len1--; len2--;
3731    }
3732
3733    return (len1 < len2) ? -1 : (len1 != len2);
3734}
3735
3736#else
3737
3738static int
3739unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3740{
3741    register int len1, len2;
3742
3743    Py_UNICODE *s1 = str1->str;
3744    Py_UNICODE *s2 = str2->str;
3745
3746    len1 = str1->length;
3747    len2 = str2->length;
3748
3749    while (len1 > 0 && len2 > 0) {
3750        Py_UNICODE c1, c2;
3751
3752        c1 = *s1++;
3753        c2 = *s2++;
3754
3755        if (c1 != c2)
3756            return (c1 < c2) ? -1 : 1;
3757
3758        len1--; len2--;
3759    }
3760
3761    return (len1 < len2) ? -1 : (len1 != len2);
3762}
3763
3764#endif
3765
3766int PyUnicode_Compare(PyObject *left,
3767		      PyObject *right)
3768{
3769    PyUnicodeObject *u = NULL, *v = NULL;
3770    int result;
3771
3772    /* Coerce the two arguments */
3773    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3774    if (u == NULL)
3775	goto onError;
3776    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3777    if (v == NULL)
3778	goto onError;
3779
3780    /* Shortcut for empty or interned objects */
3781    if (v == u) {
3782	Py_DECREF(u);
3783	Py_DECREF(v);
3784	return 0;
3785    }
3786
3787    result = unicode_compare(u, v);
3788
3789    Py_DECREF(u);
3790    Py_DECREF(v);
3791    return result;
3792
3793onError:
3794    Py_XDECREF(u);
3795    Py_XDECREF(v);
3796    return -1;
3797}
3798
3799int PyUnicode_Contains(PyObject *container,
3800		       PyObject *element)
3801{
3802    PyUnicodeObject *u = NULL, *v = NULL;
3803    int result;
3804    register const Py_UNICODE *p, *e;
3805    register Py_UNICODE ch;
3806
3807    /* Coerce the two arguments */
3808    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3809    if (v == NULL) {
3810	PyErr_SetString(PyExc_TypeError,
3811	    "'in <string>' requires character as left operand");
3812	goto onError;
3813    }
3814    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3815    if (u == NULL) {
3816	Py_DECREF(v);
3817	goto onError;
3818    }
3819
3820    /* Check v in u */
3821    if (PyUnicode_GET_SIZE(v) != 1) {
3822	PyErr_SetString(PyExc_TypeError,
3823	    "'in <string>' requires character as left operand");
3824	goto onError;
3825    }
3826    ch = *PyUnicode_AS_UNICODE(v);
3827    p = PyUnicode_AS_UNICODE(u);
3828    e = p + PyUnicode_GET_SIZE(u);
3829    result = 0;
3830    while (p < e) {
3831	if (*p++ == ch) {
3832	    result = 1;
3833	    break;
3834	}
3835    }
3836
3837    Py_DECREF(u);
3838    Py_DECREF(v);
3839    return result;
3840
3841onError:
3842    Py_XDECREF(u);
3843    Py_XDECREF(v);
3844    return -1;
3845}
3846
3847/* Concat to string or Unicode object giving a new Unicode object. */
3848
3849PyObject *PyUnicode_Concat(PyObject *left,
3850			   PyObject *right)
3851{
3852    PyUnicodeObject *u = NULL, *v = NULL, *w;
3853
3854    /* Coerce the two arguments */
3855    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3856    if (u == NULL)
3857	goto onError;
3858    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3859    if (v == NULL)
3860	goto onError;
3861
3862    /* Shortcuts */
3863    if (v == unicode_empty) {
3864	Py_DECREF(v);
3865	return (PyObject *)u;
3866    }
3867    if (u == unicode_empty) {
3868	Py_DECREF(u);
3869	return (PyObject *)v;
3870    }
3871
3872    /* Concat the two Unicode strings */
3873    w = _PyUnicode_New(u->length + v->length);
3874    if (w == NULL)
3875	goto onError;
3876    Py_UNICODE_COPY(w->str, u->str, u->length);
3877    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3878
3879    Py_DECREF(u);
3880    Py_DECREF(v);
3881    return (PyObject *)w;
3882
3883onError:
3884    Py_XDECREF(u);
3885    Py_XDECREF(v);
3886    return NULL;
3887}
3888
3889static char count__doc__[] =
3890"S.count(sub[, start[, end]]) -> int\n\
3891\n\
3892Return the number of occurrences of substring sub in Unicode string\n\
3893S[start:end].  Optional arguments start and end are\n\
3894interpreted as in slice notation.";
3895
3896static PyObject *
3897unicode_count(PyUnicodeObject *self, PyObject *args)
3898{
3899    PyUnicodeObject *substring;
3900    int start = 0;
3901    int end = INT_MAX;
3902    PyObject *result;
3903
3904    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3905		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3906        return NULL;
3907
3908    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3909						(PyObject *)substring);
3910    if (substring == NULL)
3911	return NULL;
3912
3913    if (start < 0)
3914        start += self->length;
3915    if (start < 0)
3916        start = 0;
3917    if (end > self->length)
3918        end = self->length;
3919    if (end < 0)
3920        end += self->length;
3921    if (end < 0)
3922        end = 0;
3923
3924    result = PyInt_FromLong((long) count(self, start, end, substring));
3925
3926    Py_DECREF(substring);
3927    return result;
3928}
3929
3930static char encode__doc__[] =
3931"S.encode([encoding[,errors]]) -> string\n\
3932\n\
3933Return an encoded string version of S. Default encoding is the current\n\
3934default string encoding. errors may be given to set a different error\n\
3935handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3936a ValueError. Other possible values are 'ignore' and 'replace'.";
3937
3938static PyObject *
3939unicode_encode(PyUnicodeObject *self, PyObject *args)
3940{
3941    char *encoding = NULL;
3942    char *errors = NULL;
3943    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3944        return NULL;
3945    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3946}
3947
3948static char expandtabs__doc__[] =
3949"S.expandtabs([tabsize]) -> unicode\n\
3950\n\
3951Return a copy of S where all tab characters are expanded using spaces.\n\
3952If tabsize is not given, a tab size of 8 characters is assumed.";
3953
3954static PyObject*
3955unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3956{
3957    Py_UNICODE *e;
3958    Py_UNICODE *p;
3959    Py_UNICODE *q;
3960    int i, j;
3961    PyUnicodeObject *u;
3962    int tabsize = 8;
3963
3964    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3965	return NULL;
3966
3967    /* First pass: determine size of output string */
3968    i = j = 0;
3969    e = self->str + self->length;
3970    for (p = self->str; p < e; p++)
3971        if (*p == '\t') {
3972	    if (tabsize > 0)
3973		j += tabsize - (j % tabsize);
3974	}
3975        else {
3976            j++;
3977            if (*p == '\n' || *p == '\r') {
3978                i += j;
3979                j = 0;
3980            }
3981        }
3982
3983    /* Second pass: create output string and fill it */
3984    u = _PyUnicode_New(i + j);
3985    if (!u)
3986        return NULL;
3987
3988    j = 0;
3989    q = u->str;
3990
3991    for (p = self->str; p < e; p++)
3992        if (*p == '\t') {
3993	    if (tabsize > 0) {
3994		i = tabsize - (j % tabsize);
3995		j += i;
3996		while (i--)
3997		    *q++ = ' ';
3998	    }
3999	}
4000	else {
4001            j++;
4002	    *q++ = *p;
4003            if (*p == '\n' || *p == '\r')
4004                j = 0;
4005        }
4006
4007    return (PyObject*) u;
4008}
4009
4010static char find__doc__[] =
4011"S.find(sub [,start [,end]]) -> int\n\
4012\n\
4013Return the lowest index in S where substring sub is found,\n\
4014such that sub is contained within s[start,end].  Optional\n\
4015arguments start and end are interpreted as in slice notation.\n\
4016\n\
4017Return -1 on failure.";
4018
4019static PyObject *
4020unicode_find(PyUnicodeObject *self, PyObject *args)
4021{
4022    PyUnicodeObject *substring;
4023    int start = 0;
4024    int end = INT_MAX;
4025    PyObject *result;
4026
4027    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4028		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4029        return NULL;
4030    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4031						(PyObject *)substring);
4032    if (substring == NULL)
4033	return NULL;
4034
4035    result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4036
4037    Py_DECREF(substring);
4038    return result;
4039}
4040
4041static PyObject *
4042unicode_getitem(PyUnicodeObject *self, int index)
4043{
4044    if (index < 0 || index >= self->length) {
4045        PyErr_SetString(PyExc_IndexError, "string index out of range");
4046        return NULL;
4047    }
4048
4049    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4050}
4051
4052static long
4053unicode_hash(PyUnicodeObject *self)
4054{
4055    /* Since Unicode objects compare equal to their ASCII string
4056       counterparts, they should use the individual character values
4057       as basis for their hash value.  This is needed to assure that
4058       strings and Unicode objects behave in the same way as
4059       dictionary keys. */
4060
4061    register int len;
4062    register Py_UNICODE *p;
4063    register long x;
4064
4065    if (self->hash != -1)
4066	return self->hash;
4067    len = PyUnicode_GET_SIZE(self);
4068    p = PyUnicode_AS_UNICODE(self);
4069    x = *p << 7;
4070    while (--len >= 0)
4071	x = (1000003*x) ^ *p++;
4072    x ^= PyUnicode_GET_SIZE(self);
4073    if (x == -1)
4074	x = -2;
4075    self->hash = x;
4076    return x;
4077}
4078
4079static char index__doc__[] =
4080"S.index(sub [,start [,end]]) -> int\n\
4081\n\
4082Like S.find() but raise ValueError when the substring is not found.";
4083
4084static PyObject *
4085unicode_index(PyUnicodeObject *self, PyObject *args)
4086{
4087    int result;
4088    PyUnicodeObject *substring;
4089    int start = 0;
4090    int end = INT_MAX;
4091
4092    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4093		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4094        return NULL;
4095
4096    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4097						(PyObject *)substring);
4098    if (substring == NULL)
4099	return NULL;
4100
4101    result = findstring(self, substring, start, end, 1);
4102
4103    Py_DECREF(substring);
4104    if (result < 0) {
4105        PyErr_SetString(PyExc_ValueError, "substring not found");
4106        return NULL;
4107    }
4108    return PyInt_FromLong(result);
4109}
4110
4111static char islower__doc__[] =
4112"S.islower() -> int\n\
4113\n\
4114Return 1 if  all cased characters in S are lowercase and there is\n\
4115at least one cased character in S, 0 otherwise.";
4116
4117static PyObject*
4118unicode_islower(PyUnicodeObject *self)
4119{
4120    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4121    register const Py_UNICODE *e;
4122    int cased;
4123
4124    /* Shortcut for single character strings */
4125    if (PyUnicode_GET_SIZE(self) == 1)
4126	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4127
4128    /* Special case for empty strings */
4129    if (PyString_GET_SIZE(self) == 0)
4130	return PyInt_FromLong(0);
4131
4132    e = p + PyUnicode_GET_SIZE(self);
4133    cased = 0;
4134    for (; p < e; p++) {
4135	register const Py_UNICODE ch = *p;
4136
4137	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4138	    return PyInt_FromLong(0);
4139	else if (!cased && Py_UNICODE_ISLOWER(ch))
4140	    cased = 1;
4141    }
4142    return PyInt_FromLong(cased);
4143}
4144
4145static char isupper__doc__[] =
4146"S.isupper() -> int\n\
4147\n\
4148Return 1 if  all cased characters in S are uppercase and there is\n\
4149at least one cased character in S, 0 otherwise.";
4150
4151static PyObject*
4152unicode_isupper(PyUnicodeObject *self)
4153{
4154    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4155    register const Py_UNICODE *e;
4156    int cased;
4157
4158    /* Shortcut for single character strings */
4159    if (PyUnicode_GET_SIZE(self) == 1)
4160	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4161
4162    /* Special case for empty strings */
4163    if (PyString_GET_SIZE(self) == 0)
4164	return PyInt_FromLong(0);
4165
4166    e = p + PyUnicode_GET_SIZE(self);
4167    cased = 0;
4168    for (; p < e; p++) {
4169	register const Py_UNICODE ch = *p;
4170
4171	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4172	    return PyInt_FromLong(0);
4173	else if (!cased && Py_UNICODE_ISUPPER(ch))
4174	    cased = 1;
4175    }
4176    return PyInt_FromLong(cased);
4177}
4178
4179static char istitle__doc__[] =
4180"S.istitle() -> int\n\
4181\n\
4182Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4183may only follow uncased characters and lowercase characters only cased\n\
4184ones. Return 0 otherwise.";
4185
4186static PyObject*
4187unicode_istitle(PyUnicodeObject *self)
4188{
4189    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4190    register const Py_UNICODE *e;
4191    int cased, previous_is_cased;
4192
4193    /* Shortcut for single character strings */
4194    if (PyUnicode_GET_SIZE(self) == 1)
4195	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4196			      (Py_UNICODE_ISUPPER(*p) != 0));
4197
4198    /* Special case for empty strings */
4199    if (PyString_GET_SIZE(self) == 0)
4200	return PyInt_FromLong(0);
4201
4202    e = p + PyUnicode_GET_SIZE(self);
4203    cased = 0;
4204    previous_is_cased = 0;
4205    for (; p < e; p++) {
4206	register const Py_UNICODE ch = *p;
4207
4208	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4209	    if (previous_is_cased)
4210		return PyInt_FromLong(0);
4211	    previous_is_cased = 1;
4212	    cased = 1;
4213	}
4214	else if (Py_UNICODE_ISLOWER(ch)) {
4215	    if (!previous_is_cased)
4216		return PyInt_FromLong(0);
4217	    previous_is_cased = 1;
4218	    cased = 1;
4219	}
4220	else
4221	    previous_is_cased = 0;
4222    }
4223    return PyInt_FromLong(cased);
4224}
4225
4226static char isspace__doc__[] =
4227"S.isspace() -> int\n\
4228\n\
4229Return 1 if there are only whitespace characters in S,\n\
42300 otherwise.";
4231
4232static PyObject*
4233unicode_isspace(PyUnicodeObject *self)
4234{
4235    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4236    register const Py_UNICODE *e;
4237
4238    /* Shortcut for single character strings */
4239    if (PyUnicode_GET_SIZE(self) == 1 &&
4240	Py_UNICODE_ISSPACE(*p))
4241	return PyInt_FromLong(1);
4242
4243    /* Special case for empty strings */
4244    if (PyString_GET_SIZE(self) == 0)
4245	return PyInt_FromLong(0);
4246
4247    e = p + PyUnicode_GET_SIZE(self);
4248    for (; p < e; p++) {
4249	if (!Py_UNICODE_ISSPACE(*p))
4250	    return PyInt_FromLong(0);
4251    }
4252    return PyInt_FromLong(1);
4253}
4254
4255static char isalpha__doc__[] =
4256"S.isalpha() -> int\n\
4257\n\
4258Return 1 if  all characters in S are alphabetic\n\
4259and there is at least one character in S, 0 otherwise.";
4260
4261static PyObject*
4262unicode_isalpha(PyUnicodeObject *self)
4263{
4264    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4265    register const Py_UNICODE *e;
4266
4267    /* Shortcut for single character strings */
4268    if (PyUnicode_GET_SIZE(self) == 1 &&
4269	Py_UNICODE_ISALPHA(*p))
4270	return PyInt_FromLong(1);
4271
4272    /* Special case for empty strings */
4273    if (PyString_GET_SIZE(self) == 0)
4274	return PyInt_FromLong(0);
4275
4276    e = p + PyUnicode_GET_SIZE(self);
4277    for (; p < e; p++) {
4278	if (!Py_UNICODE_ISALPHA(*p))
4279	    return PyInt_FromLong(0);
4280    }
4281    return PyInt_FromLong(1);
4282}
4283
4284static char isalnum__doc__[] =
4285"S.isalnum() -> int\n\
4286\n\
4287Return 1 if  all characters in S are alphanumeric\n\
4288and there is at least one character in S, 0 otherwise.";
4289
4290static PyObject*
4291unicode_isalnum(PyUnicodeObject *self)
4292{
4293    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4294    register const Py_UNICODE *e;
4295
4296    /* Shortcut for single character strings */
4297    if (PyUnicode_GET_SIZE(self) == 1 &&
4298	Py_UNICODE_ISALNUM(*p))
4299	return PyInt_FromLong(1);
4300
4301    /* Special case for empty strings */
4302    if (PyString_GET_SIZE(self) == 0)
4303	return PyInt_FromLong(0);
4304
4305    e = p + PyUnicode_GET_SIZE(self);
4306    for (; p < e; p++) {
4307	if (!Py_UNICODE_ISALNUM(*p))
4308	    return PyInt_FromLong(0);
4309    }
4310    return PyInt_FromLong(1);
4311}
4312
4313static char isdecimal__doc__[] =
4314"S.isdecimal() -> int\n\
4315\n\
4316Return 1 if there are only decimal characters in S,\n\
43170 otherwise.";
4318
4319static PyObject*
4320unicode_isdecimal(PyUnicodeObject *self)
4321{
4322    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4323    register const Py_UNICODE *e;
4324
4325    /* Shortcut for single character strings */
4326    if (PyUnicode_GET_SIZE(self) == 1 &&
4327	Py_UNICODE_ISDECIMAL(*p))
4328	return PyInt_FromLong(1);
4329
4330    /* Special case for empty strings */
4331    if (PyString_GET_SIZE(self) == 0)
4332	return PyInt_FromLong(0);
4333
4334    e = p + PyUnicode_GET_SIZE(self);
4335    for (; p < e; p++) {
4336	if (!Py_UNICODE_ISDECIMAL(*p))
4337	    return PyInt_FromLong(0);
4338    }
4339    return PyInt_FromLong(1);
4340}
4341
4342static char isdigit__doc__[] =
4343"S.isdigit() -> int\n\
4344\n\
4345Return 1 if there are only digit characters in S,\n\
43460 otherwise.";
4347
4348static PyObject*
4349unicode_isdigit(PyUnicodeObject *self)
4350{
4351    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4352    register const Py_UNICODE *e;
4353
4354    /* Shortcut for single character strings */
4355    if (PyUnicode_GET_SIZE(self) == 1 &&
4356	Py_UNICODE_ISDIGIT(*p))
4357	return PyInt_FromLong(1);
4358
4359    /* Special case for empty strings */
4360    if (PyString_GET_SIZE(self) == 0)
4361	return PyInt_FromLong(0);
4362
4363    e = p + PyUnicode_GET_SIZE(self);
4364    for (; p < e; p++) {
4365	if (!Py_UNICODE_ISDIGIT(*p))
4366	    return PyInt_FromLong(0);
4367    }
4368    return PyInt_FromLong(1);
4369}
4370
4371static char isnumeric__doc__[] =
4372"S.isnumeric() -> int\n\
4373\n\
4374Return 1 if there are only numeric characters in S,\n\
43750 otherwise.";
4376
4377static PyObject*
4378unicode_isnumeric(PyUnicodeObject *self)
4379{
4380    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4381    register const Py_UNICODE *e;
4382
4383    /* Shortcut for single character strings */
4384    if (PyUnicode_GET_SIZE(self) == 1 &&
4385	Py_UNICODE_ISNUMERIC(*p))
4386	return PyInt_FromLong(1);
4387
4388    /* Special case for empty strings */
4389    if (PyString_GET_SIZE(self) == 0)
4390	return PyInt_FromLong(0);
4391
4392    e = p + PyUnicode_GET_SIZE(self);
4393    for (; p < e; p++) {
4394	if (!Py_UNICODE_ISNUMERIC(*p))
4395	    return PyInt_FromLong(0);
4396    }
4397    return PyInt_FromLong(1);
4398}
4399
4400static char join__doc__[] =
4401"S.join(sequence) -> unicode\n\
4402\n\
4403Return a string which is the concatenation of the strings in the\n\
4404sequence.  The separator between elements is S.";
4405
4406static PyObject*
4407unicode_join(PyObject *self, PyObject *data)
4408{
4409    return PyUnicode_Join(self, data);
4410}
4411
4412static int
4413unicode_length(PyUnicodeObject *self)
4414{
4415    return self->length;
4416}
4417
4418static char ljust__doc__[] =
4419"S.ljust(width) -> unicode\n\
4420\n\
4421Return S left justified in a Unicode string of length width. Padding is\n\
4422done using spaces.";
4423
4424static PyObject *
4425unicode_ljust(PyUnicodeObject *self, PyObject *args)
4426{
4427    int width;
4428    if (!PyArg_ParseTuple(args, "i:ljust", &width))
4429        return NULL;
4430
4431    if (self->length >= width && PyUnicode_CheckExact(self)) {
4432        Py_INCREF(self);
4433        return (PyObject*) self;
4434    }
4435
4436    return (PyObject*) pad(self, 0, width - self->length, ' ');
4437}
4438
4439static char lower__doc__[] =
4440"S.lower() -> unicode\n\
4441\n\
4442Return a copy of the string S converted to lowercase.";
4443
4444static PyObject*
4445unicode_lower(PyUnicodeObject *self)
4446{
4447    return fixup(self, fixlower);
4448}
4449
4450static char lstrip__doc__[] =
4451"S.lstrip() -> unicode\n\
4452\n\
4453Return a copy of the string S with leading whitespace removed.";
4454
4455static PyObject *
4456unicode_lstrip(PyUnicodeObject *self)
4457{
4458    return strip(self, 1, 0);
4459}
4460
4461static PyObject*
4462unicode_repeat(PyUnicodeObject *str, int len)
4463{
4464    PyUnicodeObject *u;
4465    Py_UNICODE *p;
4466    int nchars;
4467    size_t nbytes;
4468
4469    if (len < 0)
4470        len = 0;
4471
4472    if (len == 1 && PyUnicode_CheckExact(str)) {
4473        /* no repeat, return original string */
4474        Py_INCREF(str);
4475        return (PyObject*) str;
4476    }
4477
4478    /* ensure # of chars needed doesn't overflow int and # of bytes
4479     * needed doesn't overflow size_t
4480     */
4481    nchars = len * str->length;
4482    if (len && nchars / len != str->length) {
4483        PyErr_SetString(PyExc_OverflowError,
4484                        "repeated string is too long");
4485        return NULL;
4486    }
4487    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4488    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4489        PyErr_SetString(PyExc_OverflowError,
4490                        "repeated string is too long");
4491        return NULL;
4492    }
4493    u = _PyUnicode_New(nchars);
4494    if (!u)
4495        return NULL;
4496
4497    p = u->str;
4498
4499    while (len-- > 0) {
4500        Py_UNICODE_COPY(p, str->str, str->length);
4501        p += str->length;
4502    }
4503
4504    return (PyObject*) u;
4505}
4506
4507PyObject *PyUnicode_Replace(PyObject *obj,
4508			    PyObject *subobj,
4509			    PyObject *replobj,
4510			    int maxcount)
4511{
4512    PyObject *self;
4513    PyObject *str1;
4514    PyObject *str2;
4515    PyObject *result;
4516
4517    self = PyUnicode_FromObject(obj);
4518    if (self == NULL)
4519	return NULL;
4520    str1 = PyUnicode_FromObject(subobj);
4521    if (str1 == NULL) {
4522	Py_DECREF(self);
4523	return NULL;
4524    }
4525    str2 = PyUnicode_FromObject(replobj);
4526    if (str2 == NULL) {
4527	Py_DECREF(self);
4528	Py_DECREF(str1);
4529	return NULL;
4530    }
4531    result = replace((PyUnicodeObject *)self,
4532		     (PyUnicodeObject *)str1,
4533		     (PyUnicodeObject *)str2,
4534		     maxcount);
4535    Py_DECREF(self);
4536    Py_DECREF(str1);
4537    Py_DECREF(str2);
4538    return result;
4539}
4540
4541static char replace__doc__[] =
4542"S.replace (old, new[, maxsplit]) -> unicode\n\
4543\n\
4544Return a copy of S with all occurrences of substring\n\
4545old replaced by new.  If the optional argument maxsplit is\n\
4546given, only the first maxsplit occurrences are replaced.";
4547
4548static PyObject*
4549unicode_replace(PyUnicodeObject *self, PyObject *args)
4550{
4551    PyUnicodeObject *str1;
4552    PyUnicodeObject *str2;
4553    int maxcount = -1;
4554    PyObject *result;
4555
4556    if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4557        return NULL;
4558    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4559    if (str1 == NULL)
4560	return NULL;
4561    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4562    if (str2 == NULL)
4563	return NULL;
4564
4565    result = replace(self, str1, str2, maxcount);
4566
4567    Py_DECREF(str1);
4568    Py_DECREF(str2);
4569    return result;
4570}
4571
4572static
4573PyObject *unicode_repr(PyObject *unicode)
4574{
4575    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4576				PyUnicode_GET_SIZE(unicode),
4577				1);
4578}
4579
4580static char rfind__doc__[] =
4581"S.rfind(sub [,start [,end]]) -> int\n\
4582\n\
4583Return the highest index in S where substring sub is found,\n\
4584such that sub is contained within s[start,end].  Optional\n\
4585arguments start and end are interpreted as in slice notation.\n\
4586\n\
4587Return -1 on failure.";
4588
4589static PyObject *
4590unicode_rfind(PyUnicodeObject *self, PyObject *args)
4591{
4592    PyUnicodeObject *substring;
4593    int start = 0;
4594    int end = INT_MAX;
4595    PyObject *result;
4596
4597    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4598		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4599        return NULL;
4600    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4601						(PyObject *)substring);
4602    if (substring == NULL)
4603	return NULL;
4604
4605    result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4606
4607    Py_DECREF(substring);
4608    return result;
4609}
4610
4611static char rindex__doc__[] =
4612"S.rindex(sub [,start [,end]]) -> int\n\
4613\n\
4614Like S.rfind() but raise ValueError when the substring is not found.";
4615
4616static PyObject *
4617unicode_rindex(PyUnicodeObject *self, PyObject *args)
4618{
4619    int result;
4620    PyUnicodeObject *substring;
4621    int start = 0;
4622    int end = INT_MAX;
4623
4624    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4625		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4626        return NULL;
4627    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4628						(PyObject *)substring);
4629    if (substring == NULL)
4630	return NULL;
4631
4632    result = findstring(self, substring, start, end, -1);
4633
4634    Py_DECREF(substring);
4635    if (result < 0) {
4636        PyErr_SetString(PyExc_ValueError, "substring not found");
4637        return NULL;
4638    }
4639    return PyInt_FromLong(result);
4640}
4641
4642static char rjust__doc__[] =
4643"S.rjust(width) -> unicode\n\
4644\n\
4645Return S right justified in a Unicode string of length width. Padding is\n\
4646done using spaces.";
4647
4648static PyObject *
4649unicode_rjust(PyUnicodeObject *self, PyObject *args)
4650{
4651    int width;
4652    if (!PyArg_ParseTuple(args, "i:rjust", &width))
4653        return NULL;
4654
4655    if (self->length >= width && PyUnicode_CheckExact(self)) {
4656        Py_INCREF(self);
4657        return (PyObject*) self;
4658    }
4659
4660    return (PyObject*) pad(self, width - self->length, 0, ' ');
4661}
4662
4663static char rstrip__doc__[] =
4664"S.rstrip() -> unicode\n\
4665\n\
4666Return a copy of the string S with trailing whitespace removed.";
4667
4668static PyObject *
4669unicode_rstrip(PyUnicodeObject *self)
4670{
4671    return strip(self, 0, 1);
4672}
4673
4674static PyObject*
4675unicode_slice(PyUnicodeObject *self, int start, int end)
4676{
4677    /* standard clamping */
4678    if (start < 0)
4679        start = 0;
4680    if (end < 0)
4681        end = 0;
4682    if (end > self->length)
4683        end = self->length;
4684    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
4685        /* full slice, return original string */
4686        Py_INCREF(self);
4687        return (PyObject*) self;
4688    }
4689    if (start > end)
4690        start = end;
4691    /* copy slice */
4692    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4693					     end - start);
4694}
4695
4696PyObject *PyUnicode_Split(PyObject *s,
4697			  PyObject *sep,
4698			  int maxsplit)
4699{
4700    PyObject *result;
4701
4702    s = PyUnicode_FromObject(s);
4703    if (s == NULL)
4704	return NULL;
4705    if (sep != NULL) {
4706	sep = PyUnicode_FromObject(sep);
4707	if (sep == NULL) {
4708	    Py_DECREF(s);
4709	    return NULL;
4710	}
4711    }
4712
4713    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4714
4715    Py_DECREF(s);
4716    Py_XDECREF(sep);
4717    return result;
4718}
4719
4720static char split__doc__[] =
4721"S.split([sep [,maxsplit]]) -> list of strings\n\
4722\n\
4723Return a list of the words in S, using sep as the\n\
4724delimiter string.  If maxsplit is given, at most maxsplit\n\
4725splits are done. If sep is not specified, any whitespace string\n\
4726is a separator.";
4727
4728static PyObject*
4729unicode_split(PyUnicodeObject *self, PyObject *args)
4730{
4731    PyObject *substring = Py_None;
4732    int maxcount = -1;
4733
4734    if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4735        return NULL;
4736
4737    if (substring == Py_None)
4738	return split(self, NULL, maxcount);
4739    else if (PyUnicode_Check(substring))
4740	return split(self, (PyUnicodeObject *)substring, maxcount);
4741    else
4742	return PyUnicode_Split((PyObject *)self, substring, maxcount);
4743}
4744
4745static char splitlines__doc__[] =
4746"S.splitlines([keepends]]) -> list of strings\n\
4747\n\
4748Return a list of the lines in S, breaking at line boundaries.\n\
4749Line breaks are not included in the resulting list unless keepends\n\
4750is given and true.";
4751
4752static PyObject*
4753unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4754{
4755    int keepends = 0;
4756
4757    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4758        return NULL;
4759
4760    return PyUnicode_Splitlines((PyObject *)self, keepends);
4761}
4762
4763static
4764PyObject *unicode_str(PyUnicodeObject *self)
4765{
4766    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4767}
4768
4769static char strip__doc__[] =
4770"S.strip() -> unicode\n\
4771\n\
4772Return a copy of S with leading and trailing whitespace removed.";
4773
4774static PyObject *
4775unicode_strip(PyUnicodeObject *self)
4776{
4777    return strip(self, 1, 1);
4778}
4779
4780static char swapcase__doc__[] =
4781"S.swapcase() -> unicode\n\
4782\n\
4783Return a copy of S with uppercase characters converted to lowercase\n\
4784and vice versa.";
4785
4786static PyObject*
4787unicode_swapcase(PyUnicodeObject *self)
4788{
4789    return fixup(self, fixswapcase);
4790}
4791
4792static char translate__doc__[] =
4793"S.translate(table) -> unicode\n\
4794\n\
4795Return a copy of the string S, where all characters have been mapped\n\
4796through the given translation table, which must be a mapping of\n\
4797Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4798are left untouched. Characters mapped to None are deleted.";
4799
4800static PyObject*
4801unicode_translate(PyUnicodeObject *self, PyObject *table)
4802{
4803    return PyUnicode_TranslateCharmap(self->str,
4804				      self->length,
4805				      table,
4806				      "ignore");
4807}
4808
4809static char upper__doc__[] =
4810"S.upper() -> unicode\n\
4811\n\
4812Return a copy of S converted to uppercase.";
4813
4814static PyObject*
4815unicode_upper(PyUnicodeObject *self)
4816{
4817    return fixup(self, fixupper);
4818}
4819
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method S.zfill(width) -- currently compiled out.

   Returns self (new reference) when it is already at least width
   characters long.  Otherwise the string is left-padded with '0' via
   pad(), and a leading '+' or '-' of the original text is moved in
   front of the padding.  Returns NULL on argument or padding failure. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() presumably reports failure by returning NULL (its result is
       returned unchecked as a method result elsewhere in this file);
       do not dereference it in that case. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original text. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4855
#if 0
/* Debugging aid (compiled out): reports the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = unicode_freelist_size;

    return PyInt_FromLong(count);
}
#endif
4863
4864static char startswith__doc__[] =
4865"S.startswith(prefix[, start[, end]]) -> int\n\
4866\n\
4867Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4868optional start, test S beginning at that position.  With optional end, stop\n\
4869comparing S at that position.";
4870
4871static PyObject *
4872unicode_startswith(PyUnicodeObject *self,
4873		   PyObject *args)
4874{
4875    PyUnicodeObject *substring;
4876    int start = 0;
4877    int end = INT_MAX;
4878    PyObject *result;
4879
4880    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4881		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4882	return NULL;
4883    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4884						(PyObject *)substring);
4885    if (substring == NULL)
4886	return NULL;
4887
4888    result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4889
4890    Py_DECREF(substring);
4891    return result;
4892}
4893
4894
4895static char endswith__doc__[] =
4896"S.endswith(suffix[, start[, end]]) -> int\n\
4897\n\
4898Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4899optional start, test S beginning at that position.  With optional end, stop\n\
4900comparing S at that position.";
4901
4902static PyObject *
4903unicode_endswith(PyUnicodeObject *self,
4904		 PyObject *args)
4905{
4906    PyUnicodeObject *substring;
4907    int start = 0;
4908    int end = INT_MAX;
4909    PyObject *result;
4910
4911    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4912		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4913	return NULL;
4914    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4915						(PyObject *)substring);
4916    if (substring == NULL)
4917	return NULL;
4918
4919    result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4920
4921    Py_DECREF(substring);
4922    return result;
4923}
4924
4925
4926static PyMethodDef unicode_methods[] = {
4927
4928    /* Order is according to common usage: often used methods should
4929       appear first, since lookup is done sequentially. */
4930
4931    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4932    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4933    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4934    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4935    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4936    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4937    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4938    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4939    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4940    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4941    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4942    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4943    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4944    {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4945/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4946    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4947    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4948    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4949    {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4950    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4951    {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4952    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4953    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4954    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4955    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4956    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4957    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4958    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4959    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4960    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4961    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4962    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4963    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4964    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4965    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
4966#if 0
4967    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4968    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
4969#endif
4970
4971#if 0
4972    /* This one is just used for debugging the implementation. */
4973    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
4974#endif
4975
4976    {NULL, NULL}
4977};
4978
4979static PySequenceMethods unicode_as_sequence = {
4980    (inquiry) unicode_length, 		/* sq_length */
4981    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
4982    (intargfunc) unicode_repeat, 	/* sq_repeat */
4983    (intargfunc) unicode_getitem, 	/* sq_item */
4984    (intintargfunc) unicode_slice, 	/* sq_slice */
4985    0, 					/* sq_ass_item */
4986    0, 					/* sq_ass_slice */
4987    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
4988};
4989
4990static int
4991unicode_buffer_getreadbuf(PyUnicodeObject *self,
4992			  int index,
4993			  const void **ptr)
4994{
4995    if (index != 0) {
4996        PyErr_SetString(PyExc_SystemError,
4997			"accessing non-existent unicode segment");
4998        return -1;
4999    }
5000    *ptr = (void *) self->str;
5001    return PyUnicode_GET_DATA_SIZE(self);
5002}
5003
5004static int
5005unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5006			   const void **ptr)
5007{
5008    PyErr_SetString(PyExc_TypeError,
5009		    "cannot use unicode as modifyable buffer");
5010    return -1;
5011}
5012
5013static int
5014unicode_buffer_getsegcount(PyUnicodeObject *self,
5015			   int *lenp)
5016{
5017    if (lenp)
5018        *lenp = PyUnicode_GET_DATA_SIZE(self);
5019    return 1;
5020}
5021
5022static int
5023unicode_buffer_getcharbuf(PyUnicodeObject *self,
5024			  int index,
5025			  const void **ptr)
5026{
5027    PyObject *str;
5028
5029    if (index != 0) {
5030        PyErr_SetString(PyExc_SystemError,
5031			"accessing non-existent unicode segment");
5032        return -1;
5033    }
5034    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5035    if (str == NULL)
5036	return -1;
5037    *ptr = (void *) PyString_AS_STRING(str);
5038    return PyString_GET_SIZE(str);
5039}
5040
5041/* Helpers for PyUnicode_Format() */
5042
5043static PyObject *
5044getnextarg(PyObject *args, int arglen, int *p_argidx)
5045{
5046    int argidx = *p_argidx;
5047    if (argidx < arglen) {
5048	(*p_argidx)++;
5049	if (arglen < 0)
5050	    return args;
5051	else
5052	    return PyTuple_GetItem(args, argidx);
5053    }
5054    PyErr_SetString(PyExc_TypeError,
5055		    "not enough arguments for format string");
5056    return NULL;
5057}
5058
/* Bit flags collected from the flag characters of a format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
5064
5065static
5066int usprintf(register Py_UNICODE *buffer, char *format, ...)
5067{
5068    register int i;
5069    int len;
5070    va_list va;
5071    char *charbuffer;
5072    va_start(va, format);
5073
5074    /* First, format the string as char array, then expand to Py_UNICODE
5075       array. */
5076    charbuffer = (char *)buffer;
5077    len = vsprintf(charbuffer, format, va);
5078    for (i = len - 1; i >= 0; i--)
5079	buffer[i] = (Py_UNICODE) charbuffer[i];
5080
5081    va_end(va);
5082    return len;
5083}
5084
5085static int
5086formatfloat(Py_UNICODE *buf,
5087	    size_t buflen,
5088	    int flags,
5089	    int prec,
5090	    int type,
5091	    PyObject *v)
5092{
5093    /* fmt = '%#.' + `prec` + `type`
5094       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5095    char fmt[20];
5096    double x;
5097
5098    x = PyFloat_AsDouble(v);
5099    if (x == -1.0 && PyErr_Occurred())
5100	return -1;
5101    if (prec < 0)
5102	prec = 6;
5103    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5104	type = 'g';
5105    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5106		  (flags & F_ALT) ? "#" : "", prec, type);
5107    /* worst case length calc to ensure no buffer overrun:
5108         fmt = %#.<prec>g
5109         buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5110            for any double rep.)
5111         len = 1 + prec + 1 + 2 + 5 = 9 + prec
5112       If prec=0 the effective precision is 1 (the leading digit is
5113       always given), therefore increase by one to 10+prec. */
5114    if (buflen <= (size_t)10 + (size_t)prec) {
5115	PyErr_SetString(PyExc_OverflowError,
5116	    "formatted float is too long (precision too long?)");
5117	return -1;
5118    }
5119    return usprintf(buf, fmt, x);
5120}
5121
5122static PyObject*
5123formatlong(PyObject *val, int flags, int prec, int type)
5124{
5125	char *buf;
5126	int i, len;
5127	PyObject *str; /* temporary string object. */
5128	PyUnicodeObject *result;
5129
5130	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5131	if (!str)
5132		return NULL;
5133	result = _PyUnicode_New(len);
5134	for (i = 0; i < len; i++)
5135		result->str[i] = buf[i];
5136	result->str[len] = 0;
5137	Py_DECREF(str);
5138	return (PyObject*)result;
5139}
5140
5141static int
5142formatint(Py_UNICODE *buf,
5143	  size_t buflen,
5144	  int flags,
5145	  int prec,
5146	  int type,
5147	  PyObject *v)
5148{
5149    /* fmt = '%#.' + `prec` + 'l' + `type`
5150     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5151     *                     + 1 + 1
5152     *                   = 24
5153     */
5154    char fmt[64]; /* plenty big enough! */
5155    long x;
5156
5157    x = PyInt_AsLong(v);
5158    if (x == -1 && PyErr_Occurred())
5159        return -1;
5160    if (prec < 0)
5161        prec = 1;
5162
5163    /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5164     * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5165     */
5166    if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
5167        PyErr_SetString(PyExc_OverflowError,
5168    	        "formatted integer is too long (precision too large?)");
5169        return -1;
5170    }
5171
5172    if ((flags & F_ALT) &&
5173        (type == 'x' || type == 'X')) {
5174        /* When converting under %#x or %#X, there are a number
5175         * of issues that cause pain:
5176         * - when 0 is being converted, the C standard leaves off
5177         *   the '0x' or '0X', which is inconsistent with other
5178         *   %#x/%#X conversions and inconsistent with Python's
5179         *   hex() function
5180         * - there are platforms that violate the standard and
5181         *   convert 0 with the '0x' or '0X'
5182         *   (Metrowerks, Compaq Tru64)
5183         * - there are platforms that give '0x' when converting
5184         *   under %#X, but convert 0 in accordance with the
5185         *   standard (OS/2 EMX)
5186         *
5187         * We can achieve the desired consistency by inserting our
5188         * own '0x' or '0X' prefix, and substituting %x/%X in place
5189         * of %#x/%#X.
5190         *
5191         * Note that this is the same approach as used in
5192         * formatint() in stringobject.c
5193         */
5194        PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5195                      type, prec, type);
5196    }
5197    else {
5198        PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5199                      (flags&F_ALT) ? "#" : "",
5200                      prec, type);
5201    }
5202    return usprintf(buf, fmt, x);
5203}
5204
5205static int
5206formatchar(Py_UNICODE *buf,
5207           size_t buflen,
5208           PyObject *v)
5209{
5210    /* presume that the buffer is at least 2 characters long */
5211    if (PyUnicode_Check(v)) {
5212	if (PyUnicode_GET_SIZE(v) != 1)
5213	    goto onError;
5214	buf[0] = PyUnicode_AS_UNICODE(v)[0];
5215    }
5216
5217    else if (PyString_Check(v)) {
5218	if (PyString_GET_SIZE(v) != 1)
5219	    goto onError;
5220	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5221    }
5222
5223    else {
5224	/* Integer input truncated to a character */
5225        long x;
5226	x = PyInt_AsLong(v);
5227	if (x == -1 && PyErr_Occurred())
5228	    goto onError;
5229	buf[0] = (char) x;
5230    }
5231    buf[1] = '\0';
5232    return 1;
5233
5234 onError:
5235    PyErr_SetString(PyExc_TypeError,
5236		    "%c requires int or char");
5237    return -1;
5238}
5239
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in characters, of the scratch buffer in
   which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
5249
5250PyObject *PyUnicode_Format(PyObject *format,
5251			   PyObject *args)
5252{
5253    Py_UNICODE *fmt, *res;
5254    int fmtcnt, rescnt, reslen, arglen, argidx;
5255    int args_owned = 0;
5256    PyUnicodeObject *result = NULL;
5257    PyObject *dict = NULL;
5258    PyObject *uformat;
5259
5260    if (format == NULL || args == NULL) {
5261	PyErr_BadInternalCall();
5262	return NULL;
5263    }
5264    uformat = PyUnicode_FromObject(format);
5265    if (uformat == NULL)
5266	return NULL;
5267    fmt = PyUnicode_AS_UNICODE(uformat);
5268    fmtcnt = PyUnicode_GET_SIZE(uformat);
5269
5270    reslen = rescnt = fmtcnt + 100;
5271    result = _PyUnicode_New(reslen);
5272    if (result == NULL)
5273	goto onError;
5274    res = PyUnicode_AS_UNICODE(result);
5275
5276    if (PyTuple_Check(args)) {
5277	arglen = PyTuple_Size(args);
5278	argidx = 0;
5279    }
5280    else {
5281	arglen = -1;
5282	argidx = -2;
5283    }
5284    if (args->ob_type->tp_as_mapping)
5285	dict = args;
5286
5287    while (--fmtcnt >= 0) {
5288	if (*fmt != '%') {
5289	    if (--rescnt < 0) {
5290		rescnt = fmtcnt + 100;
5291		reslen += rescnt;
5292		if (_PyUnicode_Resize(&result, reslen) < 0)
5293		    return NULL;
5294		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5295		--rescnt;
5296	    }
5297	    *res++ = *fmt++;
5298	}
5299	else {
5300	    /* Got a format specifier */
5301	    int flags = 0;
5302	    int width = -1;
5303	    int prec = -1;
5304	    Py_UNICODE c = '\0';
5305	    Py_UNICODE fill;
5306	    PyObject *v = NULL;
5307	    PyObject *temp = NULL;
5308	    Py_UNICODE *pbuf;
5309	    Py_UNICODE sign;
5310	    int len;
5311	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
5312
5313	    fmt++;
5314	    if (*fmt == '(') {
5315		Py_UNICODE *keystart;
5316		int keylen;
5317		PyObject *key;
5318		int pcount = 1;
5319
5320		if (dict == NULL) {
5321		    PyErr_SetString(PyExc_TypeError,
5322				    "format requires a mapping");
5323		    goto onError;
5324		}
5325		++fmt;
5326		--fmtcnt;
5327		keystart = fmt;
5328		/* Skip over balanced parentheses */
5329		while (pcount > 0 && --fmtcnt >= 0) {
5330		    if (*fmt == ')')
5331			--pcount;
5332		    else if (*fmt == '(')
5333			++pcount;
5334		    fmt++;
5335		}
5336		keylen = fmt - keystart - 1;
5337		if (fmtcnt < 0 || pcount > 0) {
5338		    PyErr_SetString(PyExc_ValueError,
5339				    "incomplete format key");
5340		    goto onError;
5341		}
5342#if 0
5343		/* keys are converted to strings using UTF-8 and
5344		   then looked up since Python uses strings to hold
5345		   variables names etc. in its namespaces and we
5346		   wouldn't want to break common idioms. */
5347		key = PyUnicode_EncodeUTF8(keystart,
5348					   keylen,
5349					   NULL);
5350#else
5351		key = PyUnicode_FromUnicode(keystart, keylen);
5352#endif
5353		if (key == NULL)
5354		    goto onError;
5355		if (args_owned) {
5356		    Py_DECREF(args);
5357		    args_owned = 0;
5358		}
5359		args = PyObject_GetItem(dict, key);
5360		Py_DECREF(key);
5361		if (args == NULL) {
5362		    goto onError;
5363		}
5364		args_owned = 1;
5365		arglen = -1;
5366		argidx = -2;
5367	    }
5368	    while (--fmtcnt >= 0) {
5369		switch (c = *fmt++) {
5370		case '-': flags |= F_LJUST; continue;
5371		case '+': flags |= F_SIGN; continue;
5372		case ' ': flags |= F_BLANK; continue;
5373		case '#': flags |= F_ALT; continue;
5374		case '0': flags |= F_ZERO; continue;
5375		}
5376		break;
5377	    }
5378	    if (c == '*') {
5379		v = getnextarg(args, arglen, &argidx);
5380		if (v == NULL)
5381		    goto onError;
5382		if (!PyInt_Check(v)) {
5383		    PyErr_SetString(PyExc_TypeError,
5384				    "* wants int");
5385		    goto onError;
5386		}
5387		width = PyInt_AsLong(v);
5388		if (width < 0) {
5389		    flags |= F_LJUST;
5390		    width = -width;
5391		}
5392		if (--fmtcnt >= 0)
5393		    c = *fmt++;
5394	    }
5395	    else if (c >= '0' && c <= '9') {
5396		width = c - '0';
5397		while (--fmtcnt >= 0) {
5398		    c = *fmt++;
5399		    if (c < '0' || c > '9')
5400			break;
5401		    if ((width*10) / 10 != width) {
5402			PyErr_SetString(PyExc_ValueError,
5403					"width too big");
5404			goto onError;
5405		    }
5406		    width = width*10 + (c - '0');
5407		}
5408	    }
5409	    if (c == '.') {
5410		prec = 0;
5411		if (--fmtcnt >= 0)
5412		    c = *fmt++;
5413		if (c == '*') {
5414		    v = getnextarg(args, arglen, &argidx);
5415		    if (v == NULL)
5416			goto onError;
5417		    if (!PyInt_Check(v)) {
5418			PyErr_SetString(PyExc_TypeError,
5419					"* wants int");
5420			goto onError;
5421		    }
5422		    prec = PyInt_AsLong(v);
5423		    if (prec < 0)
5424			prec = 0;
5425		    if (--fmtcnt >= 0)
5426			c = *fmt++;
5427		}
5428		else if (c >= '0' && c <= '9') {
5429		    prec = c - '0';
5430		    while (--fmtcnt >= 0) {
5431			c = Py_CHARMASK(*fmt++);
5432			if (c < '0' || c > '9')
5433			    break;
5434			if ((prec*10) / 10 != prec) {
5435			    PyErr_SetString(PyExc_ValueError,
5436					    "prec too big");
5437			    goto onError;
5438			}
5439			prec = prec*10 + (c - '0');
5440		    }
5441		}
5442	    } /* prec */
5443	    if (fmtcnt >= 0) {
5444		if (c == 'h' || c == 'l' || c == 'L') {
5445		    if (--fmtcnt >= 0)
5446			c = *fmt++;
5447		}
5448	    }
5449	    if (fmtcnt < 0) {
5450		PyErr_SetString(PyExc_ValueError,
5451				"incomplete format");
5452		goto onError;
5453	    }
5454	    if (c != '%') {
5455		v = getnextarg(args, arglen, &argidx);
5456		if (v == NULL)
5457		    goto onError;
5458	    }
5459	    sign = 0;
5460	    fill = ' ';
5461	    switch (c) {
5462
5463	    case '%':
5464		pbuf = formatbuf;
5465		/* presume that buffer length is at least 1 */
5466		pbuf[0] = '%';
5467		len = 1;
5468		break;
5469
5470	    case 's':
5471	    case 'r':
5472		if (PyUnicode_Check(v) && c == 's') {
5473		    temp = v;
5474		    Py_INCREF(temp);
5475		}
5476		else {
5477		    PyObject *unicode;
5478		    if (c == 's')
5479			temp = PyObject_Str(v);
5480		    else
5481			temp = PyObject_Repr(v);
5482		    if (temp == NULL)
5483			goto onError;
5484		    if (!PyString_Check(temp)) {
5485			/* XXX Note: this should never happen, since
5486   			       PyObject_Repr() and PyObject_Str() assure
5487			       this */
5488			Py_DECREF(temp);
5489			PyErr_SetString(PyExc_TypeError,
5490					"%s argument has non-string str()");
5491			goto onError;
5492		    }
5493		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5494						   PyString_GET_SIZE(temp),
5495					       NULL,
5496						   "strict");
5497		    Py_DECREF(temp);
5498		    temp = unicode;
5499		    if (temp == NULL)
5500			goto onError;
5501		}
5502		pbuf = PyUnicode_AS_UNICODE(temp);
5503		len = PyUnicode_GET_SIZE(temp);
5504		if (prec >= 0 && len > prec)
5505		    len = prec;
5506		break;
5507
5508	    case 'i':
5509	    case 'd':
5510	    case 'u':
5511	    case 'o':
5512	    case 'x':
5513	    case 'X':
5514		if (c == 'i')
5515		    c = 'd';
5516		if (PyLong_Check(v)) {
5517		    temp = formatlong(v, flags, prec, c);
5518		    if (!temp)
5519			goto onError;
5520		    pbuf = PyUnicode_AS_UNICODE(temp);
5521		    len = PyUnicode_GET_SIZE(temp);
5522		    /* unbounded ints can always produce
5523		       a sign character! */
5524		    sign = 1;
5525		}
5526		else {
5527		    pbuf = formatbuf;
5528		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5529				    flags, prec, c, v);
5530		    if (len < 0)
5531			goto onError;
5532		    /* only d conversion is signed */
5533		    sign = c == 'd';
5534		}
5535		if (flags & F_ZERO)
5536		    fill = '0';
5537		break;
5538
5539	    case 'e':
5540	    case 'E':
5541	    case 'f':
5542	    case 'g':
5543	    case 'G':
5544		pbuf = formatbuf;
5545		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5546			flags, prec, c, v);
5547		if (len < 0)
5548		    goto onError;
5549		sign = 1;
5550		if (flags & F_ZERO)
5551		    fill = '0';
5552		break;
5553
5554	    case 'c':
5555		pbuf = formatbuf;
5556		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5557		if (len < 0)
5558		    goto onError;
5559		break;
5560
5561	    default:
5562		PyErr_Format(PyExc_ValueError,
5563			     "unsupported format character '%c' (0x%x) "
5564			     "at index %i",
5565			     (31<=c && c<=126) ? c : '?',
5566                             c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5567		goto onError;
5568	    }
5569	    if (sign) {
5570		if (*pbuf == '-' || *pbuf == '+') {
5571		    sign = *pbuf++;
5572		    len--;
5573		}
5574		else if (flags & F_SIGN)
5575		    sign = '+';
5576		else if (flags & F_BLANK)
5577		    sign = ' ';
5578		else
5579		    sign = 0;
5580	    }
5581	    if (width < len)
5582		width = len;
5583	    if (rescnt < width + (sign != 0)) {
5584		reslen -= rescnt;
5585		rescnt = width + fmtcnt + 100;
5586		reslen += rescnt;
5587		if (_PyUnicode_Resize(&result, reslen) < 0)
5588		    return NULL;
5589		res = PyUnicode_AS_UNICODE(result)
5590		    + reslen - rescnt;
5591	    }
5592	    if (sign) {
5593		if (fill != ' ')
5594		    *res++ = sign;
5595		rescnt--;
5596		if (width > len)
5597		    width--;
5598	    }
5599	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5600		assert(pbuf[0] == '0');
5601		assert(pbuf[1] == c);
5602		if (fill != ' ') {
5603		    *res++ = *pbuf++;
5604		    *res++ = *pbuf++;
5605		}
5606		rescnt -= 2;
5607		width -= 2;
5608		if (width < 0)
5609		    width = 0;
5610		len -= 2;
5611	    }
5612	    if (width > len && !(flags & F_LJUST)) {
5613		do {
5614		    --rescnt;
5615		    *res++ = fill;
5616		} while (--width > len);
5617	    }
5618	    if (fill == ' ') {
5619		if (sign)
5620		    *res++ = sign;
5621		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5622		    assert(pbuf[0] == '0');
5623		    assert(pbuf[1] == c);
5624		    *res++ = *pbuf++;
5625		    *res++ = *pbuf++;
5626		}
5627	    }
5628	    Py_UNICODE_COPY(res, pbuf, len);
5629	    res += len;
5630	    rescnt -= len;
5631	    while (--width >= len) {
5632		--rescnt;
5633		*res++ = ' ';
5634	    }
5635	    if (dict && (argidx < arglen) && c != '%') {
5636		PyErr_SetString(PyExc_TypeError,
5637				"not all arguments converted");
5638		goto onError;
5639	    }
5640	    Py_XDECREF(temp);
5641	} /* '%' */
5642    } /* until end */
5643    if (argidx < arglen && !dict) {
5644	PyErr_SetString(PyExc_TypeError,
5645			"not all arguments converted");
5646	goto onError;
5647    }
5648
5649    if (args_owned) {
5650	Py_DECREF(args);
5651    }
5652    Py_DECREF(uformat);
5653    if (_PyUnicode_Resize(&result, reslen - rescnt))
5654	goto onError;
5655    return (PyObject *)result;
5656
5657 onError:
5658    Py_XDECREF(result);
5659    Py_DECREF(uformat);
5660    if (args_owned) {
5661	Py_DECREF(args);
5662    }
5663    return NULL;
5664}
5665
5666static PyBufferProcs unicode_as_buffer = {
5667    (getreadbufferproc) unicode_buffer_getreadbuf,
5668    (getwritebufferproc) unicode_buffer_getwritebuf,
5669    (getsegcountproc) unicode_buffer_getsegcount,
5670    (getcharbufferproc) unicode_buffer_getcharbuf,
5671};
5672
5673staticforward PyObject *
5674unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5675
5676static PyObject *
5677unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5678{
5679        PyObject *x = NULL;
5680	static char *kwlist[] = {"string", "encoding", "errors", 0};
5681	char *encoding = NULL;
5682	char *errors = NULL;
5683
5684	if (type != &PyUnicode_Type)
5685		return unicode_subtype_new(type, args, kwds);
5686	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5687					  kwlist, &x, &encoding, &errors))
5688	    return NULL;
5689	if (x == NULL)
5690		return (PyObject *)_PyUnicode_New(0);
5691	if (encoding == NULL && errors == NULL)
5692	    return PyObject_Unicode(x);
5693	else
5694	return PyUnicode_FromEncodedObject(x, encoding, errors);
5695}
5696
5697static PyObject *
5698unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5699{
5700	PyUnicodeObject *tmp, *pnew;
5701	int n;
5702
5703	assert(PyType_IsSubtype(type, &PyUnicode_Type));
5704	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5705	if (tmp == NULL)
5706		return NULL;
5707	assert(PyUnicode_Check(tmp));
5708	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5709	if (pnew == NULL)
5710		return NULL;
5711	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5712	if (pnew->str == NULL) {
5713		_Py_ForgetReference((PyObject *)pnew);
5714		PyMalloc_Del(pnew);
5715		return NULL;
5716	}
5717	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5718	pnew->length = n;
5719	pnew->hash = tmp->hash;
5720	Py_DECREF(tmp);
5721	return (PyObject *)pnew;
5722}
5723
/* Documentation string installed in the tp_doc slot of PyUnicode_Type. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5730
5731PyTypeObject PyUnicode_Type = {
5732    PyObject_HEAD_INIT(&PyType_Type)
5733    0, 					/* ob_size */
5734    "unicode", 				/* tp_name */
5735    sizeof(PyUnicodeObject), 		/* tp_size */
5736    0, 					/* tp_itemsize */
5737    /* Slots */
5738    (destructor)unicode_dealloc, 	/* tp_dealloc */
5739    0, 					/* tp_print */
5740    0,				 	/* tp_getattr */
5741    0, 					/* tp_setattr */
5742    (cmpfunc) unicode_compare, 		/* tp_compare */
5743    (reprfunc) unicode_repr, 		/* tp_repr */
5744    0, 					/* tp_as_number */
5745    &unicode_as_sequence, 		/* tp_as_sequence */
5746    0, 					/* tp_as_mapping */
5747    (hashfunc) unicode_hash, 		/* tp_hash*/
5748    0, 					/* tp_call*/
5749    (reprfunc) unicode_str,	 	/* tp_str */
5750    PyObject_GenericGetAttr, 		/* tp_getattro */
5751    0,			 		/* tp_setattro */
5752    &unicode_as_buffer,			/* tp_as_buffer */
5753    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
5754    unicode_doc,			/* tp_doc */
5755    0,					/* tp_traverse */
5756    0,					/* tp_clear */
5757    0,					/* tp_richcompare */
5758    0,					/* tp_weaklistoffset */
5759    0,					/* tp_iter */
5760    0,					/* tp_iternext */
5761    unicode_methods,			/* tp_methods */
5762    0,					/* tp_members */
5763    0,					/* tp_getset */
5764    0,					/* tp_base */
5765    0,					/* tp_dict */
5766    0,					/* tp_descr_get */
5767    0,					/* tp_descr_set */
5768    0,					/* tp_dictoffset */
5769    0,					/* tp_init */
5770    0,					/* tp_alloc */
5771    unicode_new,			/* tp_new */
5772    _PyMalloc_Del,			/* tp_free */
5773};
5774
5775/* Initialize the Unicode implementation */
5776
5777void _PyUnicode_Init(void)
5778{
5779    int i;
5780
5781    /* Init the implementation */
5782    unicode_freelist = NULL;
5783    unicode_freelist_size = 0;
5784    unicode_empty = _PyUnicode_New(0);
5785    strcpy(unicode_default_encoding, "ascii");
5786    for (i = 0; i < 256; i++)
5787	unicode_latin1[i] = NULL;
5788}
5789
5790/* Finalize the Unicode implementation */
5791
5792void
5793_PyUnicode_Fini(void)
5794{
5795    PyUnicodeObject *u;
5796    int i;
5797
5798    Py_XDECREF(unicode_empty);
5799    unicode_empty = NULL;
5800
5801    for (i = 0; i < 256; i++) {
5802	if (unicode_latin1[i]) {
5803	    Py_DECREF(unicode_latin1[i]);
5804	    unicode_latin1[i] = NULL;
5805	}
5806    }
5807
5808    for (u = unicode_freelist; u != NULL;) {
5809	PyUnicodeObject *v = u;
5810	u = *(PyUnicodeObject **)u;
5811	if (v->str)
5812	    PyMem_DEL(v->str);
5813	Py_XDECREF(v->defenc);
5814	PyMalloc_Del(v);
5815    }
5816    unicode_freelist = NULL;
5817    unicode_freelist_size = 0;
5818}
5819