unicodeobject.c revision 0ebac97058baad8250adf710f287e8fb8770f7fa
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9--------------------------------------------------------------------
10The original string type implementation is:
11
12    Copyright (c) 1999 by Secret Labs AB
13    Copyright (c) 1999 by Fredrik Lundh
14
15By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
38
39#include "Python.h"
40
41#include "unicodeobject.h"
42#include "ucnhash.h"
43
44#ifdef MS_WIN32
45#include <windows.h>
46#endif
47
/* Limit for the Unicode object free list: unicode_dealloc() stops
   recycling objects once this many are parked on the list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list keep-alive optimization.

   The implementation keeps the allocated character buffer intact for
   every object on the free list whose length is less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches: this block defines BYTEORDER_IS_BIG_ENDIAN when
   WORDS_BIGENDIAN is set and BYTEORDER_IS_LITTLE_ENDIAN otherwise, so
   little endian is the default. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
79/* --- Globals ------------------------------------------------------------
80
81   The globals are initialized by the _PyUnicode_Init() API and should
82   not be used before calling that API.
83
84*/
85
/* Free list for Unicode objects.

   Deallocated objects are chained through their first pointer-sized
   word (see unicode_dealloc() and _PyUnicode_New());
   unicode_freelist_size counts the entries and is capped at
   MAX_UNICODE_FREELIST_SIZE. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance.
   It stays NULL until it has been set up, which is why the
   constructors test it against NULL before handing it out. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well. Slots are filled on demand by
   PyUnicode_FromUnicode(); an unused slot is NULL. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
105
106Py_UNICODE
107PyUnicode_GetMax(void)
108{
109#ifdef Py_UNICODE_WIDE
110	return 0x10FFFF;
111#else
112	/* This is actually an illegal character, so it should
113	   not be passed to unichr. */
114	return 0xFFFF;
115#endif
116}
117
118/* --- Unicode Object ----------------------------------------------------- */
119
120static
121int unicode_resize(register PyUnicodeObject *unicode,
122                      int length)
123{
124    void *oldstr;
125
126    /* Shortcut if there's nothing much to do. */
127    if (unicode->length == length)
128	goto reset;
129
130    /* Resizing shared object (unicode_empty or single character
131       objects) in-place is not allowed. Use PyUnicode_Resize()
132       instead ! */
133    if (unicode == unicode_empty ||
134	(unicode->length == 1 &&
135	 unicode->str[0] < 256 &&
136	 unicode_latin1[unicode->str[0]] == unicode)) {
137        PyErr_SetString(PyExc_SystemError,
138                        "can't resize shared unicode objects");
139        return -1;
140    }
141
142    /* We allocate one more byte to make sure the string is
143       Ux0000 terminated -- XXX is this needed ? */
144    oldstr = unicode->str;
145    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146    if (!unicode->str) {
147	unicode->str = oldstr;
148        PyErr_NoMemory();
149        return -1;
150    }
151    unicode->str[length] = 0;
152    unicode->length = length;
153
154 reset:
155    /* Reset the object caches */
156    if (unicode->defenc) {
157        Py_DECREF(unicode->defenc);
158        unicode->defenc = NULL;
159    }
160    unicode->hash = -1;
161
162    return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166   Ux0000 terminated -- XXX is this needed ?
167
168   XXX This allocator could further be enhanced by assuring that the
169       free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176    register PyUnicodeObject *unicode;
177
178    /* Optimization for empty strings */
179    if (length == 0 && unicode_empty != NULL) {
180        Py_INCREF(unicode_empty);
181        return unicode_empty;
182    }
183
184    /* Unicode freelist & memory allocation */
185    if (unicode_freelist) {
186        unicode = unicode_freelist;
187        unicode_freelist = *(PyUnicodeObject **)unicode;
188        unicode_freelist_size--;
189	if (unicode->str) {
190	    /* Keep-Alive optimization: we only upsize the buffer,
191	       never downsize it. */
192	    if ((unicode->length < length) &&
193		unicode_resize(unicode, length)) {
194		PyMem_DEL(unicode->str);
195		goto onError;
196	    }
197	}
198        else {
199	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
200        }
201        PyObject_INIT(unicode, &PyUnicode_Type);
202    }
203    else {
204        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
205        if (unicode == NULL)
206            return NULL;
207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208    }
209
210    if (!unicode->str) {
211	PyErr_NoMemory();
212	goto onError;
213    }
214    unicode->str[length] = 0;
215    unicode->length = length;
216    unicode->hash = -1;
217    unicode->defenc = NULL;
218    return unicode;
219
220 onError:
221    _Py_ForgetReference((PyObject *)unicode);
222    PyObject_Del(unicode);
223    return NULL;
224}
225
226static
227void unicode_dealloc(register PyUnicodeObject *unicode)
228{
229    if (PyUnicode_CheckExact(unicode) &&
230	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
231        /* Keep-Alive optimization */
232	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
233	    PyMem_DEL(unicode->str);
234	    unicode->str = NULL;
235	    unicode->length = 0;
236	}
237	if (unicode->defenc) {
238	    Py_DECREF(unicode->defenc);
239	    unicode->defenc = NULL;
240	}
241	/* Add to free list */
242        *(PyUnicodeObject **)unicode = unicode_freelist;
243        unicode_freelist = unicode;
244        unicode_freelist_size++;
245    }
246    else {
247	PyMem_DEL(unicode->str);
248	Py_XDECREF(unicode->defenc);
249	unicode->ob_type->tp_free((PyObject *)unicode);
250    }
251}
252
253int PyUnicode_Resize(PyObject **unicode,
254		     int length)
255{
256    register PyUnicodeObject *v;
257
258    /* Argument checks */
259    if (unicode == NULL) {
260	PyErr_BadInternalCall();
261	return -1;
262    }
263    v = (PyUnicodeObject *)*unicode;
264    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265	PyErr_BadInternalCall();
266	return -1;
267    }
268
269    /* Resizing unicode_empty and single character objects is not
270       possible since these are being shared. We simply return a fresh
271       copy with the same Unicode content. */
272    if (v->length != length &&
273	(v == unicode_empty || v->length == 1)) {
274	PyUnicodeObject *w = _PyUnicode_New(length);
275	if (w == NULL)
276	    return -1;
277	Py_UNICODE_COPY(w->str, v->str,
278			length < v->length ? length : v->length);
279	*unicode = (PyObject *)w;
280	return 0;
281    }
282
283    /* Note that we don't have to modify *unicode for unshared Unicode
284       objects, since we can modify them in-place. */
285    return unicode_resize(v, length);
286}
287
288/* Internal API for use in unicodeobject.c only ! */
289#define _PyUnicode_Resize(unicodevar, length) \
290        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293				int size)
294{
295    PyUnicodeObject *unicode;
296
297    /* If the Unicode data is known at construction time, we can apply
298       some optimizations which share commonly used objects. */
299    if (u != NULL) {
300
301	/* Optimization for empty strings */
302	if (size == 0 && unicode_empty != NULL) {
303	    Py_INCREF(unicode_empty);
304	    return (PyObject *)unicode_empty;
305	}
306
307	/* Single character Unicode objects in the Latin-1 range are
308	   shared when using this constructor */
309	if (size == 1 && *u < 256) {
310	    unicode = unicode_latin1[*u];
311	    if (!unicode) {
312		unicode = _PyUnicode_New(1);
313		if (!unicode)
314		    return NULL;
315		unicode->str[0] = *u;
316		unicode_latin1[*u] = unicode;
317	    }
318	    Py_INCREF(unicode);
319	    return (PyObject *)unicode;
320	}
321    }
322
323    unicode = _PyUnicode_New(size);
324    if (!unicode)
325        return NULL;
326
327    /* Copy the Unicode data into the new object */
328    if (u != NULL)
329	Py_UNICODE_COPY(unicode->str, u, size);
330
331    return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337				 int size)
338{
339    PyUnicodeObject *unicode;
340
341    if (w == NULL) {
342	PyErr_BadInternalCall();
343	return NULL;
344    }
345
346    unicode = _PyUnicode_New(size);
347    if (!unicode)
348        return NULL;
349
350    /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352    memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354    {
355	register Py_UNICODE *u;
356	register int i;
357	u = PyUnicode_AS_UNICODE(unicode);
358	for (i = size; i >= 0; i--)
359	    *u++ = *w++;
360    }
361#endif
362
363    return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367			 register wchar_t *w,
368			 int size)
369{
370    if (unicode == NULL) {
371	PyErr_BadInternalCall();
372	return -1;
373    }
374    if (size > PyUnicode_GET_SIZE(unicode))
375	size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377    memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379    {
380	register Py_UNICODE *u;
381	register int i;
382	u = PyUnicode_AS_UNICODE(unicode);
383	for (i = size; i >= 0; i--)
384	    *w++ = *u++;
385    }
386#endif
387
388    return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
395    /* XXX Perhaps we should make this API an alias of
396           PyObject_Unicode() instead ?! */
397    if (PyUnicode_CheckExact(obj)) {
398	Py_INCREF(obj);
399	return obj;
400    }
401    if (PyUnicode_Check(obj)) {
402	/* For a Unicode subtype that's not a Unicode object,
403	   return a true Unicode object with the same data. */
404	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405				     PyUnicode_GET_SIZE(obj));
406    }
407    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411				      const char *encoding,
412				      const char *errors)
413{
414    const char *s = NULL;
415    int len;
416    int owned = 0;
417    PyObject *v;
418
419    if (obj == NULL) {
420	PyErr_BadInternalCall();
421	return NULL;
422    }
423
424#if 0
425    /* For b/w compatibility we also accept Unicode objects provided
426       that no encodings is given and then redirect to
427       PyObject_Unicode() which then applies the additional logic for
428       Unicode subclasses.
429
430       NOTE: This API should really only be used for object which
431             represent *encoded* Unicode !
432
433    */
434	if (PyUnicode_Check(obj)) {
435	    if (encoding) {
436		PyErr_SetString(PyExc_TypeError,
437				"decoding Unicode is not supported");
438	    return NULL;
439	    }
440	return PyObject_Unicode(obj);
441	    }
442#else
443    if (PyUnicode_Check(obj)) {
444	PyErr_SetString(PyExc_TypeError,
445			"decoding Unicode is not supported");
446	return NULL;
447	}
448#endif
449
450    /* Coerce object */
451    if (PyString_Check(obj)) {
452	    s = PyString_AS_STRING(obj);
453	    len = PyString_GET_SIZE(obj);
454	    }
455    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456	/* Overwrite the error message with something more useful in
457	   case of a TypeError. */
458	if (PyErr_ExceptionMatches(PyExc_TypeError))
459	PyErr_Format(PyExc_TypeError,
460			 "coercing to Unicode: need string or buffer, "
461			 "%.80s found",
462		     obj->ob_type->tp_name);
463	goto onError;
464    }
465
466    /* Convert to Unicode */
467    if (len == 0) {
468	Py_INCREF(unicode_empty);
469	v = (PyObject *)unicode_empty;
470    }
471    else
472	v = PyUnicode_Decode(s, len, encoding, errors);
473
474    if (owned) {
475	Py_DECREF(obj);
476    }
477    return v;
478
479 onError:
480    if (owned) {
481	Py_DECREF(obj);
482    }
483    return NULL;
484}
485
486PyObject *PyUnicode_Decode(const char *s,
487			   int size,
488			   const char *encoding,
489			   const char *errors)
490{
491    PyObject *buffer = NULL, *unicode;
492
493    if (encoding == NULL)
494	encoding = PyUnicode_GetDefaultEncoding();
495
496    /* Shortcuts for common default encodings */
497    if (strcmp(encoding, "utf-8") == 0)
498        return PyUnicode_DecodeUTF8(s, size, errors);
499    else if (strcmp(encoding, "latin-1") == 0)
500        return PyUnicode_DecodeLatin1(s, size, errors);
501    else if (strcmp(encoding, "ascii") == 0)
502        return PyUnicode_DecodeASCII(s, size, errors);
503
504    /* Decode via the codec registry */
505    buffer = PyBuffer_FromMemory((void *)s, size);
506    if (buffer == NULL)
507        goto onError;
508    unicode = PyCodec_Decode(buffer, encoding, errors);
509    if (unicode == NULL)
510        goto onError;
511    if (!PyUnicode_Check(unicode)) {
512        PyErr_Format(PyExc_TypeError,
513                     "decoder did not return an unicode object (type=%.400s)",
514                     unicode->ob_type->tp_name);
515        Py_DECREF(unicode);
516        goto onError;
517    }
518    Py_DECREF(buffer);
519    return unicode;
520
521 onError:
522    Py_XDECREF(buffer);
523    return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527			   int size,
528			   const char *encoding,
529			   const char *errors)
530{
531    PyObject *v, *unicode;
532
533    unicode = PyUnicode_FromUnicode(s, size);
534    if (unicode == NULL)
535	return NULL;
536    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537    Py_DECREF(unicode);
538    return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542                                    const char *encoding,
543                                    const char *errors)
544{
545    PyObject *v;
546
547    if (!PyUnicode_Check(unicode)) {
548        PyErr_BadArgument();
549        goto onError;
550    }
551
552    if (encoding == NULL)
553	encoding = PyUnicode_GetDefaultEncoding();
554
555    /* Shortcuts for common default encodings */
556    if (errors == NULL) {
557	if (strcmp(encoding, "utf-8") == 0)
558	    return PyUnicode_AsUTF8String(unicode);
559	else if (strcmp(encoding, "latin-1") == 0)
560	    return PyUnicode_AsLatin1String(unicode);
561	else if (strcmp(encoding, "ascii") == 0)
562	    return PyUnicode_AsASCIIString(unicode);
563    }
564
565    /* Encode via the codec registry */
566    v = PyCodec_Encode(unicode, encoding, errors);
567    if (v == NULL)
568        goto onError;
569    /* XXX Should we really enforce this ? */
570    if (!PyString_Check(v)) {
571        PyErr_Format(PyExc_TypeError,
572                     "encoder did not return a string object (type=%.400s)",
573                     v->ob_type->tp_name);
574        Py_DECREF(v);
575        goto onError;
576    }
577    return v;
578
579 onError:
580    return NULL;
581}
582
583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584					    const char *errors)
585{
586    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588    if (v)
589        return v;
590    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591    if (v && errors == NULL)
592        ((PyUnicodeObject *)unicode)->defenc = v;
593    return v;
594}
595
596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598    if (!PyUnicode_Check(unicode)) {
599        PyErr_BadArgument();
600        goto onError;
601    }
602    return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605    return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610    if (!PyUnicode_Check(unicode)) {
611        PyErr_BadArgument();
612        goto onError;
613    }
614    return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617    return -1;
618}
619
620const char *PyUnicode_GetDefaultEncoding(void)
621{
622    return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627    PyObject *v;
628
629    /* Make sure the encoding is valid. As side effect, this also
630       loads the encoding into the codec registry cache. */
631    v = _PyCodec_Lookup(encoding);
632    if (v == NULL)
633	goto onError;
634    Py_DECREF(v);
635    strncpy(unicode_default_encoding,
636	    encoding,
637	    sizeof(unicode_default_encoding));
638    return 0;
639
640 onError:
641    return -1;
642}
643
644/* --- UTF-7 Codec -------------------------------------------------------- */
645
646/* see RFC2152 for details */
647
/* Classification of the 128 ASCII characters for the UTF-7 codec.
   The table is only ever read (through the SPECIAL() macro), hence
   const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
	   0 - not special
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional) */
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
666
/* Helper macros for the UTF-7 codec. They are #undef'ed again after the
   codec functions. Note that all of them evaluate their arguments more
   than once. */

/* True if character c has to be base64 encoded: everything outside
   ASCII, the always-special characters, and optionally whitespace
   (encodeWS) and RFC2152 Set O (encodeO). */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c) > 127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 digit. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit. isalnum() is only defined for values
   representable as unsigned char and is locale dependent above 127, so
   it is only consulted for ASCII values; c may be any Py_UNICODE. */
#define B64CHAR(c) (((unsigned long)(c) < 128UL && isalnum(c)) || \
                    (c) == '+' || (c) == '/')

/* Map a base64 digit back to its 6-bit value; c must satisfy
   B64CHAR(). */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 digits to out for as long as at least 6 of the `bits`
   pending bits held in the low end of ch are available. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit code units to out for as long as at least 16 of the
   `bits` pending bits held in ch are available. Relies on the labels
   and variables of PyUnicode_DecodeUTF7() (errmsg, utf7Error). */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct \
               or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
701
702static
703int utf7_decoding_error(Py_UNICODE **dest,
704                        const char *errors,
705                        const char *details)
706{
707    if ((errors == NULL) ||
708        (strcmp(errors,"strict") == 0)) {
709        PyErr_Format(PyExc_UnicodeError,
710                     "UTF-7 decoding error: %.400s",
711                     details);
712        return -1;
713    }
714    else if (strcmp(errors,"ignore") == 0) {
715        return 0;
716    }
717    else if (strcmp(errors,"replace") == 0) {
718        if (dest != NULL) {
719            **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720            (*dest)++;
721        }
722        return 0;
723    }
724    else {
725        PyErr_Format(PyExc_ValueError,
726                     "UTF-7 decoding error; unknown error handling code: %.400s",
727                     errors);
728        return -1;
729    }
730}
731
/* Decode `size` bytes of UTF-7 data (see RFC2152) starting at `s`.

   `errors` selects the error handling policy and is interpreted by
   utf7_decoding_error(). The result buffer is allocated with `size`
   code units up front and trimmed to the number of units actually
   written before returning.

   Returns a new Unicode object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF7(const char *s,
			       int size,
			       const char *errors)
{
    const char *e;                  /* one past the last input byte */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;                  /* next output position */
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a '+...' base64 run */
    unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
    unsigned long charsleft = 0;    /* accumulator for decoded base64 bits */
	int surrogate = 0;              /* state flag maintained by DECODE() */

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    p = unicode->str;
    e = s + size;

    while (s < e) {
        /* NOTE(review): *s is a plain char; where char is signed, bytes
           above 0x7f presumably turn into large Py_UNICODE values here
           -- confirm that this is intended. */
        Py_UNICODE ch = *s;

        if (inShift) {
            /* A '-' or any non-base64 character ends the shift
               sequence. */
            if ((ch == '-') || !B64CHAR(ch)) {
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here so indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* NOTE(review): a second '-' right after the
                       terminator emits '-' and switches back into shift
                       mode without consuming it -- confirm that this is
                       the intended behavior. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
	                goto utf7Error;
                } else  {
                    /* The terminating character is itself literal data. */
                    *p++ = ch;
                }
            } else {
                /* Another base64 digit: shift in its 6 bits and emit
                   any complete 16-bit units. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            s++;
            /* "+-" is the escape for a literal '+'; any other '+'
               opens a shift sequence. */
            if (s < e && *s == '-') {
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            errmsg = "unexpected special character";
            s++;
	        goto utf7Error;
        }
        else {
            /* Directly encoded character. */
            *p++ = ch;
            s++;
        }
        continue;
    /* Error exit of the loop body: only reached via goto. The loop
       carries on with the next character unless the error policy
       raised an exception. */
    utf7Error:
      if (utf7_decoding_error(&p, errors, errmsg))
          goto onError;
    }

    /* Input ended while still inside a shift sequence. */
    if (inShift) {
        if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
            goto onError;
    }

    /* Trim the object to the number of code units written. */
    if (_PyUnicode_Resize(&unicode, p - unicode->str))
        goto onError;

    return (PyObject *)unicode;

onError:
    Py_DECREF(unicode);
    return NULL;
}
839
840
/* Encode `size` code units starting at `s` as UTF-7 (see RFC2152).

   encodeSetO and encodeWhiteSpace are passed to SPECIAL(): when
   non-zero, RFC2152 Set O characters respectively whitespace are base64
   encoded instead of being written directly. `errors` is not referenced
   by this function.

   Returns a new string object, or NULL if the output string cannot be
   allocated. */
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                   int size,
                   int encodeSetO,
                   int encodeWhiteSpace,
                   const char *errors)
{
    PyObject *v;
    /* It might be possible to tighten this worst case */
    unsigned int cbAllocated = 5 * size;
    int inShift = 0;                /* non-zero while a '+...' base64 run is open */
    int i = 0;
    unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
    unsigned long charsleft = 0;    /* bits waiting to be written as base64 */
    char * out;
    char * start;

    if (size == 0)
		return PyString_FromStringAndSize(NULL, 0);

    v = PyString_FromStringAndSize(NULL, cbAllocated);
    if (v == NULL)
        return NULL;

    start = out = PyString_AS_STRING(v);
    for (;i < size; ++i) {
        Py_UNICODE ch = s[i];

        if (!inShift) {
			if (ch == '+') {
				/* A literal '+' is escaped as "+-". */
				*out++ = '+';
                *out++ = '-';
            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* Open a shift sequence and write as many complete
                   base64 digits of this character as possible. */
                charsleft = ch;
                bitsleft = 16;
                *out++ = '+';
				/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
                inShift = bitsleft > 0;
			} else {
				*out++ = (char) ch;
			}
		} else {
            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* Leaving the shift sequence: flush the pending bits,
                   padded on the right, as one last base64 digit. */
                *out++ = B64(charsleft << (6-bitsleft));
                charsleft = 0;
                bitsleft = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (B64CHAR(ch) || ch == '-') {
                    *out++ = '-';
                }
                inShift = 0;
                *out++ = (char) ch;
            } else {
                bitsleft += 16;
                charsleft = (charsleft << 16) | ch;
                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);

                /* If the next character is special then we don't need to terminate
                   the shift sequence. If the next character is not a BASE64 character
                   or '-' then the shift sequence will be terminated implicitly and we
                   don't have to insert a '-'. */

                if (bitsleft == 0) {
                    if (i + 1 < size) {
                        Py_UNICODE ch2 = s[i+1];

                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {

                        } else if (B64CHAR(ch2) || ch2 == '-') {
                            *out++ = '-';
                            inShift = 0;
                        } else {
                            inShift = 0;
                        }

                    }
                    else {
                        /* Last character of the input: close the
                           sequence explicitly. */
                        *out++ = '-';
                        inShift = 0;
                    }
                }
            }
        }
	}
    /* Flush bits still pending at the end of the input and close the
       shift sequence. */
    if (bitsleft) {
        *out++= B64(charsleft << (6-bitsleft) );
        *out++ = '-';
    }

    /* Trim the string object to the number of bytes written. */
    _PyString_Resize(&v, out - start);
    return v;
}

/* The UTF-7 helper macros are private to the codec functions above. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
940
941/* --- UTF-8 Codec -------------------------------------------------------- */
942
/* Lookup table used by the UTF-8 decoder; it is only ever read, hence
   const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00 - 0x7f: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six byte sequences; 0xfe and 0xff
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
964
965static
966int utf8_decoding_error(const char **source,
967                        Py_UNICODE **dest,
968                        const char *errors,
969                        const char *details)
970{
971    if ((errors == NULL) ||
972        (strcmp(errors,"strict") == 0)) {
973        PyErr_Format(PyExc_UnicodeError,
974                     "UTF-8 decoding error: %.400s",
975                     details);
976        return -1;
977    }
978    else if (strcmp(errors,"ignore") == 0) {
979        (*source)++;
980        return 0;
981    }
982    else if (strcmp(errors,"replace") == 0) {
983        (*source)++;
984        **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
985        (*dest)++;
986        return 0;
987    }
988    else {
989        PyErr_Format(PyExc_ValueError,
990                     "UTF-8 decoding error; unknown error handling code: %.400s",
991                     errors);
992        return -1;
993    }
994}
995
996PyObject *PyUnicode_DecodeUTF8(const char *s,
997			       int size,
998			       const char *errors)
999{
1000    int n;
1001    const char *e;
1002    PyUnicodeObject *unicode;
1003    Py_UNICODE *p;
1004    const char *errmsg = "";
1005
1006    /* Note: size will always be longer than the resulting Unicode
1007       character count */
1008    unicode = _PyUnicode_New(size);
1009    if (!unicode)
1010        return NULL;
1011    if (size == 0)
1012        return (PyObject *)unicode;
1013
1014    /* Unpack UTF-8 encoded data */
1015    p = unicode->str;
1016    e = s + size;
1017
1018    while (s < e) {
1019        Py_UCS4 ch = (unsigned char)*s;
1020
1021        if (ch < 0x80) {
1022            *p++ = (Py_UNICODE)ch;
1023            s++;
1024            continue;
1025        }
1026
1027        n = utf8_code_length[ch];
1028
1029        if (s + n > e) {
1030	    errmsg = "unexpected end of data";
1031	    goto utf8Error;
1032	}
1033
1034        switch (n) {
1035
1036        case 0:
1037            errmsg = "unexpected code byte";
1038	    goto utf8Error;
1039
1040        case 1:
1041            errmsg = "internal error";
1042	    goto utf8Error;
1043
1044        case 2:
1045            if ((s[1] & 0xc0) != 0x80) {
1046                errmsg = "invalid data";
1047		goto utf8Error;
1048	    }
1049            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1050            if (ch < 0x80) {
1051                errmsg = "illegal encoding";
1052		goto utf8Error;
1053	    }
1054	    else
1055		*p++ = (Py_UNICODE)ch;
1056            break;
1057
1058        case 3:
1059            if ((s[1] & 0xc0) != 0x80 ||
1060                (s[2] & 0xc0) != 0x80) {
1061                errmsg = "invalid data";
1062		goto utf8Error;
1063	    }
1064            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1065            if (ch < 0x0800) {
1066		/* Note: UTF-8 encodings of surrogates are considered
1067		   legal UTF-8 sequences;
1068
1069		   XXX For wide builds (UCS-4) we should probably try
1070		       to recombine the surrogates into a single code
1071		       unit.
1072		*/
1073                errmsg = "illegal encoding";
1074		goto utf8Error;
1075	    }
1076	    else
1077		*p++ = (Py_UNICODE)ch;
1078            break;
1079
1080        case 4:
1081            if ((s[1] & 0xc0) != 0x80 ||
1082                (s[2] & 0xc0) != 0x80 ||
1083                (s[3] & 0xc0) != 0x80) {
1084                errmsg = "invalid data";
1085		goto utf8Error;
1086	    }
1087            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1088                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1089            /* validate and convert to UTF-16 */
1090            if ((ch < 0x10000)        /* minimum value allowed for 4
1091					 byte encoding */
1092                || (ch > 0x10ffff))   /* maximum value allowed for
1093					 UTF-16 */
1094	    {
1095                errmsg = "illegal encoding";
1096		goto utf8Error;
1097	    }
1098#ifdef Py_UNICODE_WIDE
1099	    *p++ = (Py_UNICODE)ch;
1100#else
1101            /*  compute and append the two surrogates: */
1102
1103            /*  translate from 10000..10FFFF to 0..FFFF */
1104            ch -= 0x10000;
1105
1106            /*  high surrogate = top 10 bits added to D800 */
1107            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1108
1109            /*  low surrogate = bottom 10 bits added to DC00 */
1110            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1111#endif
1112            break;
1113
1114        default:
1115            /* Other sizes are only needed for UCS-4 */
1116            errmsg = "unsupported Unicode code range";
1117	    goto utf8Error;
1118        }
1119        s += n;
1120	continue;
1121
1122    utf8Error:
1123      if (utf8_decoding_error(&s, &p, errors, errmsg))
1124          goto onError;
1125    }
1126
1127    /* Adjust length */
1128    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1129        goto onError;
1130
1131    return (PyObject *)unicode;
1132
1133onError:
1134    Py_DECREF(unicode);
1135    return NULL;
1136}
1137
1138/* Allocation strategy:  if the string is short, convert into a stack buffer
1139   and allocate exactly as much space needed at the end.  Else allocate the
1140   maximum possible needed (4 result bytes per Unicode character), and return
1141   the excess memory at the end.
1142*/
1143PyObject *
1144PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1145		     int size,
1146		     const char *errors)
1147{
1148#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1149
1150    int i;              /* index into s of next input byte */
1151    PyObject *v;        /* result string object */
1152    char *p;            /* next free byte in output buffer */
1153    int nallocated;     /* number of result bytes allocated */
1154    int nneeded;        /* number of result bytes needed */
1155    char stackbuf[MAX_SHORT_UNICHARS * 4];
1156
1157    assert(s != NULL);
1158    assert(size >= 0);
1159
1160    if (size <= MAX_SHORT_UNICHARS) {
1161        /* Write into the stack buffer; nallocated can't overflow.
1162         * At the end, we'll allocate exactly as much heap space as it
1163         * turns out we need.
1164         */
1165        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1166        v = NULL;   /* will allocate after we're done */
1167        p = stackbuf;
1168    }
1169    else {
1170        /* Overallocate on the heap, and give the excess back at the end. */
1171        nallocated = size * 4;
1172        if (nallocated / 4 != size)  /* overflow! */
1173            return PyErr_NoMemory();
1174        v = PyString_FromStringAndSize(NULL, nallocated);
1175        if (v == NULL)
1176            return NULL;
1177        p = PyString_AS_STRING(v);
1178    }
1179
1180    for (i = 0; i < size;) {
1181        Py_UCS4 ch = s[i++];
1182
1183        if (ch < 0x80)
1184            /* Encode ASCII */
1185            *p++ = (char) ch;
1186
1187        else if (ch < 0x0800) {
1188            /* Encode Latin-1 */
1189            *p++ = (char)(0xc0 | (ch >> 6));
1190            *p++ = (char)(0x80 | (ch & 0x3f));
1191        }
1192        else {
1193            /* Encode UCS2 Unicode ordinals */
1194            if (ch < 0x10000) {
1195                /* Special case: check for high surrogate */
1196                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1197                    Py_UCS4 ch2 = s[i];
1198                    /* Check for low surrogate and combine the two to
1199                       form a UCS4 value */
1200                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1201                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1202                        i++;
1203                        goto encodeUCS4;
1204                    }
1205                    /* Fall through: handles isolated high surrogates */
1206                }
1207                *p++ = (char)(0xe0 | (ch >> 12));
1208                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1209                *p++ = (char)(0x80 | (ch & 0x3f));
1210                continue;
1211    	    }
1212encodeUCS4:
1213            /* Encode UCS4 Unicode ordinals */
1214            *p++ = (char)(0xf0 | (ch >> 18));
1215            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1216            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1217            *p++ = (char)(0x80 | (ch & 0x3f));
1218        }
1219    }
1220
1221    if (v == NULL) {
1222        /* This was stack allocated. */
1223        nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1224        assert(nneeded <= nallocated);
1225        v = PyString_FromStringAndSize(stackbuf, nneeded);
1226    }
1227    else {
1228    	/* Cut back to size actually needed. */
1229        nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1230        assert(nneeded <= nallocated);
1231        _PyString_Resize(&v, nneeded);
1232    }
1233    return v;
1234
1235#undef MAX_SHORT_UNICHARS
1236}
1237
1238PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1239{
1240    if (!PyUnicode_Check(unicode)) {
1241        PyErr_BadArgument();
1242        return NULL;
1243    }
1244    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1245				PyUnicode_GET_SIZE(unicode),
1246				NULL);
1247}
1248
1249/* --- UTF-16 Codec ------------------------------------------------------- */
1250
1251static
1252int utf16_decoding_error(Py_UNICODE **dest,
1253			 const char *errors,
1254			 const char *details)
1255{
1256    if ((errors == NULL) ||
1257        (strcmp(errors,"strict") == 0)) {
1258        PyErr_Format(PyExc_UnicodeError,
1259                     "UTF-16 decoding error: %.400s",
1260                     details);
1261        return -1;
1262    }
1263    else if (strcmp(errors,"ignore") == 0) {
1264        return 0;
1265    }
1266    else if (strcmp(errors,"replace") == 0) {
1267	if (dest) {
1268	    **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1269	    (*dest)++;
1270	}
1271        return 0;
1272    }
1273    else {
1274        PyErr_Format(PyExc_ValueError,
1275                     "UTF-16 decoding error; "
1276		     "unknown error handling code: %.400s",
1277                     errors);
1278        return -1;
1279    }
1280}
1281
1282PyObject *
1283PyUnicode_DecodeUTF16(const char *s,
1284		      int size,
1285		      const char *errors,
1286		      int *byteorder)
1287{
1288    PyUnicodeObject *unicode;
1289    Py_UNICODE *p;
1290    const unsigned char *q, *e;
1291    int bo = 0;       /* assume native ordering by default */
1292    const char *errmsg = "";
1293    /* Offsets from q for retrieving byte pairs in the right order. */
1294#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1295    int ihi = 1, ilo = 0;
1296#else
1297    int ihi = 0, ilo = 1;
1298#endif
1299
1300    /* size should be an even number */
1301    if (size & 1) {
1302        if (utf16_decoding_error(NULL, errors, "truncated data"))
1303            return NULL;
1304        --size;  /* else ignore the oddball byte */
1305    }
1306
1307    /* Note: size will always be longer than the resulting Unicode
1308       character count */
1309    unicode = _PyUnicode_New(size);
1310    if (!unicode)
1311        return NULL;
1312    if (size == 0)
1313        return (PyObject *)unicode;
1314
1315    /* Unpack UTF-16 encoded data */
1316    p = unicode->str;
1317    q = (unsigned char *)s;
1318    e = q + size;
1319
1320    if (byteorder)
1321        bo = *byteorder;
1322
1323    /* Check for BOM marks (U+FEFF) in the input and adjust current
1324       byte order setting accordingly. In native mode, the leading BOM
1325       mark is skipped, in all other modes, it is copied to the output
1326       stream as-is (giving a ZWNBSP character). */
1327    if (bo == 0) {
1328        const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1329#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1330	if (bom == 0xFEFF) {
1331	    q += 2;
1332	    bo = -1;
1333	}
1334        else if (bom == 0xFFFE) {
1335	    q += 2;
1336	    bo = 1;
1337	}
1338#else
1339	if (bom == 0xFEFF) {
1340	    q += 2;
1341	    bo = 1;
1342	}
1343        else if (bom == 0xFFFE) {
1344	    q += 2;
1345	    bo = -1;
1346	}
1347#endif
1348    }
1349
1350    if (bo == -1) {
1351        /* force LE */
1352        ihi = 1;
1353        ilo = 0;
1354    }
1355    else if (bo == 1) {
1356        /* force BE */
1357        ihi = 0;
1358        ilo = 1;
1359    }
1360
1361    while (q < e) {
1362	Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1363	q += 2;
1364
1365	if (ch < 0xD800 || ch > 0xDFFF) {
1366	    *p++ = ch;
1367	    continue;
1368	}
1369
1370	/* UTF-16 code pair: */
1371	if (q >= e) {
1372	    errmsg = "unexpected end of data";
1373	    goto utf16Error;
1374	}
1375	if (0xD800 <= ch && ch <= 0xDBFF) {
1376	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1377	    q += 2;
1378	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1379#ifndef Py_UNICODE_WIDE
1380		*p++ = ch;
1381		*p++ = ch2;
1382#else
1383		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1384#endif
1385		continue;
1386	    }
1387	    else {
1388                errmsg = "illegal UTF-16 surrogate";
1389		goto utf16Error;
1390	    }
1391
1392	}
1393	errmsg = "illegal encoding";
1394	/* Fall through to report the error */
1395
1396    utf16Error:
1397	if (utf16_decoding_error(&p, errors, errmsg))
1398	    goto onError;
1399    }
1400
1401    if (byteorder)
1402        *byteorder = bo;
1403
1404    /* Adjust length */
1405    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1406        goto onError;
1407
1408    return (PyObject *)unicode;
1409
1410onError:
1411    Py_DECREF(unicode);
1412    return NULL;
1413}
1414
1415PyObject *
1416PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1417		      int size,
1418		      const char *errors,
1419		      int byteorder)
1420{
1421    PyObject *v;
1422    unsigned char *p;
1423    int i, pairs;
1424    /* Offsets from p for storing byte pairs in the right order. */
1425#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1426    int ihi = 1, ilo = 0;
1427#else
1428    int ihi = 0, ilo = 1;
1429#endif
1430
1431#define STORECHAR(CH)                   \
1432    do {                                \
1433        p[ihi] = ((CH) >> 8) & 0xff;    \
1434        p[ilo] = (CH) & 0xff;           \
1435        p += 2;                         \
1436    } while(0)
1437
1438    for (i = pairs = 0; i < size; i++)
1439	if (s[i] >= 0x10000)
1440	    pairs++;
1441    v = PyString_FromStringAndSize(NULL,
1442		  2 * (size + pairs + (byteorder == 0)));
1443    if (v == NULL)
1444        return NULL;
1445
1446    p = (unsigned char *)PyString_AS_STRING(v);
1447    if (byteorder == 0)
1448	STORECHAR(0xFEFF);
1449    if (size == 0)
1450        return v;
1451
1452    if (byteorder == -1) {
1453        /* force LE */
1454        ihi = 1;
1455        ilo = 0;
1456    }
1457    else if (byteorder == 1) {
1458        /* force BE */
1459        ihi = 0;
1460        ilo = 1;
1461    }
1462
1463    while (size-- > 0) {
1464	Py_UNICODE ch = *s++;
1465	Py_UNICODE ch2 = 0;
1466	if (ch >= 0x10000) {
1467	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1468	    ch  = 0xD800 | ((ch-0x10000) >> 10);
1469	}
1470        STORECHAR(ch);
1471        if (ch2)
1472            STORECHAR(ch2);
1473    }
1474    return v;
1475#undef STORECHAR
1476}
1477
1478PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1479{
1480    if (!PyUnicode_Check(unicode)) {
1481        PyErr_BadArgument();
1482        return NULL;
1483    }
1484    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1485				 PyUnicode_GET_SIZE(unicode),
1486				 NULL,
1487				 0);
1488}
1489
1490/* --- Unicode Escape Codec ----------------------------------------------- */
1491
1492static
1493int unicodeescape_decoding_error(Py_UNICODE **x,
1494                                 const char *errors,
1495                                 const char *details)
1496{
1497    if ((errors == NULL) ||
1498        (strcmp(errors,"strict") == 0)) {
1499        PyErr_Format(PyExc_UnicodeError,
1500                     "Unicode-Escape decoding error: %.400s",
1501                     details);
1502        return -1;
1503    }
1504    else if (strcmp(errors,"ignore") == 0) {
1505        return 0;
1506    }
1507    else if (strcmp(errors,"replace") == 0) {
1508        **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1509	(*x)++;
1510        return 0;
1511    }
1512    else {
1513        PyErr_Format(PyExc_ValueError,
1514                     "Unicode-Escape decoding error; "
1515                     "unknown error handling code: %.400s",
1516                     errors);
1517        return -1;
1518    }
1519}
1520
1521static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1522
1523PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1524					int size,
1525					const char *errors)
1526{
1527    PyUnicodeObject *v;
1528    Py_UNICODE *p, *buf;
1529    const char *end;
1530    char* message;
1531    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1532
1533    /* Escaped strings will always be longer than the resulting
1534       Unicode string, so we start with size here and then reduce the
1535       length after conversion to the true value. */
1536    v = _PyUnicode_New(size);
1537    if (v == NULL)
1538        goto onError;
1539    if (size == 0)
1540        return (PyObject *)v;
1541
1542    p = buf = PyUnicode_AS_UNICODE(v);
1543    end = s + size;
1544
1545    while (s < end) {
1546        unsigned char c;
1547        Py_UNICODE x;
1548        int i, digits;
1549
1550        /* Non-escape characters are interpreted as Unicode ordinals */
1551        if (*s != '\\') {
1552            *p++ = (unsigned char) *s++;
1553            continue;
1554        }
1555
1556        /* \ - Escapes */
1557        s++;
1558        switch (*s++) {
1559
1560        /* \x escapes */
1561        case '\n': break;
1562        case '\\': *p++ = '\\'; break;
1563        case '\'': *p++ = '\''; break;
1564        case '\"': *p++ = '\"'; break;
1565        case 'b': *p++ = '\b'; break;
1566        case 'f': *p++ = '\014'; break; /* FF */
1567        case 't': *p++ = '\t'; break;
1568        case 'n': *p++ = '\n'; break;
1569        case 'r': *p++ = '\r'; break;
1570        case 'v': *p++ = '\013'; break; /* VT */
1571        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1572
1573        /* \OOO (octal) escapes */
1574        case '0': case '1': case '2': case '3':
1575        case '4': case '5': case '6': case '7':
1576            x = s[-1] - '0';
1577            if ('0' <= *s && *s <= '7') {
1578                x = (x<<3) + *s++ - '0';
1579                if ('0' <= *s && *s <= '7')
1580                    x = (x<<3) + *s++ - '0';
1581            }
1582            *p++ = x;
1583            break;
1584
1585        /* hex escapes */
1586        /* \xXX */
1587        case 'x':
1588            digits = 2;
1589            message = "truncated \\xXX escape";
1590            goto hexescape;
1591
1592        /* \uXXXX */
1593        case 'u':
1594            digits = 4;
1595            message = "truncated \\uXXXX escape";
1596            goto hexescape;
1597
1598        /* \UXXXXXXXX */
1599        case 'U':
1600            digits = 8;
1601            message = "truncated \\UXXXXXXXX escape";
1602        hexescape:
1603            chr = 0;
1604            for (i = 0; i < digits; i++) {
1605                c = (unsigned char) s[i];
1606                if (!isxdigit(c)) {
1607                    if (unicodeescape_decoding_error(&p, errors, message))
1608                        goto onError;
1609                    chr = 0xffffffff;
1610                    i++;
1611                    break;
1612                }
1613                chr = (chr<<4) & ~0xF;
1614                if (c >= '0' && c <= '9')
1615                    chr += c - '0';
1616                else if (c >= 'a' && c <= 'f')
1617                    chr += 10 + c - 'a';
1618                else
1619                    chr += 10 + c - 'A';
1620            }
1621            s += i;
1622            if (chr == 0xffffffff)
1623                    /* _decoding_error will have already written into the
1624                       target buffer. */
1625                    break;
1626        store:
1627            /* when we get here, chr is a 32-bit unicode character */
1628            if (chr <= 0xffff)
1629                /* UCS-2 character */
1630                *p++ = (Py_UNICODE) chr;
1631            else if (chr <= 0x10ffff) {
1632                /* UCS-4 character. Either store directly, or as
1633                   surrogate pair. */
1634#ifdef Py_UNICODE_WIDE
1635                *p++ = chr;
1636#else
1637                chr -= 0x10000L;
1638                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1639                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1640#endif
1641            } else {
1642                if (unicodeescape_decoding_error(
1643                    &p, errors,
1644                    "illegal Unicode character")
1645                    )
1646                    goto onError;
1647            }
1648            break;
1649
1650        /* \N{name} */
1651        case 'N':
1652            message = "malformed \\N character escape";
1653            if (ucnhash_CAPI == NULL) {
1654                /* load the unicode data module */
1655                PyObject *m, *v;
1656                m = PyImport_ImportModule("unicodedata");
1657                if (m == NULL)
1658                    goto ucnhashError;
1659                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1660                Py_DECREF(m);
1661                if (v == NULL)
1662                    goto ucnhashError;
1663                ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1664                Py_DECREF(v);
1665                if (ucnhash_CAPI == NULL)
1666                    goto ucnhashError;
1667            }
1668            if (*s == '{') {
1669                const char *start = s+1;
1670                /* look for the closing brace */
1671                while (*s != '}' && s < end)
1672                    s++;
1673                if (s > start && s < end && *s == '}') {
1674                    /* found a name.  look it up in the unicode database */
1675                    message = "unknown Unicode character name";
1676                    s++;
1677                    if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1678                        goto store;
1679                }
1680            }
1681            if (unicodeescape_decoding_error(&p, errors, message))
1682                goto onError;
1683            break;
1684
1685        default:
1686            if (s > end) {
1687                if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1688                    goto onError;
1689            }
1690            else {
1691                *p++ = '\\';
1692                *p++ = (unsigned char)s[-1];
1693            }
1694            break;
1695        }
1696    }
1697    if (_PyUnicode_Resize(&v, (int)(p - buf)))
1698                goto onError;
1699    return (PyObject *)v;
1700
1701ucnhashError:
1702    PyErr_SetString(
1703        PyExc_UnicodeError,
1704        "\\N escapes not supported (can't load unicodedata module)"
1705        );
1706    return NULL;
1707
1708onError:
1709    Py_XDECREF(v);
1710    return NULL;
1711}
1712
1713/* Return a Unicode-Escape string version of the Unicode object.
1714
1715   If quotes is true, the string is enclosed in u"" or u'' quotes as
1716   appropriate.
1717
1718*/
1719
1720static const Py_UNICODE *findchar(const Py_UNICODE *s,
1721				  int size,
1722				  Py_UNICODE ch);
1723
1724static
1725PyObject *unicodeescape_string(const Py_UNICODE *s,
1726                               int size,
1727                               int quotes)
1728{
1729    PyObject *repr;
1730    char *p;
1731
1732    static const char *hexdigit = "0123456789abcdef";
1733
1734    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1735    if (repr == NULL)
1736        return NULL;
1737
1738    p = PyString_AS_STRING(repr);
1739
1740    if (quotes) {
1741        *p++ = 'u';
1742        *p++ = (findchar(s, size, '\'') &&
1743                !findchar(s, size, '"')) ? '"' : '\'';
1744    }
1745    while (size-- > 0) {
1746        Py_UNICODE ch = *s++;
1747
1748        /* Escape quotes */
1749        if (quotes &&
1750	    (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1751            *p++ = '\\';
1752            *p++ = (char) ch;
1753	    continue;
1754        }
1755
1756#ifdef Py_UNICODE_WIDE
1757        /* Map 21-bit characters to '\U00xxxxxx' */
1758        else if (ch >= 0x10000) {
1759	    int offset = p - PyString_AS_STRING(repr);
1760
1761	    /* Resize the string if necessary */
1762	    if (offset + 12 > PyString_GET_SIZE(repr)) {
1763		if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1764		    return NULL;
1765		p = PyString_AS_STRING(repr) + offset;
1766	    }
1767
1768            *p++ = '\\';
1769            *p++ = 'U';
1770            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1771            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1772            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1773            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1774            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1775            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1776            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1777            *p++ = hexdigit[ch & 0x0000000F];
1778	    continue;
1779        }
1780#endif
1781	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1782	else if (ch >= 0xD800 && ch < 0xDC00) {
1783	    Py_UNICODE ch2;
1784	    Py_UCS4 ucs;
1785
1786	    ch2 = *s++;
1787	    size--;
1788	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1789		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1790		*p++ = '\\';
1791		*p++ = 'U';
1792		*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1793		*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1794		*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1795		*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1796		*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1797		*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1798		*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1799		*p++ = hexdigit[ucs & 0x0000000F];
1800		continue;
1801	    }
1802	    /* Fall through: isolated surrogates are copied as-is */
1803	    s--;
1804	    size++;
1805	}
1806
1807        /* Map 16-bit characters to '\uxxxx' */
1808        if (ch >= 256) {
1809            *p++ = '\\';
1810            *p++ = 'u';
1811            *p++ = hexdigit[(ch >> 12) & 0x000F];
1812            *p++ = hexdigit[(ch >> 8) & 0x000F];
1813            *p++ = hexdigit[(ch >> 4) & 0x000F];
1814            *p++ = hexdigit[ch & 0x000F];
1815        }
1816
1817        /* Map special whitespace to '\t', \n', '\r' */
1818        else if (ch == '\t') {
1819            *p++ = '\\';
1820            *p++ = 't';
1821        }
1822        else if (ch == '\n') {
1823            *p++ = '\\';
1824            *p++ = 'n';
1825        }
1826        else if (ch == '\r') {
1827            *p++ = '\\';
1828            *p++ = 'r';
1829        }
1830
1831        /* Map non-printable US ASCII to '\xhh' */
1832        else if (ch < ' ' || ch >= 0x7F) {
1833            *p++ = '\\';
1834            *p++ = 'x';
1835            *p++ = hexdigit[(ch >> 4) & 0x000F];
1836            *p++ = hexdigit[ch & 0x000F];
1837        }
1838
1839        /* Copy everything else as-is */
1840        else
1841            *p++ = (char) ch;
1842    }
1843    if (quotes)
1844        *p++ = PyString_AS_STRING(repr)[1];
1845
1846    *p = '\0';
1847    _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
1848    return repr;
1849}
1850
1851PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1852					int size)
1853{
1854    return unicodeescape_string(s, size, 0);
1855}
1856
1857PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1858{
1859    if (!PyUnicode_Check(unicode)) {
1860        PyErr_BadArgument();
1861        return NULL;
1862    }
1863    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1864					 PyUnicode_GET_SIZE(unicode));
1865}
1866
1867/* --- Raw Unicode Escape Codec ------------------------------------------- */
1868
1869PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1870					   int size,
1871					   const char *errors)
1872{
1873    PyUnicodeObject *v;
1874    Py_UNICODE *p, *buf;
1875    const char *end;
1876    const char *bs;
1877
1878    /* Escaped strings will always be longer than the resulting
1879       Unicode string, so we start with size here and then reduce the
1880       length after conversion to the true value. */
1881    v = _PyUnicode_New(size);
1882    if (v == NULL)
1883	goto onError;
1884    if (size == 0)
1885	return (PyObject *)v;
1886    p = buf = PyUnicode_AS_UNICODE(v);
1887    end = s + size;
1888    while (s < end) {
1889	unsigned char c;
1890	Py_UCS4 x;
1891	int i;
1892
1893	/* Non-escape characters are interpreted as Unicode ordinals */
1894	if (*s != '\\') {
1895	    *p++ = (unsigned char)*s++;
1896	    continue;
1897	}
1898
1899	/* \u-escapes are only interpreted iff the number of leading
1900	   backslashes if odd */
1901	bs = s;
1902	for (;s < end;) {
1903	    if (*s != '\\')
1904		break;
1905	    *p++ = (unsigned char)*s++;
1906	}
1907	if (((s - bs) & 1) == 0 ||
1908	    s >= end ||
1909	    *s != 'u') {
1910	    continue;
1911	}
1912	p--;
1913	s++;
1914
1915	/* \uXXXX with 4 hex digits */
1916	for (x = 0, i = 0; i < 4; i++) {
1917	    c = (unsigned char)s[i];
1918	    if (!isxdigit(c)) {
1919		if (unicodeescape_decoding_error(&p, errors,
1920						 "truncated \\uXXXX"))
1921		    goto onError;
1922		x = 0xffffffff;
1923		i++;
1924		break;
1925	    }
1926	    x = (x<<4) & ~0xF;
1927	    if (c >= '0' && c <= '9')
1928		x += c - '0';
1929	    else if (c >= 'a' && c <= 'f')
1930		x += 10 + c - 'a';
1931	    else
1932		x += 10 + c - 'A';
1933	}
1934	s += i;
1935	if (x != 0xffffffff)
1936		*p++ = x;
1937    }
1938    if (_PyUnicode_Resize(&v, (int)(p - buf)))
1939	goto onError;
1940    return (PyObject *)v;
1941
1942 onError:
1943    Py_XDECREF(v);
1944    return NULL;
1945}
1946
1947PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1948					   int size)
1949{
1950    PyObject *repr;
1951    char *p;
1952    char *q;
1953
1954    static const char *hexdigit = "0123456789abcdef";
1955
1956    repr = PyString_FromStringAndSize(NULL, 6 * size);
1957    if (repr == NULL)
1958        return NULL;
1959    if (size == 0)
1960	return repr;
1961
1962    p = q = PyString_AS_STRING(repr);
1963    while (size-- > 0) {
1964        Py_UNICODE ch = *s++;
1965	/* Map 16-bit characters to '\uxxxx' */
1966	if (ch >= 256) {
1967            *p++ = '\\';
1968            *p++ = 'u';
1969            *p++ = hexdigit[(ch >> 12) & 0xf];
1970            *p++ = hexdigit[(ch >> 8) & 0xf];
1971            *p++ = hexdigit[(ch >> 4) & 0xf];
1972            *p++ = hexdigit[ch & 15];
1973        }
1974	/* Copy everything else as-is */
1975	else
1976            *p++ = (char) ch;
1977    }
1978    *p = '\0';
1979    _PyString_Resize(&repr, p - q);
1980    return repr;
1981}
1982
1983PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1984{
1985    if (!PyUnicode_Check(unicode)) {
1986	PyErr_BadArgument();
1987	return NULL;
1988    }
1989    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1990					    PyUnicode_GET_SIZE(unicode));
1991}
1992
1993/* --- Latin-1 Codec ------------------------------------------------------ */
1994
1995PyObject *PyUnicode_DecodeLatin1(const char *s,
1996				 int size,
1997				 const char *errors)
1998{
1999    PyUnicodeObject *v;
2000    Py_UNICODE *p;
2001
2002    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2003    if (size == 1 && *(unsigned char*)s < 256) {
2004	Py_UNICODE r = *(unsigned char*)s;
2005	return PyUnicode_FromUnicode(&r, 1);
2006    }
2007
2008    v = _PyUnicode_New(size);
2009    if (v == NULL)
2010	goto onError;
2011    if (size == 0)
2012	return (PyObject *)v;
2013    p = PyUnicode_AS_UNICODE(v);
2014    while (size-- > 0)
2015	*p++ = (unsigned char)*s++;
2016    return (PyObject *)v;
2017
2018 onError:
2019    Py_XDECREF(v);
2020    return NULL;
2021}
2022
2023static
2024int latin1_encoding_error(const Py_UNICODE **source,
2025			  char **dest,
2026			  const char *errors,
2027			  const char *details)
2028{
2029    if ((errors == NULL) ||
2030	(strcmp(errors,"strict") == 0)) {
2031	PyErr_Format(PyExc_UnicodeError,
2032		     "Latin-1 encoding error: %.400s",
2033		     details);
2034	return -1;
2035    }
2036    else if (strcmp(errors,"ignore") == 0) {
2037	return 0;
2038    }
2039    else if (strcmp(errors,"replace") == 0) {
2040	**dest = '?';
2041	(*dest)++;
2042	return 0;
2043    }
2044    else {
2045	PyErr_Format(PyExc_ValueError,
2046		     "Latin-1 encoding error; "
2047		     "unknown error handling code: %.400s",
2048		     errors);
2049	return -1;
2050    }
2051}
2052
2053PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2054				 int size,
2055				 const char *errors)
2056{
2057    PyObject *repr;
2058    char *s, *start;
2059
2060    repr = PyString_FromStringAndSize(NULL, size);
2061    if (repr == NULL)
2062        return NULL;
2063    if (size == 0)
2064	return repr;
2065
2066    s = PyString_AS_STRING(repr);
2067    start = s;
2068    while (size-- > 0) {
2069        Py_UNICODE ch = *p++;
2070	if (ch >= 256) {
2071	    if (latin1_encoding_error(&p, &s, errors,
2072				      "ordinal not in range(256)"))
2073		goto onError;
2074	}
2075	else
2076            *s++ = (char)ch;
2077    }
2078    /* Resize if error handling skipped some characters */
2079    if (s - start < PyString_GET_SIZE(repr))
2080	_PyString_Resize(&repr, s - start);
2081    return repr;
2082
2083 onError:
2084    Py_DECREF(repr);
2085    return NULL;
2086}
2087
2088PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2089{
2090    if (!PyUnicode_Check(unicode)) {
2091	PyErr_BadArgument();
2092	return NULL;
2093    }
2094    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2095				  PyUnicode_GET_SIZE(unicode),
2096				  NULL);
2097}
2098
2099/* --- 7-bit ASCII Codec -------------------------------------------------- */
2100
2101static
2102int ascii_decoding_error(const char **source,
2103			 Py_UNICODE **dest,
2104			 const char *errors,
2105			 const char *details)
2106{
2107    if ((errors == NULL) ||
2108	(strcmp(errors,"strict") == 0)) {
2109	PyErr_Format(PyExc_UnicodeError,
2110		     "ASCII decoding error: %.400s",
2111		     details);
2112	return -1;
2113    }
2114    else if (strcmp(errors,"ignore") == 0) {
2115	return 0;
2116    }
2117    else if (strcmp(errors,"replace") == 0) {
2118	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2119	(*dest)++;
2120	return 0;
2121    }
2122    else {
2123	PyErr_Format(PyExc_ValueError,
2124		     "ASCII decoding error; "
2125		     "unknown error handling code: %.400s",
2126		     errors);
2127	return -1;
2128    }
2129}
2130
2131PyObject *PyUnicode_DecodeASCII(const char *s,
2132				int size,
2133				const char *errors)
2134{
2135    PyUnicodeObject *v;
2136    Py_UNICODE *p;
2137
2138    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2139    if (size == 1 && *(unsigned char*)s < 128) {
2140	Py_UNICODE r = *(unsigned char*)s;
2141	return PyUnicode_FromUnicode(&r, 1);
2142    }
2143
2144    v = _PyUnicode_New(size);
2145    if (v == NULL)
2146	goto onError;
2147    if (size == 0)
2148	return (PyObject *)v;
2149    p = PyUnicode_AS_UNICODE(v);
2150    while (size-- > 0) {
2151	register unsigned char c;
2152
2153	c = (unsigned char)*s++;
2154	if (c < 128)
2155	    *p++ = c;
2156	else if (ascii_decoding_error(&s, &p, errors,
2157				      "ordinal not in range(128)"))
2158		goto onError;
2159    }
2160    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2161	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2162	    goto onError;
2163    return (PyObject *)v;
2164
2165 onError:
2166    Py_XDECREF(v);
2167    return NULL;
2168}
2169
2170static
2171int ascii_encoding_error(const Py_UNICODE **source,
2172			 char **dest,
2173			 const char *errors,
2174			 const char *details)
2175{
2176    if ((errors == NULL) ||
2177	(strcmp(errors,"strict") == 0)) {
2178	PyErr_Format(PyExc_UnicodeError,
2179		     "ASCII encoding error: %.400s",
2180		     details);
2181	return -1;
2182    }
2183    else if (strcmp(errors,"ignore") == 0) {
2184	return 0;
2185    }
2186    else if (strcmp(errors,"replace") == 0) {
2187	**dest = '?';
2188	(*dest)++;
2189	return 0;
2190    }
2191    else {
2192	PyErr_Format(PyExc_ValueError,
2193		     "ASCII encoding error; "
2194		     "unknown error handling code: %.400s",
2195		     errors);
2196	return -1;
2197    }
2198}
2199
2200PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2201				int size,
2202				const char *errors)
2203{
2204    PyObject *repr;
2205    char *s, *start;
2206
2207    repr = PyString_FromStringAndSize(NULL, size);
2208    if (repr == NULL)
2209        return NULL;
2210    if (size == 0)
2211	return repr;
2212
2213    s = PyString_AS_STRING(repr);
2214    start = s;
2215    while (size-- > 0) {
2216        Py_UNICODE ch = *p++;
2217	if (ch >= 128) {
2218	    if (ascii_encoding_error(&p, &s, errors,
2219				      "ordinal not in range(128)"))
2220		goto onError;
2221	}
2222	else
2223            *s++ = (char)ch;
2224    }
2225    /* Resize if error handling skipped some characters */
2226    if (s - start < PyString_GET_SIZE(repr))
2227	_PyString_Resize(&repr, s - start);
2228    return repr;
2229
2230 onError:
2231    Py_DECREF(repr);
2232    return NULL;
2233}
2234
2235PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2236{
2237    if (!PyUnicode_Check(unicode)) {
2238	PyErr_BadArgument();
2239	return NULL;
2240    }
2241    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2242				 PyUnicode_GET_SIZE(unicode),
2243				 NULL);
2244}
2245
2246#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
2247
2248/* --- MBCS codecs for Windows -------------------------------------------- */
2249
2250PyObject *PyUnicode_DecodeMBCS(const char *s,
2251				int size,
2252				const char *errors)
2253{
2254    PyUnicodeObject *v;
2255    Py_UNICODE *p;
2256
2257    /* First get the size of the result */
2258    DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2259    if (size > 0 && usize==0)
2260        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2261
2262    v = _PyUnicode_New(usize);
2263    if (v == NULL)
2264        return NULL;
2265    if (usize == 0)
2266	return (PyObject *)v;
2267    p = PyUnicode_AS_UNICODE(v);
2268    if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2269        Py_DECREF(v);
2270        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2271    }
2272
2273    return (PyObject *)v;
2274}
2275
2276PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2277				int size,
2278				const char *errors)
2279{
2280    PyObject *repr;
2281    char *s;
2282    DWORD mbcssize;
2283
2284    /* If there are no characters, bail now! */
2285    if (size==0)
2286	    return PyString_FromString("");
2287
2288    /* First get the size of the result */
2289    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2290    if (mbcssize==0)
2291        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2292
2293    repr = PyString_FromStringAndSize(NULL, mbcssize);
2294    if (repr == NULL)
2295        return NULL;
2296    if (mbcssize == 0)
2297        return repr;
2298
2299    /* Do the conversion */
2300    s = PyString_AS_STRING(repr);
2301    if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2302        Py_DECREF(repr);
2303        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2304    }
2305    return repr;
2306}
2307
2308#endif /* MS_WIN32 */
2309
2310/* --- Character Mapping Codec -------------------------------------------- */
2311
2312static
2313int charmap_decoding_error(const char **source,
2314			 Py_UNICODE **dest,
2315			 const char *errors,
2316			 const char *details)
2317{
2318    if ((errors == NULL) ||
2319	(strcmp(errors,"strict") == 0)) {
2320	PyErr_Format(PyExc_UnicodeError,
2321		     "charmap decoding error: %.400s",
2322		     details);
2323	return -1;
2324    }
2325    else if (strcmp(errors,"ignore") == 0) {
2326	return 0;
2327    }
2328    else if (strcmp(errors,"replace") == 0) {
2329	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2330	(*dest)++;
2331	return 0;
2332    }
2333    else {
2334	PyErr_Format(PyExc_ValueError,
2335		     "charmap decoding error; "
2336		     "unknown error handling code: %.400s",
2337		     errors);
2338	return -1;
2339    }
2340}
2341
2342PyObject *PyUnicode_DecodeCharmap(const char *s,
2343				  int size,
2344				  PyObject *mapping,
2345				  const char *errors)
2346{
2347    PyUnicodeObject *v;
2348    Py_UNICODE *p;
2349    int extrachars = 0;
2350
2351    /* Default to Latin-1 */
2352    if (mapping == NULL)
2353	return PyUnicode_DecodeLatin1(s, size, errors);
2354
2355    v = _PyUnicode_New(size);
2356    if (v == NULL)
2357	goto onError;
2358    if (size == 0)
2359	return (PyObject *)v;
2360    p = PyUnicode_AS_UNICODE(v);
2361    while (size-- > 0) {
2362	unsigned char ch = *s++;
2363	PyObject *w, *x;
2364
2365	/* Get mapping (char ordinal -> integer, Unicode char or None) */
2366	w = PyInt_FromLong((long)ch);
2367	if (w == NULL)
2368	    goto onError;
2369	x = PyObject_GetItem(mapping, w);
2370	Py_DECREF(w);
2371	if (x == NULL) {
2372	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2373		/* No mapping found means: mapping is undefined. */
2374		PyErr_Clear();
2375		x = Py_None;
2376		Py_INCREF(x);
2377	    } else
2378		goto onError;
2379	}
2380
2381	/* Apply mapping */
2382	if (PyInt_Check(x)) {
2383	    long value = PyInt_AS_LONG(x);
2384	    if (value < 0 || value > 65535) {
2385		PyErr_SetString(PyExc_TypeError,
2386				"character mapping must be in range(65536)");
2387		Py_DECREF(x);
2388		goto onError;
2389	    }
2390	    *p++ = (Py_UNICODE)value;
2391	}
2392	else if (x == Py_None) {
2393	    /* undefined mapping */
2394	    if (charmap_decoding_error(&s, &p, errors,
2395				       "character maps to <undefined>")) {
2396		Py_DECREF(x);
2397		goto onError;
2398	    }
2399	}
2400	else if (PyUnicode_Check(x)) {
2401	    int targetsize = PyUnicode_GET_SIZE(x);
2402
2403	    if (targetsize == 1)
2404		/* 1-1 mapping */
2405		*p++ = *PyUnicode_AS_UNICODE(x);
2406
2407	    else if (targetsize > 1) {
2408		/* 1-n mapping */
2409		if (targetsize > extrachars) {
2410		    /* resize first */
2411		    int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2412		    int needed = (targetsize - extrachars) + \
2413			         (targetsize << 2);
2414		    extrachars += needed;
2415		    if (_PyUnicode_Resize(&v,
2416					 PyUnicode_GET_SIZE(v) + needed)) {
2417			Py_DECREF(x);
2418			goto onError;
2419		    }
2420		    p = PyUnicode_AS_UNICODE(v) + oldpos;
2421		}
2422		Py_UNICODE_COPY(p,
2423				PyUnicode_AS_UNICODE(x),
2424				targetsize);
2425		p += targetsize;
2426		extrachars -= targetsize;
2427	    }
2428	    /* 1-0 mapping: skip the character */
2429	}
2430	else {
2431	    /* wrong return value */
2432	    PyErr_SetString(PyExc_TypeError,
2433		  "character mapping must return integer, None or unicode");
2434	    Py_DECREF(x);
2435	    goto onError;
2436	}
2437	Py_DECREF(x);
2438    }
2439    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2440	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2441	    goto onError;
2442    return (PyObject *)v;
2443
2444 onError:
2445    Py_XDECREF(v);
2446    return NULL;
2447}
2448
2449static
2450int charmap_encoding_error(const Py_UNICODE **source,
2451			   char **dest,
2452			   const char *errors,
2453			   const char *details)
2454{
2455    if ((errors == NULL) ||
2456	(strcmp(errors,"strict") == 0)) {
2457	PyErr_Format(PyExc_UnicodeError,
2458		     "charmap encoding error: %.400s",
2459		     details);
2460	return -1;
2461    }
2462    else if (strcmp(errors,"ignore") == 0) {
2463	return 0;
2464    }
2465    else if (strcmp(errors,"replace") == 0) {
2466	**dest = '?';
2467	(*dest)++;
2468	return 0;
2469    }
2470    else {
2471	PyErr_Format(PyExc_ValueError,
2472		     "charmap encoding error; "
2473		     "unknown error handling code: %.400s",
2474		     errors);
2475	return -1;
2476    }
2477}
2478
2479PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2480				  int size,
2481				  PyObject *mapping,
2482				  const char *errors)
2483{
2484    PyObject *v;
2485    char *s;
2486    int extrachars = 0;
2487
2488    /* Default to Latin-1 */
2489    if (mapping == NULL)
2490	return PyUnicode_EncodeLatin1(p, size, errors);
2491
2492    v = PyString_FromStringAndSize(NULL, size);
2493    if (v == NULL)
2494        return NULL;
2495    if (size == 0)
2496	return v;
2497    s = PyString_AS_STRING(v);
2498    while (size-- > 0) {
2499	Py_UNICODE ch = *p++;
2500	PyObject *w, *x;
2501
2502	/* Get mapping (Unicode ordinal -> string char, integer or None) */
2503	w = PyInt_FromLong((long)ch);
2504	if (w == NULL)
2505	    goto onError;
2506	x = PyObject_GetItem(mapping, w);
2507	Py_DECREF(w);
2508	if (x == NULL) {
2509	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2510		/* No mapping found means: mapping is undefined. */
2511		PyErr_Clear();
2512		x = Py_None;
2513		Py_INCREF(x);
2514	    } else
2515		goto onError;
2516	}
2517
2518	/* Apply mapping */
2519	if (PyInt_Check(x)) {
2520	    long value = PyInt_AS_LONG(x);
2521	    if (value < 0 || value > 255) {
2522		PyErr_SetString(PyExc_TypeError,
2523				"character mapping must be in range(256)");
2524		Py_DECREF(x);
2525		goto onError;
2526	    }
2527	    *s++ = (char)value;
2528	}
2529	else if (x == Py_None) {
2530	    /* undefined mapping */
2531	    if (charmap_encoding_error(&p, &s, errors,
2532				       "character maps to <undefined>")) {
2533		Py_DECREF(x);
2534		goto onError;
2535	    }
2536	}
2537	else if (PyString_Check(x)) {
2538	    int targetsize = PyString_GET_SIZE(x);
2539
2540	    if (targetsize == 1)
2541		/* 1-1 mapping */
2542		*s++ = *PyString_AS_STRING(x);
2543
2544	    else if (targetsize > 1) {
2545		/* 1-n mapping */
2546		if (targetsize > extrachars) {
2547		    /* resize first */
2548		    int oldpos = (int)(s - PyString_AS_STRING(v));
2549		    int needed = (targetsize - extrachars) + \
2550			         (targetsize << 2);
2551		    extrachars += needed;
2552		    if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2553			Py_DECREF(x);
2554			goto onError;
2555		    }
2556		    s = PyString_AS_STRING(v) + oldpos;
2557		}
2558		memcpy(s, PyString_AS_STRING(x), targetsize);
2559		s += targetsize;
2560		extrachars -= targetsize;
2561	    }
2562	    /* 1-0 mapping: skip the character */
2563	}
2564	else {
2565	    /* wrong return value */
2566	    PyErr_SetString(PyExc_TypeError,
2567		  "character mapping must return integer, None or unicode");
2568	    Py_DECREF(x);
2569	    goto onError;
2570	}
2571	Py_DECREF(x);
2572    }
2573    if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2574	_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)));
2575    return v;
2576
2577 onError:
2578    Py_XDECREF(v);
2579    return NULL;
2580}
2581
2582PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2583				    PyObject *mapping)
2584{
2585    if (!PyUnicode_Check(unicode) || mapping == NULL) {
2586	PyErr_BadArgument();
2587	return NULL;
2588    }
2589    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2590				   PyUnicode_GET_SIZE(unicode),
2591				   mapping,
2592				   NULL);
2593}
2594
2595static
2596int translate_error(const Py_UNICODE **source,
2597		    Py_UNICODE **dest,
2598		    const char *errors,
2599		    const char *details)
2600{
2601    if ((errors == NULL) ||
2602	(strcmp(errors,"strict") == 0)) {
2603	PyErr_Format(PyExc_UnicodeError,
2604		     "translate error: %.400s",
2605		     details);
2606	return -1;
2607    }
2608    else if (strcmp(errors,"ignore") == 0) {
2609	return 0;
2610    }
2611    else if (strcmp(errors,"replace") == 0) {
2612	**dest = '?';
2613	(*dest)++;
2614	return 0;
2615    }
2616    else {
2617	PyErr_Format(PyExc_ValueError,
2618		     "translate error; "
2619		     "unknown error handling code: %.400s",
2620		     errors);
2621	return -1;
2622    }
2623}
2624
2625PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2626				     int size,
2627				     PyObject *mapping,
2628				     const char *errors)
2629{
2630    PyUnicodeObject *v;
2631    Py_UNICODE *p;
2632
2633    if (mapping == NULL) {
2634	PyErr_BadArgument();
2635	return NULL;
2636    }
2637
2638    /* Output will never be longer than input */
2639    v = _PyUnicode_New(size);
2640    if (v == NULL)
2641	goto onError;
2642    if (size == 0)
2643	goto done;
2644    p = PyUnicode_AS_UNICODE(v);
2645    while (size-- > 0) {
2646	Py_UNICODE ch = *s++;
2647	PyObject *w, *x;
2648
2649	/* Get mapping */
2650	w = PyInt_FromLong(ch);
2651	if (w == NULL)
2652	    goto onError;
2653	x = PyObject_GetItem(mapping, w);
2654	Py_DECREF(w);
2655	if (x == NULL) {
2656	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2657		/* No mapping found: default to 1-1 mapping */
2658		PyErr_Clear();
2659		*p++ = ch;
2660		continue;
2661	    }
2662	    goto onError;
2663	}
2664
2665	/* Apply mapping */
2666	if (PyInt_Check(x))
2667	    *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2668	else if (x == Py_None) {
2669	    /* undefined mapping */
2670	    if (translate_error(&s, &p, errors,
2671				"character maps to <undefined>")) {
2672		Py_DECREF(x);
2673		goto onError;
2674	    }
2675	}
2676	else if (PyUnicode_Check(x)) {
2677	    if (PyUnicode_GET_SIZE(x) != 1) {
2678		/* 1-n mapping */
2679		PyErr_SetString(PyExc_NotImplementedError,
2680				"1-n mappings are currently not implemented");
2681		Py_DECREF(x);
2682		goto onError;
2683	    }
2684	    *p++ = *PyUnicode_AS_UNICODE(x);
2685	}
2686	else {
2687	    /* wrong return value */
2688	    PyErr_SetString(PyExc_TypeError,
2689		  "translate mapping must return integer, None or unicode");
2690	    Py_DECREF(x);
2691	    goto onError;
2692	}
2693	Py_DECREF(x);
2694    }
2695    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2696	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2697	    goto onError;
2698
2699 done:
2700    return (PyObject *)v;
2701
2702 onError:
2703    Py_XDECREF(v);
2704    return NULL;
2705}
2706
2707PyObject *PyUnicode_Translate(PyObject *str,
2708			      PyObject *mapping,
2709			      const char *errors)
2710{
2711    PyObject *result;
2712
2713    str = PyUnicode_FromObject(str);
2714    if (str == NULL)
2715	goto onError;
2716    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2717					PyUnicode_GET_SIZE(str),
2718					mapping,
2719					errors);
2720    Py_DECREF(str);
2721    return result;
2722
2723 onError:
2724    Py_XDECREF(str);
2725    return NULL;
2726}
2727
2728/* --- Decimal Encoder ---------------------------------------------------- */
2729
2730int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2731			    int length,
2732			    char *output,
2733			    const char *errors)
2734{
2735    Py_UNICODE *p, *end;
2736
2737    if (output == NULL) {
2738	PyErr_BadArgument();
2739	return -1;
2740    }
2741
2742    p = s;
2743    end = s + length;
2744    while (p < end) {
2745	register Py_UNICODE ch = *p++;
2746	int decimal;
2747
2748	if (Py_UNICODE_ISSPACE(ch)) {
2749	    *output++ = ' ';
2750	    continue;
2751	}
2752	decimal = Py_UNICODE_TODECIMAL(ch);
2753	if (decimal >= 0) {
2754	    *output++ = '0' + decimal;
2755	    continue;
2756	}
2757	if (0 < ch && ch < 256) {
2758	    *output++ = (char)ch;
2759	    continue;
2760	}
2761	/* All other characters are considered invalid */
2762	if (errors == NULL || strcmp(errors, "strict") == 0) {
2763	    PyErr_SetString(PyExc_ValueError,
2764			    "invalid decimal Unicode string");
2765	    goto onError;
2766	}
2767	else if (strcmp(errors, "ignore") == 0)
2768	    continue;
2769	else if (strcmp(errors, "replace") == 0) {
2770	    *output++ = '?';
2771	    continue;
2772	}
2773    }
2774    /* 0-terminate the output string */
2775    *output++ = '\0';
2776    return 0;
2777
2778 onError:
2779    return -1;
2780}
2781
2782/* --- Helpers ------------------------------------------------------------ */
2783
2784static
2785int count(PyUnicodeObject *self,
2786	  int start,
2787	  int end,
2788	  PyUnicodeObject *substring)
2789{
2790    int count = 0;
2791
2792    if (start < 0)
2793        start += self->length;
2794    if (start < 0)
2795        start = 0;
2796    if (end > self->length)
2797        end = self->length;
2798    if (end < 0)
2799        end += self->length;
2800    if (end < 0)
2801        end = 0;
2802
2803    if (substring->length == 0)
2804	return (end - start + 1);
2805
2806    end -= substring->length;
2807
2808    while (start <= end)
2809        if (Py_UNICODE_MATCH(self, start, substring)) {
2810            count++;
2811            start += substring->length;
2812        } else
2813            start++;
2814
2815    return count;
2816}
2817
2818int PyUnicode_Count(PyObject *str,
2819		    PyObject *substr,
2820		    int start,
2821		    int end)
2822{
2823    int result;
2824
2825    str = PyUnicode_FromObject(str);
2826    if (str == NULL)
2827	return -1;
2828    substr = PyUnicode_FromObject(substr);
2829    if (substr == NULL) {
2830	Py_DECREF(str);
2831	return -1;
2832    }
2833
2834    result = count((PyUnicodeObject *)str,
2835		   start, end,
2836		   (PyUnicodeObject *)substr);
2837
2838    Py_DECREF(str);
2839    Py_DECREF(substr);
2840    return result;
2841}
2842
2843static
2844int findstring(PyUnicodeObject *self,
2845	       PyUnicodeObject *substring,
2846	       int start,
2847	       int end,
2848	       int direction)
2849{
2850    if (start < 0)
2851        start += self->length;
2852    if (start < 0)
2853        start = 0;
2854
2855    if (substring->length == 0)
2856        return start;
2857
2858    if (end > self->length)
2859        end = self->length;
2860    if (end < 0)
2861        end += self->length;
2862    if (end < 0)
2863        end = 0;
2864
2865    end -= substring->length;
2866
2867    if (direction < 0) {
2868        for (; end >= start; end--)
2869            if (Py_UNICODE_MATCH(self, end, substring))
2870                return end;
2871    } else {
2872        for (; start <= end; start++)
2873            if (Py_UNICODE_MATCH(self, start, substring))
2874                return start;
2875    }
2876
2877    return -1;
2878}
2879
2880int PyUnicode_Find(PyObject *str,
2881		   PyObject *substr,
2882		   int start,
2883		   int end,
2884		   int direction)
2885{
2886    int result;
2887
2888    str = PyUnicode_FromObject(str);
2889    if (str == NULL)
2890	return -1;
2891    substr = PyUnicode_FromObject(substr);
2892    if (substr == NULL) {
2893	Py_DECREF(substr);
2894	return -1;
2895    }
2896
2897    result = findstring((PyUnicodeObject *)str,
2898			(PyUnicodeObject *)substr,
2899			start, end, direction);
2900    Py_DECREF(str);
2901    Py_DECREF(substr);
2902    return result;
2903}
2904
2905static
2906int tailmatch(PyUnicodeObject *self,
2907	      PyUnicodeObject *substring,
2908	      int start,
2909	      int end,
2910	      int direction)
2911{
2912    if (start < 0)
2913        start += self->length;
2914    if (start < 0)
2915        start = 0;
2916
2917    if (substring->length == 0)
2918        return 1;
2919
2920    if (end > self->length)
2921        end = self->length;
2922    if (end < 0)
2923        end += self->length;
2924    if (end < 0)
2925        end = 0;
2926
2927    end -= substring->length;
2928    if (end < start)
2929	return 0;
2930
2931    if (direction > 0) {
2932	if (Py_UNICODE_MATCH(self, end, substring))
2933	    return 1;
2934    } else {
2935        if (Py_UNICODE_MATCH(self, start, substring))
2936	    return 1;
2937    }
2938
2939    return 0;
2940}
2941
2942int PyUnicode_Tailmatch(PyObject *str,
2943			PyObject *substr,
2944			int start,
2945			int end,
2946			int direction)
2947{
2948    int result;
2949
2950    str = PyUnicode_FromObject(str);
2951    if (str == NULL)
2952	return -1;
2953    substr = PyUnicode_FromObject(substr);
2954    if (substr == NULL) {
2955	Py_DECREF(substr);
2956	return -1;
2957    }
2958
2959    result = tailmatch((PyUnicodeObject *)str,
2960		       (PyUnicodeObject *)substr,
2961		       start, end, direction);
2962    Py_DECREF(str);
2963    Py_DECREF(substr);
2964    return result;
2965}
2966
2967static
2968const Py_UNICODE *findchar(const Py_UNICODE *s,
2969		     int size,
2970		     Py_UNICODE ch)
2971{
2972    /* like wcschr, but doesn't stop at NULL characters */
2973
2974    while (size-- > 0) {
2975        if (*s == ch)
2976            return s;
2977        s++;
2978    }
2979
2980    return NULL;
2981}
2982
2983/* Apply fixfct filter to the Unicode object self and return a
2984   reference to the modified object */
2985
2986static
2987PyObject *fixup(PyUnicodeObject *self,
2988		int (*fixfct)(PyUnicodeObject *s))
2989{
2990
2991    PyUnicodeObject *u;
2992
2993    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
2994    if (u == NULL)
2995	return NULL;
2996
2997    Py_UNICODE_COPY(u->str, self->str, self->length);
2998
2999    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3000	/* fixfct should return TRUE if it modified the buffer. If
3001	   FALSE, return a reference to the original buffer instead
3002	   (to save space, not time) */
3003	Py_INCREF(self);
3004	Py_DECREF(u);
3005	return (PyObject*) self;
3006    }
3007    return (PyObject*) u;
3008}
3009
3010static
3011int fixupper(PyUnicodeObject *self)
3012{
3013    int len = self->length;
3014    Py_UNICODE *s = self->str;
3015    int status = 0;
3016
3017    while (len-- > 0) {
3018	register Py_UNICODE ch;
3019
3020	ch = Py_UNICODE_TOUPPER(*s);
3021	if (ch != *s) {
3022            status = 1;
3023	    *s = ch;
3024	}
3025        s++;
3026    }
3027
3028    return status;
3029}
3030
3031static
3032int fixlower(PyUnicodeObject *self)
3033{
3034    int len = self->length;
3035    Py_UNICODE *s = self->str;
3036    int status = 0;
3037
3038    while (len-- > 0) {
3039	register Py_UNICODE ch;
3040
3041	ch = Py_UNICODE_TOLOWER(*s);
3042	if (ch != *s) {
3043            status = 1;
3044	    *s = ch;
3045	}
3046        s++;
3047    }
3048
3049    return status;
3050}
3051
3052static
3053int fixswapcase(PyUnicodeObject *self)
3054{
3055    int len = self->length;
3056    Py_UNICODE *s = self->str;
3057    int status = 0;
3058
3059    while (len-- > 0) {
3060        if (Py_UNICODE_ISUPPER(*s)) {
3061            *s = Py_UNICODE_TOLOWER(*s);
3062            status = 1;
3063        } else if (Py_UNICODE_ISLOWER(*s)) {
3064            *s = Py_UNICODE_TOUPPER(*s);
3065            status = 1;
3066        }
3067        s++;
3068    }
3069
3070    return status;
3071}
3072
3073static
3074int fixcapitalize(PyUnicodeObject *self)
3075{
3076    int len = self->length;
3077    Py_UNICODE *s = self->str;
3078    int status = 0;
3079
3080    if (len == 0)
3081	return 0;
3082    if (Py_UNICODE_ISLOWER(*s)) {
3083	*s = Py_UNICODE_TOUPPER(*s);
3084	status = 1;
3085    }
3086    s++;
3087    while (--len > 0) {
3088        if (Py_UNICODE_ISUPPER(*s)) {
3089            *s = Py_UNICODE_TOLOWER(*s);
3090            status = 1;
3091        }
3092        s++;
3093    }
3094    return status;
3095}
3096
3097static
3098int fixtitle(PyUnicodeObject *self)
3099{
3100    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3101    register Py_UNICODE *e;
3102    int previous_is_cased;
3103
3104    /* Shortcut for single character strings */
3105    if (PyUnicode_GET_SIZE(self) == 1) {
3106	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3107	if (*p != ch) {
3108	    *p = ch;
3109	    return 1;
3110	}
3111	else
3112	    return 0;
3113    }
3114
3115    e = p + PyUnicode_GET_SIZE(self);
3116    previous_is_cased = 0;
3117    for (; p < e; p++) {
3118	register const Py_UNICODE ch = *p;
3119
3120	if (previous_is_cased)
3121	    *p = Py_UNICODE_TOLOWER(ch);
3122	else
3123	    *p = Py_UNICODE_TOTITLE(ch);
3124
3125	if (Py_UNICODE_ISLOWER(ch) ||
3126	    Py_UNICODE_ISUPPER(ch) ||
3127	    Py_UNICODE_ISTITLE(ch))
3128	    previous_is_cased = 1;
3129	else
3130	    previous_is_cased = 0;
3131    }
3132    return 1;
3133}
3134
3135PyObject *PyUnicode_Join(PyObject *separator,
3136			 PyObject *seq)
3137{
3138    Py_UNICODE *sep;
3139    int seplen;
3140    PyUnicodeObject *res = NULL;
3141    int reslen = 0;
3142    Py_UNICODE *p;
3143    int sz = 100;
3144    int i;
3145    PyObject *it;
3146
3147    it = PyObject_GetIter(seq);
3148    if (it == NULL)
3149        return NULL;
3150
3151    if (separator == NULL) {
3152	Py_UNICODE blank = ' ';
3153	sep = &blank;
3154	seplen = 1;
3155    }
3156    else {
3157	separator = PyUnicode_FromObject(separator);
3158	if (separator == NULL)
3159	    goto onError;
3160	sep = PyUnicode_AS_UNICODE(separator);
3161	seplen = PyUnicode_GET_SIZE(separator);
3162    }
3163
3164    res = _PyUnicode_New(sz);
3165    if (res == NULL)
3166	goto onError;
3167    p = PyUnicode_AS_UNICODE(res);
3168    reslen = 0;
3169
3170    for (i = 0; ; ++i) {
3171	int itemlen;
3172	PyObject *item = PyIter_Next(it);
3173	if (item == NULL) {
3174	    if (PyErr_Occurred())
3175		goto onError;
3176	    break;
3177	}
3178	if (!PyUnicode_Check(item)) {
3179	    PyObject *v;
3180	    if (!PyString_Check(item)) {
3181		PyErr_Format(PyExc_TypeError,
3182			     "sequence item %i: expected string or Unicode,"
3183			     " %.80s found",
3184			     i, item->ob_type->tp_name);
3185		Py_DECREF(item);
3186		goto onError;
3187	    }
3188	    v = PyUnicode_FromObject(item);
3189	    Py_DECREF(item);
3190	    item = v;
3191	    if (item == NULL)
3192		goto onError;
3193	}
3194	itemlen = PyUnicode_GET_SIZE(item);
3195	while (reslen + itemlen + seplen >= sz) {
3196	    if (_PyUnicode_Resize(&res, sz*2)) {
3197		Py_DECREF(item);
3198		goto onError;
3199	    }
3200	    sz *= 2;
3201	    p = PyUnicode_AS_UNICODE(res) + reslen;
3202	}
3203	if (i > 0) {
3204	    Py_UNICODE_COPY(p, sep, seplen);
3205	    p += seplen;
3206	    reslen += seplen;
3207	}
3208	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3209	p += itemlen;
3210	reslen += itemlen;
3211	Py_DECREF(item);
3212    }
3213    if (_PyUnicode_Resize(&res, reslen))
3214	goto onError;
3215
3216    Py_XDECREF(separator);
3217    Py_DECREF(it);
3218    return (PyObject *)res;
3219
3220 onError:
3221    Py_XDECREF(separator);
3222    Py_XDECREF(res);
3223    Py_DECREF(it);
3224    return NULL;
3225}
3226
3227static
3228PyUnicodeObject *pad(PyUnicodeObject *self,
3229		     int left,
3230		     int right,
3231		     Py_UNICODE fill)
3232{
3233    PyUnicodeObject *u;
3234
3235    if (left < 0)
3236        left = 0;
3237    if (right < 0)
3238        right = 0;
3239
3240    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3241        Py_INCREF(self);
3242        return self;
3243    }
3244
3245    u = _PyUnicode_New(left + self->length + right);
3246    if (u) {
3247        if (left)
3248            Py_UNICODE_FILL(u->str, fill, left);
3249        Py_UNICODE_COPY(u->str + left, self->str, self->length);
3250        if (right)
3251            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3252    }
3253
3254    return u;
3255}
3256
/* Append the slice data[left:right] to list as a new Unicode object.

   The expanding function must provide a PyObject *str local, a
   PyObject *list variable and an onError label; the macro jumps to
   onError when the slice cannot be created or appended.  The body is
   wrapped in do { } while (0) so that an invocation followed by ';'
   behaves like one ordinary statement. */
#define SPLIT_APPEND(data, left, right)					\
    do {								\
	str = PyUnicode_FromUnicode((data) + (left),			\
				    (right) - (left));			\
	if (!str)							\
	    goto onError;						\
	if (PyList_Append(list, str)) {					\
	    Py_DECREF(str);						\
	    goto onError;						\
	}								\
	Py_DECREF(str);							\
    } while (0)
3267
3268static
3269PyObject *split_whitespace(PyUnicodeObject *self,
3270			   PyObject *list,
3271			   int maxcount)
3272{
3273    register int i;
3274    register int j;
3275    int len = self->length;
3276    PyObject *str;
3277
3278    for (i = j = 0; i < len; ) {
3279	/* find a token */
3280	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3281	    i++;
3282	j = i;
3283	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3284	    i++;
3285	if (j < i) {
3286	    if (maxcount-- <= 0)
3287		break;
3288	    SPLIT_APPEND(self->str, j, i);
3289	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3290		i++;
3291	    j = i;
3292	}
3293    }
3294    if (j < len) {
3295	SPLIT_APPEND(self->str, j, len);
3296    }
3297    return list;
3298
3299 onError:
3300    Py_DECREF(list);
3301    return NULL;
3302}
3303
3304PyObject *PyUnicode_Splitlines(PyObject *string,
3305			       int keepends)
3306{
3307    register int i;
3308    register int j;
3309    int len;
3310    PyObject *list;
3311    PyObject *str;
3312    Py_UNICODE *data;
3313
3314    string = PyUnicode_FromObject(string);
3315    if (string == NULL)
3316	return NULL;
3317    data = PyUnicode_AS_UNICODE(string);
3318    len = PyUnicode_GET_SIZE(string);
3319
3320    list = PyList_New(0);
3321    if (!list)
3322        goto onError;
3323
3324    for (i = j = 0; i < len; ) {
3325	int eol;
3326
3327	/* Find a line and append it */
3328	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3329	    i++;
3330
3331	/* Skip the line break reading CRLF as one line break */
3332	eol = i;
3333	if (i < len) {
3334	    if (data[i] == '\r' && i + 1 < len &&
3335		data[i+1] == '\n')
3336		i += 2;
3337	    else
3338		i++;
3339	    if (keepends)
3340		eol = i;
3341	}
3342	SPLIT_APPEND(data, j, eol);
3343	j = i;
3344    }
3345    if (j < len) {
3346	SPLIT_APPEND(data, j, len);
3347    }
3348
3349    Py_DECREF(string);
3350    return list;
3351
3352 onError:
3353    Py_DECREF(list);
3354    Py_DECREF(string);
3355    return NULL;
3356}
3357
3358static
3359PyObject *split_char(PyUnicodeObject *self,
3360		     PyObject *list,
3361		     Py_UNICODE ch,
3362		     int maxcount)
3363{
3364    register int i;
3365    register int j;
3366    int len = self->length;
3367    PyObject *str;
3368
3369    for (i = j = 0; i < len; ) {
3370	if (self->str[i] == ch) {
3371	    if (maxcount-- <= 0)
3372		break;
3373	    SPLIT_APPEND(self->str, j, i);
3374	    i = j = i + 1;
3375	} else
3376	    i++;
3377    }
3378    if (j <= len) {
3379	SPLIT_APPEND(self->str, j, len);
3380    }
3381    return list;
3382
3383 onError:
3384    Py_DECREF(list);
3385    return NULL;
3386}
3387
3388static
3389PyObject *split_substring(PyUnicodeObject *self,
3390			  PyObject *list,
3391			  PyUnicodeObject *substring,
3392			  int maxcount)
3393{
3394    register int i;
3395    register int j;
3396    int len = self->length;
3397    int sublen = substring->length;
3398    PyObject *str;
3399
3400    for (i = j = 0; i <= len - sublen; ) {
3401	if (Py_UNICODE_MATCH(self, i, substring)) {
3402	    if (maxcount-- <= 0)
3403		break;
3404	    SPLIT_APPEND(self->str, j, i);
3405	    i = j = i + sublen;
3406	} else
3407	    i++;
3408    }
3409    if (j <= len) {
3410	SPLIT_APPEND(self->str, j, len);
3411    }
3412    return list;
3413
3414 onError:
3415    Py_DECREF(list);
3416    return NULL;
3417}
3418
3419#undef SPLIT_APPEND
3420
3421static
3422PyObject *split(PyUnicodeObject *self,
3423		PyUnicodeObject *substring,
3424		int maxcount)
3425{
3426    PyObject *list;
3427
3428    if (maxcount < 0)
3429        maxcount = INT_MAX;
3430
3431    list = PyList_New(0);
3432    if (!list)
3433        return NULL;
3434
3435    if (substring == NULL)
3436	return split_whitespace(self,list,maxcount);
3437
3438    else if (substring->length == 1)
3439	return split_char(self,list,substring->str[0],maxcount);
3440
3441    else if (substring->length == 0) {
3442	Py_DECREF(list);
3443	PyErr_SetString(PyExc_ValueError, "empty separator");
3444	return NULL;
3445    }
3446    else
3447	return split_substring(self,list,substring,maxcount);
3448}
3449
3450static
3451PyObject *replace(PyUnicodeObject *self,
3452		  PyUnicodeObject *str1,
3453		  PyUnicodeObject *str2,
3454		  int maxcount)
3455{
3456    PyUnicodeObject *u;
3457
3458    if (maxcount < 0)
3459	maxcount = INT_MAX;
3460
3461    if (str1->length == 1 && str2->length == 1) {
3462        int i;
3463
3464        /* replace characters */
3465        if (!findchar(self->str, self->length, str1->str[0]) &&
3466            PyUnicode_CheckExact(self)) {
3467            /* nothing to replace, return original string */
3468            Py_INCREF(self);
3469            u = self;
3470        } else {
3471	    Py_UNICODE u1 = str1->str[0];
3472	    Py_UNICODE u2 = str2->str[0];
3473
3474            u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3475                NULL,
3476                self->length
3477                );
3478            if (u != NULL) {
3479		Py_UNICODE_COPY(u->str, self->str,
3480				self->length);
3481                for (i = 0; i < u->length; i++)
3482                    if (u->str[i] == u1) {
3483                        if (--maxcount < 0)
3484                            break;
3485                        u->str[i] = u2;
3486                    }
3487        }
3488        }
3489
3490    } else {
3491        int n, i;
3492        Py_UNICODE *p;
3493
3494        /* replace strings */
3495        n = count(self, 0, self->length, str1);
3496        if (n > maxcount)
3497            n = maxcount;
3498        if (n == 0 && PyUnicode_CheckExact(self)) {
3499            /* nothing to replace, return original string */
3500            Py_INCREF(self);
3501            u = self;
3502        } else {
3503            u = _PyUnicode_New(
3504                self->length + n * (str2->length - str1->length));
3505            if (u) {
3506                i = 0;
3507                p = u->str;
3508                while (i <= self->length - str1->length)
3509                    if (Py_UNICODE_MATCH(self, i, str1)) {
3510                        /* replace string segment */
3511                        Py_UNICODE_COPY(p, str2->str, str2->length);
3512                        p += str2->length;
3513                        i += str1->length;
3514                        if (--n <= 0) {
3515                            /* copy remaining part */
3516                            Py_UNICODE_COPY(p, self->str+i, self->length-i);
3517                            break;
3518                        }
3519                    } else
3520                        *p++ = self->str[i++];
3521            }
3522        }
3523    }
3524
3525    return (PyObject *) u;
3526}
3527
3528/* --- Unicode Object Methods --------------------------------------------- */
3529
3530static char title__doc__[] =
3531"S.title() -> unicode\n\
3532\n\
3533Return a titlecased version of S, i.e. words start with title case\n\
3534characters, all remaining cased characters have lower case.";
3535
3536static PyObject*
3537unicode_title(PyUnicodeObject *self)
3538{
3539    return fixup(self, fixtitle);
3540}
3541
3542static char capitalize__doc__[] =
3543"S.capitalize() -> unicode\n\
3544\n\
3545Return a capitalized version of S, i.e. make the first character\n\
3546have upper case.";
3547
3548static PyObject*
3549unicode_capitalize(PyUnicodeObject *self)
3550{
3551    return fixup(self, fixcapitalize);
3552}
3553
3554#if 0
3555static char capwords__doc__[] =
3556"S.capwords() -> unicode\n\
3557\n\
3558Apply .capitalize() to all words in S and return the result with\n\
3559normalized whitespace (all whitespace strings are replaced by ' ').";
3560
3561static PyObject*
3562unicode_capwords(PyUnicodeObject *self)
3563{
3564    PyObject *list;
3565    PyObject *item;
3566    int i;
3567
3568    /* Split into words */
3569    list = split(self, NULL, -1);
3570    if (!list)
3571        return NULL;
3572
3573    /* Capitalize each word */
3574    for (i = 0; i < PyList_GET_SIZE(list); i++) {
3575        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3576		     fixcapitalize);
3577        if (item == NULL)
3578            goto onError;
3579        Py_DECREF(PyList_GET_ITEM(list, i));
3580        PyList_SET_ITEM(list, i, item);
3581    }
3582
3583    /* Join the words to form a new string */
3584    item = PyUnicode_Join(NULL, list);
3585
3586onError:
3587    Py_DECREF(list);
3588    return (PyObject *)item;
3589}
3590#endif
3591
3592static char center__doc__[] =
3593"S.center(width) -> unicode\n\
3594\n\
3595Return S centered in a Unicode string of length width. Padding is done\n\
3596using spaces.";
3597
3598static PyObject *
3599unicode_center(PyUnicodeObject *self, PyObject *args)
3600{
3601    int marg, left;
3602    int width;
3603
3604    if (!PyArg_ParseTuple(args, "i:center", &width))
3605        return NULL;
3606
3607    if (self->length >= width && PyUnicode_CheckExact(self)) {
3608        Py_INCREF(self);
3609        return (PyObject*) self;
3610    }
3611
3612    marg = width - self->length;
3613    left = marg / 2 + (marg & width & 1);
3614
3615    return (PyObject*) pad(self, left, marg - left, ' ');
3616}
3617
3618#if 0
3619
3620/* This code should go into some future Unicode collation support
3621   module. The basic comparison should compare ordinals on a naive
3622   basis (this is what Java does and thus JPython too). */
3623
3624/* speedy UTF-16 code point order comparison */
3625/* gleaned from: */
3626/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3627
3628static short utf16Fixup[32] =
3629{
3630    0, 0, 0, 0, 0, 0, 0, 0,
3631    0, 0, 0, 0, 0, 0, 0, 0,
3632    0, 0, 0, 0, 0, 0, 0, 0,
3633    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3634};
3635
3636static int
3637unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3638{
3639    int len1, len2;
3640
3641    Py_UNICODE *s1 = str1->str;
3642    Py_UNICODE *s2 = str2->str;
3643
3644    len1 = str1->length;
3645    len2 = str2->length;
3646
3647    while (len1 > 0 && len2 > 0) {
3648        Py_UNICODE c1, c2;
3649
3650        c1 = *s1++;
3651        c2 = *s2++;
3652
3653	if (c1 > (1<<11) * 26)
3654	    c1 += utf16Fixup[c1>>11];
3655	if (c2 > (1<<11) * 26)
3656            c2 += utf16Fixup[c2>>11];
3657        /* now c1 and c2 are in UTF-32-compatible order */
3658
3659        if (c1 != c2)
3660            return (c1 < c2) ? -1 : 1;
3661
3662        len1--; len2--;
3663    }
3664
3665    return (len1 < len2) ? -1 : (len1 != len2);
3666}
3667
3668#else
3669
3670static int
3671unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3672{
3673    register int len1, len2;
3674
3675    Py_UNICODE *s1 = str1->str;
3676    Py_UNICODE *s2 = str2->str;
3677
3678    len1 = str1->length;
3679    len2 = str2->length;
3680
3681    while (len1 > 0 && len2 > 0) {
3682        Py_UNICODE c1, c2;
3683
3684        c1 = *s1++;
3685        c2 = *s2++;
3686
3687        if (c1 != c2)
3688            return (c1 < c2) ? -1 : 1;
3689
3690        len1--; len2--;
3691    }
3692
3693    return (len1 < len2) ? -1 : (len1 != len2);
3694}
3695
3696#endif
3697
3698int PyUnicode_Compare(PyObject *left,
3699		      PyObject *right)
3700{
3701    PyUnicodeObject *u = NULL, *v = NULL;
3702    int result;
3703
3704    /* Coerce the two arguments */
3705    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3706    if (u == NULL)
3707	goto onError;
3708    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3709    if (v == NULL)
3710	goto onError;
3711
3712    /* Shortcut for empty or interned objects */
3713    if (v == u) {
3714	Py_DECREF(u);
3715	Py_DECREF(v);
3716	return 0;
3717    }
3718
3719    result = unicode_compare(u, v);
3720
3721    Py_DECREF(u);
3722    Py_DECREF(v);
3723    return result;
3724
3725onError:
3726    Py_XDECREF(u);
3727    Py_XDECREF(v);
3728    return -1;
3729}
3730
3731int PyUnicode_Contains(PyObject *container,
3732		       PyObject *element)
3733{
3734    PyUnicodeObject *u = NULL, *v = NULL;
3735    int result;
3736    register const Py_UNICODE *p, *e;
3737    register Py_UNICODE ch;
3738
3739    /* Coerce the two arguments */
3740    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3741    if (v == NULL) {
3742	PyErr_SetString(PyExc_TypeError,
3743	    "'in <string>' requires character as left operand");
3744	goto onError;
3745    }
3746    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3747    if (u == NULL) {
3748	Py_DECREF(v);
3749	goto onError;
3750    }
3751
3752    /* Check v in u */
3753    if (PyUnicode_GET_SIZE(v) != 1) {
3754	PyErr_SetString(PyExc_TypeError,
3755	    "'in <string>' requires character as left operand");
3756	goto onError;
3757    }
3758    ch = *PyUnicode_AS_UNICODE(v);
3759    p = PyUnicode_AS_UNICODE(u);
3760    e = p + PyUnicode_GET_SIZE(u);
3761    result = 0;
3762    while (p < e) {
3763	if (*p++ == ch) {
3764	    result = 1;
3765	    break;
3766	}
3767    }
3768
3769    Py_DECREF(u);
3770    Py_DECREF(v);
3771    return result;
3772
3773onError:
3774    Py_XDECREF(u);
3775    Py_XDECREF(v);
3776    return -1;
3777}
3778
3779/* Concat to string or Unicode object giving a new Unicode object. */
3780
3781PyObject *PyUnicode_Concat(PyObject *left,
3782			   PyObject *right)
3783{
3784    PyUnicodeObject *u = NULL, *v = NULL, *w;
3785
3786    /* Coerce the two arguments */
3787    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3788    if (u == NULL)
3789	goto onError;
3790    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3791    if (v == NULL)
3792	goto onError;
3793
3794    /* Shortcuts */
3795    if (v == unicode_empty) {
3796	Py_DECREF(v);
3797	return (PyObject *)u;
3798    }
3799    if (u == unicode_empty) {
3800	Py_DECREF(u);
3801	return (PyObject *)v;
3802    }
3803
3804    /* Concat the two Unicode strings */
3805    w = _PyUnicode_New(u->length + v->length);
3806    if (w == NULL)
3807	goto onError;
3808    Py_UNICODE_COPY(w->str, u->str, u->length);
3809    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3810
3811    Py_DECREF(u);
3812    Py_DECREF(v);
3813    return (PyObject *)w;
3814
3815onError:
3816    Py_XDECREF(u);
3817    Py_XDECREF(v);
3818    return NULL;
3819}
3820
3821static char count__doc__[] =
3822"S.count(sub[, start[, end]]) -> int\n\
3823\n\
3824Return the number of occurrences of substring sub in Unicode string\n\
3825S[start:end].  Optional arguments start and end are\n\
3826interpreted as in slice notation.";
3827
3828static PyObject *
3829unicode_count(PyUnicodeObject *self, PyObject *args)
3830{
3831    PyUnicodeObject *substring;
3832    int start = 0;
3833    int end = INT_MAX;
3834    PyObject *result;
3835
3836    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3837		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3838        return NULL;
3839
3840    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3841						(PyObject *)substring);
3842    if (substring == NULL)
3843	return NULL;
3844
3845    if (start < 0)
3846        start += self->length;
3847    if (start < 0)
3848        start = 0;
3849    if (end > self->length)
3850        end = self->length;
3851    if (end < 0)
3852        end += self->length;
3853    if (end < 0)
3854        end = 0;
3855
3856    result = PyInt_FromLong((long) count(self, start, end, substring));
3857
3858    Py_DECREF(substring);
3859    return result;
3860}
3861
3862static char encode__doc__[] =
3863"S.encode([encoding[,errors]]) -> string\n\
3864\n\
3865Return an encoded string version of S. Default encoding is the current\n\
3866default string encoding. errors may be given to set a different error\n\
3867handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3868a ValueError. Other possible values are 'ignore' and 'replace'.";
3869
3870static PyObject *
3871unicode_encode(PyUnicodeObject *self, PyObject *args)
3872{
3873    char *encoding = NULL;
3874    char *errors = NULL;
3875    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3876        return NULL;
3877    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3878}
3879
3880static char expandtabs__doc__[] =
3881"S.expandtabs([tabsize]) -> unicode\n\
3882\n\
3883Return a copy of S where all tab characters are expanded using spaces.\n\
3884If tabsize is not given, a tab size of 8 characters is assumed.";
3885
3886static PyObject*
3887unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3888{
3889    Py_UNICODE *e;
3890    Py_UNICODE *p;
3891    Py_UNICODE *q;
3892    int i, j;
3893    PyUnicodeObject *u;
3894    int tabsize = 8;
3895
3896    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3897	return NULL;
3898
3899    /* First pass: determine size of output string */
3900    i = j = 0;
3901    e = self->str + self->length;
3902    for (p = self->str; p < e; p++)
3903        if (*p == '\t') {
3904	    if (tabsize > 0)
3905		j += tabsize - (j % tabsize);
3906	}
3907        else {
3908            j++;
3909            if (*p == '\n' || *p == '\r') {
3910                i += j;
3911                j = 0;
3912            }
3913        }
3914
3915    /* Second pass: create output string and fill it */
3916    u = _PyUnicode_New(i + j);
3917    if (!u)
3918        return NULL;
3919
3920    j = 0;
3921    q = u->str;
3922
3923    for (p = self->str; p < e; p++)
3924        if (*p == '\t') {
3925	    if (tabsize > 0) {
3926		i = tabsize - (j % tabsize);
3927		j += i;
3928		while (i--)
3929		    *q++ = ' ';
3930	    }
3931	}
3932	else {
3933            j++;
3934	    *q++ = *p;
3935            if (*p == '\n' || *p == '\r')
3936                j = 0;
3937        }
3938
3939    return (PyObject*) u;
3940}
3941
3942static char find__doc__[] =
3943"S.find(sub [,start [,end]]) -> int\n\
3944\n\
3945Return the lowest index in S where substring sub is found,\n\
3946such that sub is contained within s[start,end].  Optional\n\
3947arguments start and end are interpreted as in slice notation.\n\
3948\n\
3949Return -1 on failure.";
3950
3951static PyObject *
3952unicode_find(PyUnicodeObject *self, PyObject *args)
3953{
3954    PyUnicodeObject *substring;
3955    int start = 0;
3956    int end = INT_MAX;
3957    PyObject *result;
3958
3959    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3960		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3961        return NULL;
3962    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3963						(PyObject *)substring);
3964    if (substring == NULL)
3965	return NULL;
3966
3967    result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3968
3969    Py_DECREF(substring);
3970    return result;
3971}
3972
3973static PyObject *
3974unicode_getitem(PyUnicodeObject *self, int index)
3975{
3976    if (index < 0 || index >= self->length) {
3977        PyErr_SetString(PyExc_IndexError, "string index out of range");
3978        return NULL;
3979    }
3980
3981    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3982}
3983
3984static long
3985unicode_hash(PyUnicodeObject *self)
3986{
3987    /* Since Unicode objects compare equal to their ASCII string
3988       counterparts, they should use the individual character values
3989       as basis for their hash value.  This is needed to assure that
3990       strings and Unicode objects behave in the same way as
3991       dictionary keys. */
3992
3993    register int len;
3994    register Py_UNICODE *p;
3995    register long x;
3996
3997    if (self->hash != -1)
3998	return self->hash;
3999    len = PyUnicode_GET_SIZE(self);
4000    p = PyUnicode_AS_UNICODE(self);
4001    x = *p << 7;
4002    while (--len >= 0)
4003	x = (1000003*x) ^ *p++;
4004    x ^= PyUnicode_GET_SIZE(self);
4005    if (x == -1)
4006	x = -2;
4007    self->hash = x;
4008    return x;
4009}
4010
4011static char index__doc__[] =
4012"S.index(sub [,start [,end]]) -> int\n\
4013\n\
4014Like S.find() but raise ValueError when the substring is not found.";
4015
4016static PyObject *
4017unicode_index(PyUnicodeObject *self, PyObject *args)
4018{
4019    int result;
4020    PyUnicodeObject *substring;
4021    int start = 0;
4022    int end = INT_MAX;
4023
4024    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4025		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4026        return NULL;
4027
4028    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4029						(PyObject *)substring);
4030    if (substring == NULL)
4031	return NULL;
4032
4033    result = findstring(self, substring, start, end, 1);
4034
4035    Py_DECREF(substring);
4036    if (result < 0) {
4037        PyErr_SetString(PyExc_ValueError, "substring not found");
4038        return NULL;
4039    }
4040    return PyInt_FromLong(result);
4041}
4042
4043static char islower__doc__[] =
4044"S.islower() -> bool\n\
4045\n\
4046Return True if all cased characters in S are lowercase and there is\n\
4047at least one cased character in S, False otherwise.";
4048
4049static PyObject*
4050unicode_islower(PyUnicodeObject *self)
4051{
4052    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4053    register const Py_UNICODE *e;
4054    int cased;
4055
4056    /* Shortcut for single character strings */
4057    if (PyUnicode_GET_SIZE(self) == 1)
4058	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
4059
4060    /* Special case for empty strings */
4061    if (PyString_GET_SIZE(self) == 0)
4062	return PyBool_FromLong(0);
4063
4064    e = p + PyUnicode_GET_SIZE(self);
4065    cased = 0;
4066    for (; p < e; p++) {
4067	register const Py_UNICODE ch = *p;
4068
4069	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4070	    return PyBool_FromLong(0);
4071	else if (!cased && Py_UNICODE_ISLOWER(ch))
4072	    cased = 1;
4073    }
4074    return PyBool_FromLong(cased);
4075}
4076
4077static char isupper__doc__[] =
4078"S.isupper() -> bool\n\
4079\n\
4080Return True if  all cased characters in S are uppercase and there is\n\
4081at least one cased character in S, False otherwise.";
4082
4083static PyObject*
4084unicode_isupper(PyUnicodeObject *self)
4085{
4086    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4087    register const Py_UNICODE *e;
4088    int cased;
4089
4090    /* Shortcut for single character strings */
4091    if (PyUnicode_GET_SIZE(self) == 1)
4092	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4093
4094    /* Special case for empty strings */
4095    if (PyString_GET_SIZE(self) == 0)
4096	return PyBool_FromLong(0);
4097
4098    e = p + PyUnicode_GET_SIZE(self);
4099    cased = 0;
4100    for (; p < e; p++) {
4101	register const Py_UNICODE ch = *p;
4102
4103	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4104	    return PyBool_FromLong(0);
4105	else if (!cased && Py_UNICODE_ISUPPER(ch))
4106	    cased = 1;
4107    }
4108    return PyBool_FromLong(cased);
4109}
4110
4111static char istitle__doc__[] =
4112"S.istitle() -> bool\n\
4113\n\
4114Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4115characters may only follow uncased characters and lowercase characters\n\
4116only cased ones. Return False otherwise.";
4117
4118static PyObject*
4119unicode_istitle(PyUnicodeObject *self)
4120{
4121    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4122    register const Py_UNICODE *e;
4123    int cased, previous_is_cased;
4124
4125    /* Shortcut for single character strings */
4126    if (PyUnicode_GET_SIZE(self) == 1)
4127	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4128			       (Py_UNICODE_ISUPPER(*p) != 0));
4129
4130    /* Special case for empty strings */
4131    if (PyString_GET_SIZE(self) == 0)
4132	return PyBool_FromLong(0);
4133
4134    e = p + PyUnicode_GET_SIZE(self);
4135    cased = 0;
4136    previous_is_cased = 0;
4137    for (; p < e; p++) {
4138	register const Py_UNICODE ch = *p;
4139
4140	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4141	    if (previous_is_cased)
4142		return PyBool_FromLong(0);
4143	    previous_is_cased = 1;
4144	    cased = 1;
4145	}
4146	else if (Py_UNICODE_ISLOWER(ch)) {
4147	    if (!previous_is_cased)
4148		return PyBool_FromLong(0);
4149	    previous_is_cased = 1;
4150	    cased = 1;
4151	}
4152	else
4153	    previous_is_cased = 0;
4154    }
4155    return PyBool_FromLong(cased);
4156}
4157
4158static char isspace__doc__[] =
4159"S.isspace() -> bool\n\
4160\n\
4161Return True if there are only whitespace characters in S,\n\
4162False otherwise.";
4163
4164static PyObject*
4165unicode_isspace(PyUnicodeObject *self)
4166{
4167    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4168    register const Py_UNICODE *e;
4169
4170    /* Shortcut for single character strings */
4171    if (PyUnicode_GET_SIZE(self) == 1 &&
4172	Py_UNICODE_ISSPACE(*p))
4173	return PyBool_FromLong(1);
4174
4175    /* Special case for empty strings */
4176    if (PyString_GET_SIZE(self) == 0)
4177	return PyBool_FromLong(0);
4178
4179    e = p + PyUnicode_GET_SIZE(self);
4180    for (; p < e; p++) {
4181	if (!Py_UNICODE_ISSPACE(*p))
4182	    return PyBool_FromLong(0);
4183    }
4184    return PyBool_FromLong(1);
4185}
4186
4187static char isalpha__doc__[] =
4188"S.isalpha() -> bool\n\
4189\n\
4190Return True if  all characters in S are alphabetic\n\
4191and there is at least one character in S, False otherwise.";
4192
4193static PyObject*
4194unicode_isalpha(PyUnicodeObject *self)
4195{
4196    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4197    register const Py_UNICODE *e;
4198
4199    /* Shortcut for single character strings */
4200    if (PyUnicode_GET_SIZE(self) == 1 &&
4201	Py_UNICODE_ISALPHA(*p))
4202	return PyBool_FromLong(1);
4203
4204    /* Special case for empty strings */
4205    if (PyString_GET_SIZE(self) == 0)
4206	return PyBool_FromLong(0);
4207
4208    e = p + PyUnicode_GET_SIZE(self);
4209    for (; p < e; p++) {
4210	if (!Py_UNICODE_ISALPHA(*p))
4211	    return PyBool_FromLong(0);
4212    }
4213    return PyBool_FromLong(1);
4214}
4215
4216static char isalnum__doc__[] =
4217"S.isalnum() -> bool\n\
4218\n\
4219Return True if  all characters in S are alphanumeric\n\
4220and there is at least one character in S, False otherwise.";
4221
4222static PyObject*
4223unicode_isalnum(PyUnicodeObject *self)
4224{
4225    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4226    register const Py_UNICODE *e;
4227
4228    /* Shortcut for single character strings */
4229    if (PyUnicode_GET_SIZE(self) == 1 &&
4230	Py_UNICODE_ISALNUM(*p))
4231	return PyBool_FromLong(1);
4232
4233    /* Special case for empty strings */
4234    if (PyString_GET_SIZE(self) == 0)
4235	return PyBool_FromLong(0);
4236
4237    e = p + PyUnicode_GET_SIZE(self);
4238    for (; p < e; p++) {
4239	if (!Py_UNICODE_ISALNUM(*p))
4240	    return PyBool_FromLong(0);
4241    }
4242    return PyBool_FromLong(1);
4243}
4244
4245static char isdecimal__doc__[] =
4246"S.isdecimal() -> bool\n\
4247\n\
4248Return True if there are only decimal characters in S,\n\
4249False otherwise.";
4250
4251static PyObject*
4252unicode_isdecimal(PyUnicodeObject *self)
4253{
4254    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4255    register const Py_UNICODE *e;
4256
4257    /* Shortcut for single character strings */
4258    if (PyUnicode_GET_SIZE(self) == 1 &&
4259	Py_UNICODE_ISDECIMAL(*p))
4260	return PyBool_FromLong(1);
4261
4262    /* Special case for empty strings */
4263    if (PyString_GET_SIZE(self) == 0)
4264	return PyBool_FromLong(0);
4265
4266    e = p + PyUnicode_GET_SIZE(self);
4267    for (; p < e; p++) {
4268	if (!Py_UNICODE_ISDECIMAL(*p))
4269	    return PyBool_FromLong(0);
4270    }
4271    return PyBool_FromLong(1);
4272}
4273
4274static char isdigit__doc__[] =
4275"S.isdigit() -> bool\n\
4276\n\
4277Return True if there are only digit characters in S,\n\
4278False otherwise.";
4279
4280static PyObject*
4281unicode_isdigit(PyUnicodeObject *self)
4282{
4283    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4284    register const Py_UNICODE *e;
4285
4286    /* Shortcut for single character strings */
4287    if (PyUnicode_GET_SIZE(self) == 1 &&
4288	Py_UNICODE_ISDIGIT(*p))
4289	return PyBool_FromLong(1);
4290
4291    /* Special case for empty strings */
4292    if (PyString_GET_SIZE(self) == 0)
4293	return PyBool_FromLong(0);
4294
4295    e = p + PyUnicode_GET_SIZE(self);
4296    for (; p < e; p++) {
4297	if (!Py_UNICODE_ISDIGIT(*p))
4298	    return PyBool_FromLong(0);
4299    }
4300    return PyBool_FromLong(1);
4301}
4302
4303static char isnumeric__doc__[] =
4304"S.isnumeric() -> bool\n\
4305\n\
4306Return True if there are only numeric characters in S,\n\
4307False otherwise.";
4308
4309static PyObject*
4310unicode_isnumeric(PyUnicodeObject *self)
4311{
4312    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4313    register const Py_UNICODE *e;
4314
4315    /* Shortcut for single character strings */
4316    if (PyUnicode_GET_SIZE(self) == 1 &&
4317	Py_UNICODE_ISNUMERIC(*p))
4318	return PyBool_FromLong(1);
4319
4320    /* Special case for empty strings */
4321    if (PyString_GET_SIZE(self) == 0)
4322	return PyBool_FromLong(0);
4323
4324    e = p + PyUnicode_GET_SIZE(self);
4325    for (; p < e; p++) {
4326	if (!Py_UNICODE_ISNUMERIC(*p))
4327	    return PyBool_FromLong(0);
4328    }
4329    return PyBool_FromLong(1);
4330}
4331
4332static char join__doc__[] =
4333"S.join(sequence) -> unicode\n\
4334\n\
4335Return a string which is the concatenation of the strings in the\n\
4336sequence.  The separator between elements is S.";
4337
4338static PyObject*
4339unicode_join(PyObject *self, PyObject *data)
4340{
4341    return PyUnicode_Join(self, data);
4342}
4343
4344static int
4345unicode_length(PyUnicodeObject *self)
4346{
4347    return self->length;
4348}
4349
4350static char ljust__doc__[] =
4351"S.ljust(width) -> unicode\n\
4352\n\
4353Return S left justified in a Unicode string of length width. Padding is\n\
4354done using spaces.";
4355
4356static PyObject *
4357unicode_ljust(PyUnicodeObject *self, PyObject *args)
4358{
4359    int width;
4360    if (!PyArg_ParseTuple(args, "i:ljust", &width))
4361        return NULL;
4362
4363    if (self->length >= width && PyUnicode_CheckExact(self)) {
4364        Py_INCREF(self);
4365        return (PyObject*) self;
4366    }
4367
4368    return (PyObject*) pad(self, 0, width - self->length, ' ');
4369}
4370
4371static char lower__doc__[] =
4372"S.lower() -> unicode\n\
4373\n\
4374Return a copy of the string S converted to lowercase.";
4375
4376static PyObject*
4377unicode_lower(PyUnicodeObject *self)
4378{
4379    return fixup(self, fixlower);
4380}
4381
4382#define LEFTSTRIP 0
4383#define RIGHTSTRIP 1
4384#define BOTHSTRIP 2
4385
4386/* Arrays indexed by above */
4387static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
4388
4389#define STRIPNAME(i) (stripformat[i]+3)
4390
4391static const Py_UNICODE *
4392unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
4393{
4394	size_t i;
4395	for (i = 0; i < n; ++i)
4396		if (s[i] == c)
4397			return s+i;
4398	return NULL;
4399}
4400
4401/* externally visible for str.strip(unicode) */
4402PyObject *
4403_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
4404{
4405	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4406	int len = PyUnicode_GET_SIZE(self);
4407	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
4408	int seplen = PyUnicode_GET_SIZE(sepobj);
4409	int i, j;
4410
4411	i = 0;
4412	if (striptype != RIGHTSTRIP) {
4413		while (i < len && unicode_memchr(sep, s[i], seplen)) {
4414			i++;
4415		}
4416	}
4417
4418	j = len;
4419	if (striptype != LEFTSTRIP) {
4420		do {
4421			j--;
4422		} while (j >= i && unicode_memchr(sep, s[j], seplen));
4423		j++;
4424	}
4425
4426	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4427		Py_INCREF(self);
4428		return (PyObject*)self;
4429	}
4430	else
4431		return PyUnicode_FromUnicode(s+i, j-i);
4432}
4433
4434
4435static PyObject *
4436do_strip(PyUnicodeObject *self, int striptype)
4437{
4438	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4439	int len = PyUnicode_GET_SIZE(self), i, j;
4440
4441	i = 0;
4442	if (striptype != RIGHTSTRIP) {
4443		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
4444			i++;
4445		}
4446	}
4447
4448	j = len;
4449	if (striptype != LEFTSTRIP) {
4450		do {
4451			j--;
4452		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
4453		j++;
4454	}
4455
4456	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4457		Py_INCREF(self);
4458		return (PyObject*)self;
4459	}
4460	else
4461		return PyUnicode_FromUnicode(s+i, j-i);
4462}
4463
4464
4465static PyObject *
4466do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
4467{
4468	PyObject *sep = NULL;
4469
4470	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
4471		return NULL;
4472
4473	if (sep != NULL && sep != Py_None) {
4474		if (PyUnicode_Check(sep))
4475			return _PyUnicode_XStrip(self, striptype, sep);
4476		else if (PyString_Check(sep)) {
4477			PyObject *res;
4478			sep = PyUnicode_FromObject(sep);
4479			if (sep==NULL)
4480				return NULL;
4481			res = _PyUnicode_XStrip(self, striptype, sep);
4482			Py_DECREF(sep);
4483			return res;
4484		}
4485		else {
4486			PyErr_Format(PyExc_TypeError,
4487				     "%s arg must be None, unicode or str",
4488				     STRIPNAME(striptype));
4489			return NULL;
4490		}
4491	}
4492
4493	return do_strip(self, striptype);
4494}
4495
4496
4497static char strip__doc__[] =
4498"S.strip([sep]) -> unicode\n\
4499\n\
4500Return a copy of the string S with leading and trailing\n\
4501whitespace removed.\n\
4502If sep is given and not None, remove characters in sep instead.\n\
4503If sep is a str, it will be converted to unicode before stripping";
4504
4505static PyObject *
4506unicode_strip(PyUnicodeObject *self, PyObject *args)
4507{
4508	if (PyTuple_GET_SIZE(args) == 0)
4509		return do_strip(self, BOTHSTRIP); /* Common case */
4510	else
4511		return do_argstrip(self, BOTHSTRIP, args);
4512}
4513
4514
4515static char lstrip__doc__[] =
4516"S.lstrip([sep]) -> unicode\n\
4517\n\
4518Return a copy of the string S with leading whitespace removed.\n\
4519If sep is given and not None, remove characters in sep instead.\n\
4520If sep is a str, it will be converted to unicode before stripping";
4521
4522static PyObject *
4523unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4524{
4525	if (PyTuple_GET_SIZE(args) == 0)
4526		return do_strip(self, LEFTSTRIP); /* Common case */
4527	else
4528		return do_argstrip(self, LEFTSTRIP, args);
4529}
4530
4531
4532static char rstrip__doc__[] =
4533"S.rstrip([sep]) -> unicode\n\
4534\n\
4535Return a copy of the string S with trailing whitespace removed.\n\
4536If sep is given and not None, remove characters in sep instead.\n\
4537If sep is a str, it will be converted to unicode before stripping";
4538
4539static PyObject *
4540unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4541{
4542	if (PyTuple_GET_SIZE(args) == 0)
4543		return do_strip(self, RIGHTSTRIP); /* Common case */
4544	else
4545		return do_argstrip(self, RIGHTSTRIP, args);
4546}
4547
4548
4549static PyObject*
4550unicode_repeat(PyUnicodeObject *str, int len)
4551{
4552    PyUnicodeObject *u;
4553    Py_UNICODE *p;
4554    int nchars;
4555    size_t nbytes;
4556
4557    if (len < 0)
4558        len = 0;
4559
4560    if (len == 1 && PyUnicode_CheckExact(str)) {
4561        /* no repeat, return original string */
4562        Py_INCREF(str);
4563        return (PyObject*) str;
4564    }
4565
4566    /* ensure # of chars needed doesn't overflow int and # of bytes
4567     * needed doesn't overflow size_t
4568     */
4569    nchars = len * str->length;
4570    if (len && nchars / len != str->length) {
4571        PyErr_SetString(PyExc_OverflowError,
4572                        "repeated string is too long");
4573        return NULL;
4574    }
4575    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4576    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4577        PyErr_SetString(PyExc_OverflowError,
4578                        "repeated string is too long");
4579        return NULL;
4580    }
4581    u = _PyUnicode_New(nchars);
4582    if (!u)
4583        return NULL;
4584
4585    p = u->str;
4586
4587    while (len-- > 0) {
4588        Py_UNICODE_COPY(p, str->str, str->length);
4589        p += str->length;
4590    }
4591
4592    return (PyObject*) u;
4593}
4594
4595PyObject *PyUnicode_Replace(PyObject *obj,
4596			    PyObject *subobj,
4597			    PyObject *replobj,
4598			    int maxcount)
4599{
4600    PyObject *self;
4601    PyObject *str1;
4602    PyObject *str2;
4603    PyObject *result;
4604
4605    self = PyUnicode_FromObject(obj);
4606    if (self == NULL)
4607	return NULL;
4608    str1 = PyUnicode_FromObject(subobj);
4609    if (str1 == NULL) {
4610	Py_DECREF(self);
4611	return NULL;
4612    }
4613    str2 = PyUnicode_FromObject(replobj);
4614    if (str2 == NULL) {
4615	Py_DECREF(self);
4616	Py_DECREF(str1);
4617	return NULL;
4618    }
4619    result = replace((PyUnicodeObject *)self,
4620		     (PyUnicodeObject *)str1,
4621		     (PyUnicodeObject *)str2,
4622		     maxcount);
4623    Py_DECREF(self);
4624    Py_DECREF(str1);
4625    Py_DECREF(str2);
4626    return result;
4627}
4628
4629static char replace__doc__[] =
4630"S.replace (old, new[, maxsplit]) -> unicode\n\
4631\n\
4632Return a copy of S with all occurrences of substring\n\
4633old replaced by new.  If the optional argument maxsplit is\n\
4634given, only the first maxsplit occurrences are replaced.";
4635
4636static PyObject*
4637unicode_replace(PyUnicodeObject *self, PyObject *args)
4638{
4639    PyUnicodeObject *str1;
4640    PyUnicodeObject *str2;
4641    int maxcount = -1;
4642    PyObject *result;
4643
4644    if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4645        return NULL;
4646    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4647    if (str1 == NULL)
4648	return NULL;
4649    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4650    if (str2 == NULL)
4651	return NULL;
4652
4653    result = replace(self, str1, str2, maxcount);
4654
4655    Py_DECREF(str1);
4656    Py_DECREF(str2);
4657    return result;
4658}
4659
4660static
4661PyObject *unicode_repr(PyObject *unicode)
4662{
4663    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4664				PyUnicode_GET_SIZE(unicode),
4665				1);
4666}
4667
4668static char rfind__doc__[] =
4669"S.rfind(sub [,start [,end]]) -> int\n\
4670\n\
4671Return the highest index in S where substring sub is found,\n\
4672such that sub is contained within s[start,end].  Optional\n\
4673arguments start and end are interpreted as in slice notation.\n\
4674\n\
4675Return -1 on failure.";
4676
4677static PyObject *
4678unicode_rfind(PyUnicodeObject *self, PyObject *args)
4679{
4680    PyUnicodeObject *substring;
4681    int start = 0;
4682    int end = INT_MAX;
4683    PyObject *result;
4684
4685    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4686		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4687        return NULL;
4688    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4689						(PyObject *)substring);
4690    if (substring == NULL)
4691	return NULL;
4692
4693    result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4694
4695    Py_DECREF(substring);
4696    return result;
4697}
4698
4699static char rindex__doc__[] =
4700"S.rindex(sub [,start [,end]]) -> int\n\
4701\n\
4702Like S.rfind() but raise ValueError when the substring is not found.";
4703
4704static PyObject *
4705unicode_rindex(PyUnicodeObject *self, PyObject *args)
4706{
4707    int result;
4708    PyUnicodeObject *substring;
4709    int start = 0;
4710    int end = INT_MAX;
4711
4712    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4713		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4714        return NULL;
4715    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4716						(PyObject *)substring);
4717    if (substring == NULL)
4718	return NULL;
4719
4720    result = findstring(self, substring, start, end, -1);
4721
4722    Py_DECREF(substring);
4723    if (result < 0) {
4724        PyErr_SetString(PyExc_ValueError, "substring not found");
4725        return NULL;
4726    }
4727    return PyInt_FromLong(result);
4728}
4729
4730static char rjust__doc__[] =
4731"S.rjust(width) -> unicode\n\
4732\n\
4733Return S right justified in a Unicode string of length width. Padding is\n\
4734done using spaces.";
4735
4736static PyObject *
4737unicode_rjust(PyUnicodeObject *self, PyObject *args)
4738{
4739    int width;
4740    if (!PyArg_ParseTuple(args, "i:rjust", &width))
4741        return NULL;
4742
4743    if (self->length >= width && PyUnicode_CheckExact(self)) {
4744        Py_INCREF(self);
4745        return (PyObject*) self;
4746    }
4747
4748    return (PyObject*) pad(self, width - self->length, 0, ' ');
4749}
4750
4751static PyObject*
4752unicode_slice(PyUnicodeObject *self, int start, int end)
4753{
4754    /* standard clamping */
4755    if (start < 0)
4756        start = 0;
4757    if (end < 0)
4758        end = 0;
4759    if (end > self->length)
4760        end = self->length;
4761    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
4762        /* full slice, return original string */
4763        Py_INCREF(self);
4764        return (PyObject*) self;
4765    }
4766    if (start > end)
4767        start = end;
4768    /* copy slice */
4769    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4770					     end - start);
4771}
4772
4773PyObject *PyUnicode_Split(PyObject *s,
4774			  PyObject *sep,
4775			  int maxsplit)
4776{
4777    PyObject *result;
4778
4779    s = PyUnicode_FromObject(s);
4780    if (s == NULL)
4781	return NULL;
4782    if (sep != NULL) {
4783	sep = PyUnicode_FromObject(sep);
4784	if (sep == NULL) {
4785	    Py_DECREF(s);
4786	    return NULL;
4787	}
4788    }
4789
4790    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4791
4792    Py_DECREF(s);
4793    Py_XDECREF(sep);
4794    return result;
4795}
4796
4797static char split__doc__[] =
4798"S.split([sep [,maxsplit]]) -> list of strings\n\
4799\n\
4800Return a list of the words in S, using sep as the\n\
4801delimiter string.  If maxsplit is given, at most maxsplit\n\
4802splits are done. If sep is not specified, any whitespace string\n\
4803is a separator.";
4804
4805static PyObject*
4806unicode_split(PyUnicodeObject *self, PyObject *args)
4807{
4808    PyObject *substring = Py_None;
4809    int maxcount = -1;
4810
4811    if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4812        return NULL;
4813
4814    if (substring == Py_None)
4815	return split(self, NULL, maxcount);
4816    else if (PyUnicode_Check(substring))
4817	return split(self, (PyUnicodeObject *)substring, maxcount);
4818    else
4819	return PyUnicode_Split((PyObject *)self, substring, maxcount);
4820}
4821
4822static char splitlines__doc__[] =
4823"S.splitlines([keepends]]) -> list of strings\n\
4824\n\
4825Return a list of the lines in S, breaking at line boundaries.\n\
4826Line breaks are not included in the resulting list unless keepends\n\
4827is given and true.";
4828
4829static PyObject*
4830unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4831{
4832    int keepends = 0;
4833
4834    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4835        return NULL;
4836
4837    return PyUnicode_Splitlines((PyObject *)self, keepends);
4838}
4839
4840static
4841PyObject *unicode_str(PyUnicodeObject *self)
4842{
4843    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4844}
4845
4846static char swapcase__doc__[] =
4847"S.swapcase() -> unicode\n\
4848\n\
4849Return a copy of S with uppercase characters converted to lowercase\n\
4850and vice versa.";
4851
4852static PyObject*
4853unicode_swapcase(PyUnicodeObject *self)
4854{
4855    return fixup(self, fixswapcase);
4856}
4857
4858static char translate__doc__[] =
4859"S.translate(table) -> unicode\n\
4860\n\
4861Return a copy of the string S, where all characters have been mapped\n\
4862through the given translation table, which must be a mapping of\n\
4863Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4864are left untouched. Characters mapped to None are deleted.";
4865
4866static PyObject*
4867unicode_translate(PyUnicodeObject *self, PyObject *table)
4868{
4869    return PyUnicode_TranslateCharmap(self->str,
4870				      self->length,
4871				      table,
4872				      "ignore");
4873}
4874
4875static char upper__doc__[] =
4876"S.upper() -> unicode\n\
4877\n\
4878Return a copy of S converted to uppercase.";
4879
4880static PyObject*
4881unicode_upper(PyUnicodeObject *self)
4882{
4883    return fixup(self, fixupper);
4884}
4885
4886static char zfill__doc__[] =
4887"S.zfill(width) -> unicode\n\
4888\n\
4889Pad a numeric string x with zeros on the left, to fill a field\n\
4890of the specified width. The string x is never truncated.";
4891
4892static PyObject *
4893unicode_zfill(PyUnicodeObject *self, PyObject *args)
4894{
4895    int fill;
4896    PyUnicodeObject *u;
4897
4898    int width;
4899    if (!PyArg_ParseTuple(args, "i:zfill", &width))
4900        return NULL;
4901
4902    if (self->length >= width) {
4903        if (PyUnicode_CheckExact(self)) {
4904            Py_INCREF(self);
4905            return (PyObject*) self;
4906        }
4907        else
4908            return PyUnicode_FromUnicode(
4909                PyUnicode_AS_UNICODE(self),
4910                PyUnicode_GET_SIZE(self)
4911            );
4912    }
4913
4914    fill = width - self->length;
4915
4916    u = pad(self, fill, 0, '0');
4917
4918    if (u == NULL)
4919        return NULL;
4920
4921    if (u->str[fill] == '+' || u->str[fill] == '-') {
4922        /* move sign to beginning of string */
4923        u->str[0] = u->str[fill];
4924        u->str[fill] = '0';
4925    }
4926
4927    return (PyObject*) u;
4928}
4929
#if 0
/* Debugging aid, currently compiled out: return the value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
4937
4938static char startswith__doc__[] =
4939"S.startswith(prefix[, start[, end]]) -> bool\n\
4940\n\
4941Return True if S starts with the specified prefix, False otherwise.  With\n\
4942optional start, test S beginning at that position.  With optional end, stop\n\
4943comparing S at that position.";
4944
4945static PyObject *
4946unicode_startswith(PyUnicodeObject *self,
4947		   PyObject *args)
4948{
4949    PyUnicodeObject *substring;
4950    int start = 0;
4951    int end = INT_MAX;
4952    PyObject *result;
4953
4954    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4955		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4956	return NULL;
4957    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4958						(PyObject *)substring);
4959    if (substring == NULL)
4960	return NULL;
4961
4962    result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
4963
4964    Py_DECREF(substring);
4965    return result;
4966}
4967
4968
4969static char endswith__doc__[] =
4970"S.endswith(suffix[, start[, end]]) -> bool\n\
4971\n\
4972Return True if S ends with the specified suffix, False otherwise.  With\n\
4973optional start, test S beginning at that position.  With optional end, stop\n\
4974comparing S at that position.";
4975
4976static PyObject *
4977unicode_endswith(PyUnicodeObject *self,
4978		 PyObject *args)
4979{
4980    PyUnicodeObject *substring;
4981    int start = 0;
4982    int end = INT_MAX;
4983    PyObject *result;
4984
4985    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4986		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4987	return NULL;
4988    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4989						(PyObject *)substring);
4990    if (substring == NULL)
4991	return NULL;
4992
4993    result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
4994
4995    Py_DECREF(substring);
4996    return result;
4997}
4998
4999
/* Method table for the unicode type.  Each entry lists the Python-level
   method name, the C function implementing it, its calling-convention
   flag (METH_VARARGS, METH_O or METH_NOARGS) and its docstring.  The
   table ends with a {NULL, NULL} sentinel entry. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* Sentinel marking the end of the table. */
    {NULL, NULL}
};
5052
/* Sequence protocol slots for the unicode type.  The two assignment
   slots (sq_ass_item, sq_ass_slice) are left as 0, so no item or slice
   assignment handler is provided here. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, 		/* sq_length */
    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
    (intargfunc) unicode_repeat, 	/* sq_repeat */
    (intargfunc) unicode_getitem, 	/* sq_item */
    (intintargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
};
5063
5064static int
5065unicode_buffer_getreadbuf(PyUnicodeObject *self,
5066			  int index,
5067			  const void **ptr)
5068{
5069    if (index != 0) {
5070        PyErr_SetString(PyExc_SystemError,
5071			"accessing non-existent unicode segment");
5072        return -1;
5073    }
5074    *ptr = (void *) self->str;
5075    return PyUnicode_GET_DATA_SIZE(self);
5076}
5077
/* Write-buffer request handler: always refuses.  Sets TypeError and
   returns -1 whatever `index` is; *ptr is never written. */
static int
unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
			   const void **ptr)
{
    PyErr_SetString(PyExc_TypeError,
		    "cannot use unicode as modifyable buffer");
    return -1;
}
5086
5087static int
5088unicode_buffer_getsegcount(PyUnicodeObject *self,
5089			   int *lenp)
5090{
5091    if (lenp)
5092        *lenp = PyUnicode_GET_DATA_SIZE(self);
5093    return 1;
5094}
5095
5096static int
5097unicode_buffer_getcharbuf(PyUnicodeObject *self,
5098			  int index,
5099			  const void **ptr)
5100{
5101    PyObject *str;
5102
5103    if (index != 0) {
5104        PyErr_SetString(PyExc_SystemError,
5105			"accessing non-existent unicode segment");
5106        return -1;
5107    }
5108    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5109    if (str == NULL)
5110	return -1;
5111    *ptr = (void *) PyString_AS_STRING(str);
5112    return PyString_GET_SIZE(str);
5113}
5114
5115/* Helpers for PyUnicode_Format() */
5116
5117static PyObject *
5118getnextarg(PyObject *args, int arglen, int *p_argidx)
5119{
5120    int argidx = *p_argidx;
5121    if (argidx < arglen) {
5122	(*p_argidx)++;
5123	if (arglen < 0)
5124	    return args;
5125	else
5126	    return PyTuple_GetItem(args, argidx);
5127    }
5128    PyErr_SetString(PyExc_TypeError,
5129		    "not enough arguments for format string");
5130    return NULL;
5131}
5132
/* Bit flags collected while parsing a format specifier; each one is set
   by the flag character noted next to it. */
#define F_LJUST (1<<0)	/* '-' (also set for a negative '*' width) */
#define F_SIGN	(1<<1)	/* '+' */
#define F_BLANK (1<<2)	/* ' ' */
#define F_ALT	(1<<3)	/* '#' */
#define F_ZERO	(1<<4)	/* '0' */
5138
5139static
5140int usprintf(register Py_UNICODE *buffer, char *format, ...)
5141{
5142    register int i;
5143    int len;
5144    va_list va;
5145    char *charbuffer;
5146    va_start(va, format);
5147
5148    /* First, format the string as char array, then expand to Py_UNICODE
5149       array. */
5150    charbuffer = (char *)buffer;
5151    len = vsprintf(charbuffer, format, va);
5152    for (i = len - 1; i >= 0; i--)
5153	buffer[i] = (Py_UNICODE) charbuffer[i];
5154
5155    va_end(va);
5156    return len;
5157}
5158
5159static int
5160formatfloat(Py_UNICODE *buf,
5161	    size_t buflen,
5162	    int flags,
5163	    int prec,
5164	    int type,
5165	    PyObject *v)
5166{
5167    /* fmt = '%#.' + `prec` + `type`
5168       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5169    char fmt[20];
5170    double x;
5171
5172    x = PyFloat_AsDouble(v);
5173    if (x == -1.0 && PyErr_Occurred())
5174	return -1;
5175    if (prec < 0)
5176	prec = 6;
5177    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5178	type = 'g';
5179    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5180		  (flags & F_ALT) ? "#" : "", prec, type);
5181    /* worst case length calc to ensure no buffer overrun:
5182         fmt = %#.<prec>g
5183         buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5184            for any double rep.)
5185         len = 1 + prec + 1 + 2 + 5 = 9 + prec
5186       If prec=0 the effective precision is 1 (the leading digit is
5187       always given), therefore increase by one to 10+prec. */
5188    if (buflen <= (size_t)10 + (size_t)prec) {
5189	PyErr_SetString(PyExc_OverflowError,
5190	    "formatted float is too long (precision too long?)");
5191	return -1;
5192    }
5193    return usprintf(buf, fmt, x);
5194}
5195
5196static PyObject*
5197formatlong(PyObject *val, int flags, int prec, int type)
5198{
5199	char *buf;
5200	int i, len;
5201	PyObject *str; /* temporary string object. */
5202	PyUnicodeObject *result;
5203
5204	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5205	if (!str)
5206		return NULL;
5207	result = _PyUnicode_New(len);
5208	for (i = 0; i < len; i++)
5209		result->str[i] = buf[i];
5210	result->str[len] = 0;
5211	Py_DECREF(str);
5212	return (PyObject*)result;
5213}
5214
5215static int
5216formatint(Py_UNICODE *buf,
5217	  size_t buflen,
5218	  int flags,
5219	  int prec,
5220	  int type,
5221	  PyObject *v)
5222{
5223    /* fmt = '%#.' + `prec` + 'l' + `type`
5224     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5225     *                     + 1 + 1
5226     *                   = 24
5227     */
5228    char fmt[64]; /* plenty big enough! */
5229    long x;
5230
5231    x = PyInt_AsLong(v);
5232    if (x == -1 && PyErr_Occurred())
5233        return -1;
5234    if (prec < 0)
5235        prec = 1;
5236
5237    /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5238     * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5239     */
5240    if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
5241        PyErr_SetString(PyExc_OverflowError,
5242    	        "formatted integer is too long (precision too large?)");
5243        return -1;
5244    }
5245
5246    if ((flags & F_ALT) &&
5247        (type == 'x' || type == 'X')) {
5248        /* When converting under %#x or %#X, there are a number
5249         * of issues that cause pain:
5250         * - when 0 is being converted, the C standard leaves off
5251         *   the '0x' or '0X', which is inconsistent with other
5252         *   %#x/%#X conversions and inconsistent with Python's
5253         *   hex() function
5254         * - there are platforms that violate the standard and
5255         *   convert 0 with the '0x' or '0X'
5256         *   (Metrowerks, Compaq Tru64)
5257         * - there are platforms that give '0x' when converting
5258         *   under %#X, but convert 0 in accordance with the
5259         *   standard (OS/2 EMX)
5260         *
5261         * We can achieve the desired consistency by inserting our
5262         * own '0x' or '0X' prefix, and substituting %x/%X in place
5263         * of %#x/%#X.
5264         *
5265         * Note that this is the same approach as used in
5266         * formatint() in stringobject.c
5267         */
5268        PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5269                      type, prec, type);
5270    }
5271    else {
5272        PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5273                      (flags&F_ALT) ? "#" : "",
5274                      prec, type);
5275    }
5276    return usprintf(buf, fmt, x);
5277}
5278
5279static int
5280formatchar(Py_UNICODE *buf,
5281           size_t buflen,
5282           PyObject *v)
5283{
5284    /* presume that the buffer is at least 2 characters long */
5285    if (PyUnicode_Check(v)) {
5286	if (PyUnicode_GET_SIZE(v) != 1)
5287	    goto onError;
5288	buf[0] = PyUnicode_AS_UNICODE(v)[0];
5289    }
5290
5291    else if (PyString_Check(v)) {
5292	if (PyString_GET_SIZE(v) != 1)
5293	    goto onError;
5294	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5295    }
5296
5297    else {
5298	/* Integer input truncated to a character */
5299        long x;
5300	x = PyInt_AsLong(v);
5301	if (x == -1 && PyErr_Occurred())
5302	    goto onError;
5303	buf[0] = (char) x;
5304    }
5305    buf[1] = '\0';
5306    return 1;
5307
5308 onError:
5309    PyErr_SetString(PyExc_TypeError,
5310		    "%c requires int or char");
5311    return -1;
5312}
5313
5314/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
5315
5316   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
5317   chars are formatted. XXX This is a magic number. Each formatting
5318   routine does bounds checking to ensure no overflow, but a better
5319   solution may be to malloc a buffer of appropriate size for each
5320   format. For now, the current solution is sufficient.
5321*/
5322#define FORMATBUFLEN (size_t)120
5323
5324PyObject *PyUnicode_Format(PyObject *format,
5325			   PyObject *args)
5326{
5327    Py_UNICODE *fmt, *res;
5328    int fmtcnt, rescnt, reslen, arglen, argidx;
5329    int args_owned = 0;
5330    PyUnicodeObject *result = NULL;
5331    PyObject *dict = NULL;
5332    PyObject *uformat;
5333
5334    if (format == NULL || args == NULL) {
5335	PyErr_BadInternalCall();
5336	return NULL;
5337    }
5338    uformat = PyUnicode_FromObject(format);
5339    if (uformat == NULL)
5340	return NULL;
5341    fmt = PyUnicode_AS_UNICODE(uformat);
5342    fmtcnt = PyUnicode_GET_SIZE(uformat);
5343
5344    reslen = rescnt = fmtcnt + 100;
5345    result = _PyUnicode_New(reslen);
5346    if (result == NULL)
5347	goto onError;
5348    res = PyUnicode_AS_UNICODE(result);
5349
5350    if (PyTuple_Check(args)) {
5351	arglen = PyTuple_Size(args);
5352	argidx = 0;
5353    }
5354    else {
5355	arglen = -1;
5356	argidx = -2;
5357    }
5358    if (args->ob_type->tp_as_mapping)
5359	dict = args;
5360
5361    while (--fmtcnt >= 0) {
5362	if (*fmt != '%') {
5363	    if (--rescnt < 0) {
5364		rescnt = fmtcnt + 100;
5365		reslen += rescnt;
5366		if (_PyUnicode_Resize(&result, reslen) < 0)
5367		    return NULL;
5368		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5369		--rescnt;
5370	    }
5371	    *res++ = *fmt++;
5372	}
5373	else {
5374	    /* Got a format specifier */
5375	    int flags = 0;
5376	    int width = -1;
5377	    int prec = -1;
5378	    Py_UNICODE c = '\0';
5379	    Py_UNICODE fill;
5380	    PyObject *v = NULL;
5381	    PyObject *temp = NULL;
5382	    Py_UNICODE *pbuf;
5383	    Py_UNICODE sign;
5384	    int len;
5385	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
5386
5387	    fmt++;
5388	    if (*fmt == '(') {
5389		Py_UNICODE *keystart;
5390		int keylen;
5391		PyObject *key;
5392		int pcount = 1;
5393
5394		if (dict == NULL) {
5395		    PyErr_SetString(PyExc_TypeError,
5396				    "format requires a mapping");
5397		    goto onError;
5398		}
5399		++fmt;
5400		--fmtcnt;
5401		keystart = fmt;
5402		/* Skip over balanced parentheses */
5403		while (pcount > 0 && --fmtcnt >= 0) {
5404		    if (*fmt == ')')
5405			--pcount;
5406		    else if (*fmt == '(')
5407			++pcount;
5408		    fmt++;
5409		}
5410		keylen = fmt - keystart - 1;
5411		if (fmtcnt < 0 || pcount > 0) {
5412		    PyErr_SetString(PyExc_ValueError,
5413				    "incomplete format key");
5414		    goto onError;
5415		}
5416#if 0
5417		/* keys are converted to strings using UTF-8 and
5418		   then looked up since Python uses strings to hold
5419		   variables names etc. in its namespaces and we
5420		   wouldn't want to break common idioms. */
5421		key = PyUnicode_EncodeUTF8(keystart,
5422					   keylen,
5423					   NULL);
5424#else
5425		key = PyUnicode_FromUnicode(keystart, keylen);
5426#endif
5427		if (key == NULL)
5428		    goto onError;
5429		if (args_owned) {
5430		    Py_DECREF(args);
5431		    args_owned = 0;
5432		}
5433		args = PyObject_GetItem(dict, key);
5434		Py_DECREF(key);
5435		if (args == NULL) {
5436		    goto onError;
5437		}
5438		args_owned = 1;
5439		arglen = -1;
5440		argidx = -2;
5441	    }
5442	    while (--fmtcnt >= 0) {
5443		switch (c = *fmt++) {
5444		case '-': flags |= F_LJUST; continue;
5445		case '+': flags |= F_SIGN; continue;
5446		case ' ': flags |= F_BLANK; continue;
5447		case '#': flags |= F_ALT; continue;
5448		case '0': flags |= F_ZERO; continue;
5449		}
5450		break;
5451	    }
5452	    if (c == '*') {
5453		v = getnextarg(args, arglen, &argidx);
5454		if (v == NULL)
5455		    goto onError;
5456		if (!PyInt_Check(v)) {
5457		    PyErr_SetString(PyExc_TypeError,
5458				    "* wants int");
5459		    goto onError;
5460		}
5461		width = PyInt_AsLong(v);
5462		if (width < 0) {
5463		    flags |= F_LJUST;
5464		    width = -width;
5465		}
5466		if (--fmtcnt >= 0)
5467		    c = *fmt++;
5468	    }
5469	    else if (c >= '0' && c <= '9') {
5470		width = c - '0';
5471		while (--fmtcnt >= 0) {
5472		    c = *fmt++;
5473		    if (c < '0' || c > '9')
5474			break;
5475		    if ((width*10) / 10 != width) {
5476			PyErr_SetString(PyExc_ValueError,
5477					"width too big");
5478			goto onError;
5479		    }
5480		    width = width*10 + (c - '0');
5481		}
5482	    }
5483	    if (c == '.') {
5484		prec = 0;
5485		if (--fmtcnt >= 0)
5486		    c = *fmt++;
5487		if (c == '*') {
5488		    v = getnextarg(args, arglen, &argidx);
5489		    if (v == NULL)
5490			goto onError;
5491		    if (!PyInt_Check(v)) {
5492			PyErr_SetString(PyExc_TypeError,
5493					"* wants int");
5494			goto onError;
5495		    }
5496		    prec = PyInt_AsLong(v);
5497		    if (prec < 0)
5498			prec = 0;
5499		    if (--fmtcnt >= 0)
5500			c = *fmt++;
5501		}
5502		else if (c >= '0' && c <= '9') {
5503		    prec = c - '0';
5504		    while (--fmtcnt >= 0) {
5505			c = Py_CHARMASK(*fmt++);
5506			if (c < '0' || c > '9')
5507			    break;
5508			if ((prec*10) / 10 != prec) {
5509			    PyErr_SetString(PyExc_ValueError,
5510					    "prec too big");
5511			    goto onError;
5512			}
5513			prec = prec*10 + (c - '0');
5514		    }
5515		}
5516	    } /* prec */
5517	    if (fmtcnt >= 0) {
5518		if (c == 'h' || c == 'l' || c == 'L') {
5519		    if (--fmtcnt >= 0)
5520			c = *fmt++;
5521		}
5522	    }
5523	    if (fmtcnt < 0) {
5524		PyErr_SetString(PyExc_ValueError,
5525				"incomplete format");
5526		goto onError;
5527	    }
5528	    if (c != '%') {
5529		v = getnextarg(args, arglen, &argidx);
5530		if (v == NULL)
5531		    goto onError;
5532	    }
5533	    sign = 0;
5534	    fill = ' ';
5535	    switch (c) {
5536
5537	    case '%':
5538		pbuf = formatbuf;
5539		/* presume that buffer length is at least 1 */
5540		pbuf[0] = '%';
5541		len = 1;
5542		break;
5543
5544	    case 's':
5545	    case 'r':
5546		if (PyUnicode_Check(v) && c == 's') {
5547		    temp = v;
5548		    Py_INCREF(temp);
5549		}
5550		else {
5551		    PyObject *unicode;
5552		    if (c == 's')
5553			temp = PyObject_Str(v);
5554		    else
5555			temp = PyObject_Repr(v);
5556		    if (temp == NULL)
5557			goto onError;
5558		    if (!PyString_Check(temp)) {
5559			/* XXX Note: this should never happen, since
5560   			       PyObject_Repr() and PyObject_Str() assure
5561			       this */
5562			Py_DECREF(temp);
5563			PyErr_SetString(PyExc_TypeError,
5564					"%s argument has non-string str()");
5565			goto onError;
5566		    }
5567		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5568						   PyString_GET_SIZE(temp),
5569					       NULL,
5570						   "strict");
5571		    Py_DECREF(temp);
5572		    temp = unicode;
5573		    if (temp == NULL)
5574			goto onError;
5575		}
5576		pbuf = PyUnicode_AS_UNICODE(temp);
5577		len = PyUnicode_GET_SIZE(temp);
5578		if (prec >= 0 && len > prec)
5579		    len = prec;
5580		break;
5581
5582	    case 'i':
5583	    case 'd':
5584	    case 'u':
5585	    case 'o':
5586	    case 'x':
5587	    case 'X':
5588		if (c == 'i')
5589		    c = 'd';
5590		if (PyLong_Check(v)) {
5591		    temp = formatlong(v, flags, prec, c);
5592		    if (!temp)
5593			goto onError;
5594		    pbuf = PyUnicode_AS_UNICODE(temp);
5595		    len = PyUnicode_GET_SIZE(temp);
5596		    /* unbounded ints can always produce
5597		       a sign character! */
5598		    sign = 1;
5599		}
5600		else {
5601		    pbuf = formatbuf;
5602		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5603				    flags, prec, c, v);
5604		    if (len < 0)
5605			goto onError;
5606		    /* only d conversion is signed */
5607		    sign = c == 'd';
5608		}
5609		if (flags & F_ZERO)
5610		    fill = '0';
5611		break;
5612
5613	    case 'e':
5614	    case 'E':
5615	    case 'f':
5616	    case 'g':
5617	    case 'G':
5618		pbuf = formatbuf;
5619		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5620			flags, prec, c, v);
5621		if (len < 0)
5622		    goto onError;
5623		sign = 1;
5624		if (flags & F_ZERO)
5625		    fill = '0';
5626		break;
5627
5628	    case 'c':
5629		pbuf = formatbuf;
5630		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5631		if (len < 0)
5632		    goto onError;
5633		break;
5634
5635	    default:
5636		PyErr_Format(PyExc_ValueError,
5637			     "unsupported format character '%c' (0x%x) "
5638			     "at index %i",
5639			     (31<=c && c<=126) ? c : '?',
5640                             c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5641		goto onError;
5642	    }
5643	    if (sign) {
5644		if (*pbuf == '-' || *pbuf == '+') {
5645		    sign = *pbuf++;
5646		    len--;
5647		}
5648		else if (flags & F_SIGN)
5649		    sign = '+';
5650		else if (flags & F_BLANK)
5651		    sign = ' ';
5652		else
5653		    sign = 0;
5654	    }
5655	    if (width < len)
5656		width = len;
5657	    if (rescnt < width + (sign != 0)) {
5658		reslen -= rescnt;
5659		rescnt = width + fmtcnt + 100;
5660		reslen += rescnt;
5661		if (_PyUnicode_Resize(&result, reslen) < 0)
5662		    return NULL;
5663		res = PyUnicode_AS_UNICODE(result)
5664		    + reslen - rescnt;
5665	    }
5666	    if (sign) {
5667		if (fill != ' ')
5668		    *res++ = sign;
5669		rescnt--;
5670		if (width > len)
5671		    width--;
5672	    }
5673	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5674		assert(pbuf[0] == '0');
5675		assert(pbuf[1] == c);
5676		if (fill != ' ') {
5677		    *res++ = *pbuf++;
5678		    *res++ = *pbuf++;
5679		}
5680		rescnt -= 2;
5681		width -= 2;
5682		if (width < 0)
5683		    width = 0;
5684		len -= 2;
5685	    }
5686	    if (width > len && !(flags & F_LJUST)) {
5687		do {
5688		    --rescnt;
5689		    *res++ = fill;
5690		} while (--width > len);
5691	    }
5692	    if (fill == ' ') {
5693		if (sign)
5694		    *res++ = sign;
5695		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5696		    assert(pbuf[0] == '0');
5697		    assert(pbuf[1] == c);
5698		    *res++ = *pbuf++;
5699		    *res++ = *pbuf++;
5700		}
5701	    }
5702	    Py_UNICODE_COPY(res, pbuf, len);
5703	    res += len;
5704	    rescnt -= len;
5705	    while (--width >= len) {
5706		--rescnt;
5707		*res++ = ' ';
5708	    }
5709	    if (dict && (argidx < arglen) && c != '%') {
5710		PyErr_SetString(PyExc_TypeError,
5711				"not all arguments converted during string formatting");
5712		goto onError;
5713	    }
5714	    Py_XDECREF(temp);
5715	} /* '%' */
5716    } /* until end */
5717    if (argidx < arglen && !dict) {
5718	PyErr_SetString(PyExc_TypeError,
5719			"not all arguments converted during string formatting");
5720	goto onError;
5721    }
5722
5723    if (args_owned) {
5724	Py_DECREF(args);
5725    }
5726    Py_DECREF(uformat);
5727    if (_PyUnicode_Resize(&result, reslen - rescnt))
5728	goto onError;
5729    return (PyObject *)result;
5730
5731 onError:
5732    Py_XDECREF(result);
5733    Py_DECREF(uformat);
5734    if (args_owned) {
5735	Py_DECREF(args);
5736    }
5737    return NULL;
5738}
5739
5740static PyBufferProcs unicode_as_buffer = {
5741    (getreadbufferproc) unicode_buffer_getreadbuf,
5742    (getwritebufferproc) unicode_buffer_getwritebuf,
5743    (getsegcountproc) unicode_buffer_getsegcount,
5744    (getcharbufferproc) unicode_buffer_getcharbuf,
5745};
5746
5747staticforward PyObject *
5748unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5749
5750static PyObject *
5751unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5752{
5753        PyObject *x = NULL;
5754	static char *kwlist[] = {"string", "encoding", "errors", 0};
5755	char *encoding = NULL;
5756	char *errors = NULL;
5757
5758	if (type != &PyUnicode_Type)
5759		return unicode_subtype_new(type, args, kwds);
5760	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5761					  kwlist, &x, &encoding, &errors))
5762	    return NULL;
5763	if (x == NULL)
5764		return (PyObject *)_PyUnicode_New(0);
5765	if (encoding == NULL && errors == NULL)
5766	    return PyObject_Unicode(x);
5767	else
5768	return PyUnicode_FromEncodedObject(x, encoding, errors);
5769}
5770
5771static PyObject *
5772unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5773{
5774	PyUnicodeObject *tmp, *pnew;
5775	int n;
5776
5777	assert(PyType_IsSubtype(type, &PyUnicode_Type));
5778	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5779	if (tmp == NULL)
5780		return NULL;
5781	assert(PyUnicode_Check(tmp));
5782	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5783	if (pnew == NULL)
5784		return NULL;
5785	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5786	if (pnew->str == NULL) {
5787		_Py_ForgetReference((PyObject *)pnew);
5788		PyObject_Del(pnew);
5789		return NULL;
5790	}
5791	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5792	pnew->length = n;
5793	pnew->hash = tmp->hash;
5794	Py_DECREF(tmp);
5795	return (PyObject *)pnew;
5796}
5797
/* Docstring installed in the tp_doc slot of PyUnicode_Type. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5804
5805PyTypeObject PyUnicode_Type = {
5806    PyObject_HEAD_INIT(&PyType_Type)
5807    0, 					/* ob_size */
5808    "unicode", 				/* tp_name */
5809    sizeof(PyUnicodeObject), 		/* tp_size */
5810    0, 					/* tp_itemsize */
5811    /* Slots */
5812    (destructor)unicode_dealloc, 	/* tp_dealloc */
5813    0, 					/* tp_print */
5814    0,				 	/* tp_getattr */
5815    0, 					/* tp_setattr */
5816    (cmpfunc) unicode_compare, 		/* tp_compare */
5817    (reprfunc) unicode_repr, 		/* tp_repr */
5818    0, 					/* tp_as_number */
5819    &unicode_as_sequence, 		/* tp_as_sequence */
5820    0, 					/* tp_as_mapping */
5821    (hashfunc) unicode_hash, 		/* tp_hash*/
5822    0, 					/* tp_call*/
5823    (reprfunc) unicode_str,	 	/* tp_str */
5824    PyObject_GenericGetAttr, 		/* tp_getattro */
5825    0,			 		/* tp_setattro */
5826    &unicode_as_buffer,			/* tp_as_buffer */
5827    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
5828    unicode_doc,			/* tp_doc */
5829    0,					/* tp_traverse */
5830    0,					/* tp_clear */
5831    0,					/* tp_richcompare */
5832    0,					/* tp_weaklistoffset */
5833    0,					/* tp_iter */
5834    0,					/* tp_iternext */
5835    unicode_methods,			/* tp_methods */
5836    0,					/* tp_members */
5837    0,					/* tp_getset */
5838    0,					/* tp_base */
5839    0,					/* tp_dict */
5840    0,					/* tp_descr_get */
5841    0,					/* tp_descr_set */
5842    0,					/* tp_dictoffset */
5843    0,					/* tp_init */
5844    0,					/* tp_alloc */
5845    unicode_new,			/* tp_new */
5846    PyObject_Del,      		/* tp_free */
5847};
5848
5849/* Initialize the Unicode implementation */
5850
5851void _PyUnicode_Init(void)
5852{
5853    int i;
5854
5855    /* Init the implementation */
5856    unicode_freelist = NULL;
5857    unicode_freelist_size = 0;
5858    unicode_empty = _PyUnicode_New(0);
5859    strcpy(unicode_default_encoding, "ascii");
5860    for (i = 0; i < 256; i++)
5861	unicode_latin1[i] = NULL;
5862}
5863
5864/* Finalize the Unicode implementation */
5865
5866void
5867_PyUnicode_Fini(void)
5868{
5869    PyUnicodeObject *u;
5870    int i;
5871
5872    Py_XDECREF(unicode_empty);
5873    unicode_empty = NULL;
5874
5875    for (i = 0; i < 256; i++) {
5876	if (unicode_latin1[i]) {
5877	    Py_DECREF(unicode_latin1[i]);
5878	    unicode_latin1[i] = NULL;
5879	}
5880    }
5881
5882    for (u = unicode_freelist; u != NULL;) {
5883	PyUnicodeObject *v = u;
5884	u = *(PyUnicodeObject **)u;
5885	if (v->str)
5886	    PyMem_DEL(v->str);
5887	Py_XDECREF(v->defenc);
5888	PyObject_Del(v);
5889    }
5890    unicode_freelist = NULL;
5891    unicode_freelist_size = 0;
5892}
5893