unicodeobject.c revision ce4dc41b1a2be5b5335bcbc0865b145852a5c0e5
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python.  This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters.  End
17 * of string is given by the length attribute.  However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl  Created
23 * 1999-01-24 fl  Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl  Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl  Moved declarations to separate file, etc.
26 * 1999-06-13 fl  Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl  Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "unicodeobject.h"
68#include "ucnhash.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
76#ifdef MS_WIN32
77#include <windows.h>
78#endif
79
80/* Limit for the Unicode object free list */
81
82#define MAX_UNICODE_FREELIST_SIZE       1024
83
84/* Limit for the Unicode object free list stay alive optimization.
85
86   The implementation will keep allocated Unicode memory intact for
87   all objects on the free list having a size less than this
88   limit. This reduces malloc() overhead for small Unicode objects.
89
90   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
91   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
92   malloc()-overhead) bytes of unused garbage.
93
94   Setting the limit to 0 effectively turns the feature off.
95
96   Note: This is an experimental feature ! If you get core dumps when
97   using Unicode objects, turn this feature off.
98
99*/
100
101#define KEEPALIVE_SIZE_LIMIT       9
102
103/* Endianness switches; defaults to little endian */
104
105#ifdef WORDS_BIGENDIAN
106# define BYTEORDER_IS_BIG_ENDIAN
107#else
108# define BYTEORDER_IS_LITTLE_ENDIAN
109#endif
110
111/* --- Globals ------------------------------------------------------------
112
113   The globals are initialized by the _PyUnicode_Init() API and should
114   not be used before calling that API.
115
116*/
117
118/* The empty Unicode object */
119static PyUnicodeObject *unicode_empty;
120
121/* Free list for Unicode objects */
122static PyUnicodeObject *unicode_freelist;
123static int unicode_freelist_size;
124
125/* Default encoding to use and assume when NULL is passed as encoding
126   parameter; it is initialized by _PyUnicode_Init().
127
128   Always use the PyUnicode_SetDefaultEncoding() and
129   PyUnicode_GetDefaultEncoding() APIs to access this global.
130
131*/
132
133static char unicode_default_encoding[100];
134
135/* --- Unicode Object ----------------------------------------------------- */
136
137static
138int _PyUnicode_Resize(register PyUnicodeObject *unicode,
139                      int length)
140{
141    void *oldstr;
142
143    /* Shortcut if there's nothing much to do. */
144    if (unicode->length == length)
145	goto reset;
146
147    /* Resizing unicode_empty is not allowed. */
148    if (unicode == unicode_empty) {
149        PyErr_SetString(PyExc_SystemError,
150                        "can't resize empty unicode object");
151        return -1;
152    }
153
154    /* We allocate one more byte to make sure the string is
155       Ux0000 terminated -- XXX is this needed ? */
156    oldstr = unicode->str;
157    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
158    if (!unicode->str) {
159	unicode->str = oldstr;
160        PyErr_NoMemory();
161        return -1;
162    }
163    unicode->str[length] = 0;
164    unicode->length = length;
165
166 reset:
167    /* Reset the object caches */
168    if (unicode->defenc) {
169        Py_DECREF(unicode->defenc);
170        unicode->defenc = NULL;
171    }
172    unicode->hash = -1;
173
174    return 0;
175}
176
177int PyUnicode_Resize(PyObject **unicode,
178		     int length)
179{
180    PyUnicodeObject *v;
181
182    if (unicode == NULL) {
183	PyErr_BadInternalCall();
184	return -1;
185    }
186    v = (PyUnicodeObject *)*unicode;
187    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
188	PyErr_BadInternalCall();
189	return -1;
190    }
191    return _PyUnicode_Resize(v, length);
192}
193
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is U+0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
201
202static
203PyUnicodeObject *_PyUnicode_New(int length)
204{
205    register PyUnicodeObject *unicode;
206
207    /* Optimization for empty strings */
208    if (length == 0 && unicode_empty != NULL) {
209        Py_INCREF(unicode_empty);
210        return unicode_empty;
211    }
212
213    /* Unicode freelist & memory allocation */
214    if (unicode_freelist) {
215        unicode = unicode_freelist;
216        unicode_freelist = *(PyUnicodeObject **)unicode;
217        unicode_freelist_size--;
218	if (unicode->str) {
219	    /* Keep-Alive optimization: we only upsize the buffer,
220	       never downsize it. */
221	    if ((unicode->length < length) &&
222		_PyUnicode_Resize(unicode, length)) {
223		PyMem_DEL(unicode->str);
224		goto onError;
225	    }
226	}
227      else {
228	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
229      }
230      PyObject_INIT(unicode, &PyUnicode_Type);
231    }
232    else {
233        unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
234        if (unicode == NULL)
235            return NULL;
236	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
237    }
238
239    if (!unicode->str) {
240	PyErr_NoMemory();
241	goto onError;
242    }
243    unicode->str[length] = 0;
244    unicode->length = length;
245    unicode->hash = -1;
246    unicode->defenc = NULL;
247    return unicode;
248
249 onError:
250    _Py_ForgetReference((PyObject *)unicode);
251    PyObject_DEL(unicode);
252    return NULL;
253}
254
255static
256void _PyUnicode_Free(register PyUnicodeObject *unicode)
257{
258    if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
259        /* Keep-Alive optimization */
260	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
261	    PyMem_DEL(unicode->str);
262	    unicode->str = NULL;
263	    unicode->length = 0;
264	}
265	if (unicode->defenc) {
266	    Py_DECREF(unicode->defenc);
267	    unicode->defenc = NULL;
268	}
269	/* Add to free list */
270        *(PyUnicodeObject **)unicode = unicode_freelist;
271        unicode_freelist = unicode;
272        unicode_freelist_size++;
273    }
274    else {
275	PyMem_DEL(unicode->str);
276	Py_XDECREF(unicode->defenc);
277	PyObject_DEL(unicode);
278    }
279}
280
281PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
282				int size)
283{
284    PyUnicodeObject *unicode;
285
286    unicode = _PyUnicode_New(size);
287    if (!unicode)
288        return NULL;
289
290    /* Copy the Unicode data into the new object */
291    if (u != NULL)
292	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
293
294    return (PyObject *)unicode;
295}
296
297#ifdef HAVE_WCHAR_H
298
299PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
300				 int size)
301{
302    PyUnicodeObject *unicode;
303
304    if (w == NULL) {
305	PyErr_BadInternalCall();
306	return NULL;
307    }
308
309    unicode = _PyUnicode_New(size);
310    if (!unicode)
311        return NULL;
312
313    /* Copy the wchar_t data into the new object */
314#ifdef HAVE_USABLE_WCHAR_T
315    memcpy(unicode->str, w, size * sizeof(wchar_t));
316#else
317    {
318	register Py_UNICODE *u;
319	register int i;
320	u = PyUnicode_AS_UNICODE(unicode);
321	for (i = size; i >= 0; i--)
322	    *u++ = *w++;
323    }
324#endif
325
326    return (PyObject *)unicode;
327}
328
329int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
330			 register wchar_t *w,
331			 int size)
332{
333    if (unicode == NULL) {
334	PyErr_BadInternalCall();
335	return -1;
336    }
337    if (size > PyUnicode_GET_SIZE(unicode))
338	size = PyUnicode_GET_SIZE(unicode);
339#ifdef HAVE_USABLE_WCHAR_T
340    memcpy(w, unicode->str, size * sizeof(wchar_t));
341#else
342    {
343	register Py_UNICODE *u;
344	register int i;
345	u = PyUnicode_AS_UNICODE(unicode);
346	for (i = size; i >= 0; i--)
347	    *w++ = *u++;
348    }
349#endif
350
351    return size;
352}
353
354#endif
355
356PyObject *PyUnicode_FromObject(register PyObject *obj)
357{
358    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
359}
360
361PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
362				      const char *encoding,
363				      const char *errors)
364{
365    const char *s;
366    int len;
367    int owned = 0;
368    PyObject *v;
369
370    if (obj == NULL) {
371	PyErr_BadInternalCall();
372	return NULL;
373    }
374
375    /* Coerce object */
376    if (PyInstance_Check(obj)) {
377	PyObject *func;
378	func = PyObject_GetAttrString(obj, "__str__");
379	if (func == NULL) {
380	    PyErr_SetString(PyExc_TypeError,
381		  "coercing to Unicode: instance doesn't define __str__");
382	    return NULL;
383	}
384	obj = PyEval_CallObject(func, NULL);
385	Py_DECREF(func);
386	if (obj == NULL)
387	    return NULL;
388	owned = 1;
389    }
390    if (PyUnicode_Check(obj)) {
391	Py_INCREF(obj);
392	v = obj;
393	if (encoding) {
394	    PyErr_SetString(PyExc_TypeError,
395			    "decoding Unicode is not supported");
396	    return NULL;
397	}
398	goto done;
399    }
400    else if (PyString_Check(obj)) {
401	s = PyString_AS_STRING(obj);
402	len = PyString_GET_SIZE(obj);
403    }
404    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
405	/* Overwrite the error message with something more useful in
406	   case of a TypeError. */
407	if (PyErr_ExceptionMatches(PyExc_TypeError))
408	    PyErr_Format(PyExc_TypeError,
409			 "coercing to Unicode: need string or buffer, "
410			 "%.80s found",
411			 obj->ob_type->tp_name);
412	goto onError;
413    }
414
415    /* Convert to Unicode */
416    if (len == 0) {
417	Py_INCREF(unicode_empty);
418	v = (PyObject *)unicode_empty;
419    }
420    else
421	v = PyUnicode_Decode(s, len, encoding, errors);
422 done:
423    if (owned) {
424	Py_DECREF(obj);
425    }
426    return v;
427
428 onError:
429    if (owned) {
430	Py_DECREF(obj);
431    }
432    return NULL;
433}
434
435PyObject *PyUnicode_Decode(const char *s,
436			   int size,
437			   const char *encoding,
438			   const char *errors)
439{
440    PyObject *buffer = NULL, *unicode;
441
442    if (encoding == NULL)
443	encoding = PyUnicode_GetDefaultEncoding();
444
445    /* Shortcuts for common default encodings */
446    if (strcmp(encoding, "utf-8") == 0)
447        return PyUnicode_DecodeUTF8(s, size, errors);
448    else if (strcmp(encoding, "latin-1") == 0)
449        return PyUnicode_DecodeLatin1(s, size, errors);
450    else if (strcmp(encoding, "ascii") == 0)
451        return PyUnicode_DecodeASCII(s, size, errors);
452
453    /* Decode via the codec registry */
454    buffer = PyBuffer_FromMemory((void *)s, size);
455    if (buffer == NULL)
456        goto onError;
457    unicode = PyCodec_Decode(buffer, encoding, errors);
458    if (unicode == NULL)
459        goto onError;
460    if (!PyUnicode_Check(unicode)) {
461        PyErr_Format(PyExc_TypeError,
462                     "decoder did not return an unicode object (type=%.400s)",
463                     unicode->ob_type->tp_name);
464        Py_DECREF(unicode);
465        goto onError;
466    }
467    Py_DECREF(buffer);
468    return unicode;
469
470 onError:
471    Py_XDECREF(buffer);
472    return NULL;
473}
474
475PyObject *PyUnicode_Encode(const Py_UNICODE *s,
476			   int size,
477			   const char *encoding,
478			   const char *errors)
479{
480    PyObject *v, *unicode;
481
482    unicode = PyUnicode_FromUnicode(s, size);
483    if (unicode == NULL)
484	return NULL;
485    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
486    Py_DECREF(unicode);
487    return v;
488}
489
490PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
491                                    const char *encoding,
492                                    const char *errors)
493{
494    PyObject *v;
495
496    if (!PyUnicode_Check(unicode)) {
497        PyErr_BadArgument();
498        goto onError;
499    }
500
501    if (encoding == NULL)
502	encoding = PyUnicode_GetDefaultEncoding();
503
504    /* Shortcuts for common default encodings */
505    if (errors == NULL) {
506	if (strcmp(encoding, "utf-8") == 0)
507        return PyUnicode_AsUTF8String(unicode);
508	else if (strcmp(encoding, "latin-1") == 0)
509	    return PyUnicode_AsLatin1String(unicode);
510	else if (strcmp(encoding, "ascii") == 0)
511	    return PyUnicode_AsASCIIString(unicode);
512    }
513
514    /* Encode via the codec registry */
515    v = PyCodec_Encode(unicode, encoding, errors);
516    if (v == NULL)
517        goto onError;
518    /* XXX Should we really enforce this ? */
519    if (!PyString_Check(v)) {
520        PyErr_Format(PyExc_TypeError,
521                     "encoder did not return a string object (type=%.400s)",
522                     v->ob_type->tp_name);
523        Py_DECREF(v);
524        goto onError;
525    }
526    return v;
527
528 onError:
529    return NULL;
530}
531
532/* Return a Python string holding the default encoded value of the
533   Unicode object.
534
535   The resulting string is cached in the Unicode object for subsequent
536   usage by this function. The cached version is needed to implement
537   the character buffer interface and will live (at least) as long as
538   the Unicode object itself.
539
540   The refcount of the string is *not* incremented.
541
542   *** Exported for internal use by the interpreter only !!! ***
543
544*/
545
546PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
547					    const char *errors)
548{
549    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
550
551    if (v)
552        return v;
553    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
554    if (v && errors == NULL)
555        ((PyUnicodeObject *)unicode)->defenc = v;
556    return v;
557}
558
559Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
560{
561    if (!PyUnicode_Check(unicode)) {
562        PyErr_BadArgument();
563        goto onError;
564    }
565    return PyUnicode_AS_UNICODE(unicode);
566
567 onError:
568    return NULL;
569}
570
571int PyUnicode_GetSize(PyObject *unicode)
572{
573    if (!PyUnicode_Check(unicode)) {
574        PyErr_BadArgument();
575        goto onError;
576    }
577    return PyUnicode_GET_SIZE(unicode);
578
579 onError:
580    return -1;
581}
582
583const char *PyUnicode_GetDefaultEncoding(void)
584{
585    return unicode_default_encoding;
586}
587
588int PyUnicode_SetDefaultEncoding(const char *encoding)
589{
590    PyObject *v;
591
592    /* Make sure the encoding is valid. As side effect, this also
593       loads the encoding into the codec registry cache. */
594    v = _PyCodec_Lookup(encoding);
595    if (v == NULL)
596	goto onError;
597    Py_DECREF(v);
598    strncpy(unicode_default_encoding,
599	    encoding,
600	    sizeof(unicode_default_encoding));
601    return 0;
602
603 onError:
604    return -1;
605}
606
607/* --- UTF-8 Codec -------------------------------------------------------- */
608
/* Lookup table indexed by the first byte of a UTF-8 sequence; the
   entry is the total length of the sequence in bytes.  An entry of 0
   marks a byte that is illegal as a prefix.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; the last two
       bytes are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
630
631static
632int utf8_decoding_error(const char **source,
633                        Py_UNICODE **dest,
634                        const char *errors,
635                        const char *details)
636{
637    if ((errors == NULL) ||
638        (strcmp(errors,"strict") == 0)) {
639        PyErr_Format(PyExc_UnicodeError,
640                     "UTF-8 decoding error: %.400s",
641                     details);
642        return -1;
643    }
644    else if (strcmp(errors,"ignore") == 0) {
645        (*source)++;
646        return 0;
647    }
648    else if (strcmp(errors,"replace") == 0) {
649        (*source)++;
650        **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
651        (*dest)++;
652        return 0;
653    }
654    else {
655        PyErr_Format(PyExc_ValueError,
656                     "UTF-8 decoding error; unknown error handling code: %.400s",
657                     errors);
658        return -1;
659    }
660}
661
662PyObject *PyUnicode_DecodeUTF8(const char *s,
663			       int size,
664			       const char *errors)
665{
666    int n;
667    const char *e;
668    PyUnicodeObject *unicode;
669    Py_UNICODE *p;
670    const char *errmsg = "";
671
672    /* Note: size will always be longer than the resulting Unicode
673       character count */
674    unicode = _PyUnicode_New(size);
675    if (!unicode)
676        return NULL;
677    if (size == 0)
678        return (PyObject *)unicode;
679
680    /* Unpack UTF-8 encoded data */
681    p = unicode->str;
682    e = s + size;
683
684    while (s < e) {
685        Py_UCS4 ch = (unsigned char)*s;
686
687        if (ch < 0x80) {
688            *p++ = (Py_UNICODE)ch;
689            s++;
690            continue;
691        }
692
693        n = utf8_code_length[ch];
694
695        if (s + n > e) {
696	    errmsg = "unexpected end of data";
697	    goto utf8Error;
698	}
699
700        switch (n) {
701
702        case 0:
703            errmsg = "unexpected code byte";
704	    goto utf8Error;
705            break;
706
707        case 1:
708            errmsg = "internal error";
709	    goto utf8Error;
710            break;
711
712        case 2:
713            if ((s[1] & 0xc0) != 0x80) {
714                errmsg = "invalid data";
715		goto utf8Error;
716	    }
717            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
718            if (ch < 0x80) {
719                errmsg = "illegal encoding";
720		goto utf8Error;
721	    }
722	    else
723		*p++ = (Py_UNICODE)ch;
724            break;
725
726        case 3:
727            if ((s[1] & 0xc0) != 0x80 ||
728                (s[2] & 0xc0) != 0x80) {
729                errmsg = "invalid data";
730		goto utf8Error;
731	    }
732            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
733            if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
734                errmsg = "illegal encoding";
735		goto utf8Error;
736	    }
737	    else
738				*p++ = (Py_UNICODE)ch;
739            break;
740
741        case 4:
742            if ((s[1] & 0xc0) != 0x80 ||
743                (s[2] & 0xc0) != 0x80 ||
744                (s[3] & 0xc0) != 0x80) {
745                errmsg = "invalid data";
746		goto utf8Error;
747	    }
748            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
749                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
750            /* validate and convert to UTF-16 */
751            if ((ch < 0x10000) ||   /* minimum value allowed for 4
752                                       byte encoding */
753                (ch > 0x10ffff)) {  /* maximum value allowed for
754                                       UTF-16 */
755                errmsg = "illegal encoding";
756		goto utf8Error;
757	    }
758            /*  compute and append the two surrogates: */
759
760            /*  translate from 10000..10FFFF to 0..FFFF */
761            ch -= 0x10000;
762
763            /*  high surrogate = top 10 bits added to D800 */
764            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
765
766            /*  low surrogate = bottom 10 bits added to DC00 */
767            *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
768            break;
769
770        default:
771            /* Other sizes are only needed for UCS-4 */
772            errmsg = "unsupported Unicode code range";
773	    goto utf8Error;
774	    break;
775        }
776        s += n;
777	continue;
778
779    utf8Error:
780      if (utf8_decoding_error(&s, &p, errors, errmsg))
781          goto onError;
782    }
783
784    /* Adjust length */
785    if (_PyUnicode_Resize(unicode, p - unicode->str))
786        goto onError;
787
788    return (PyObject *)unicode;
789
790onError:
791    Py_DECREF(unicode);
792    return NULL;
793}
794
795/* Not used anymore, now that the encoder supports UTF-16
796   surrogates. */
797#if 0
798static
799int utf8_encoding_error(const Py_UNICODE **source,
800			char **dest,
801			const char *errors,
802			const char *details)
803{
804    if ((errors == NULL) ||
805	(strcmp(errors,"strict") == 0)) {
806	PyErr_Format(PyExc_UnicodeError,
807		     "UTF-8 encoding error: %.400s",
808		     details);
809	return -1;
810    }
811    else if (strcmp(errors,"ignore") == 0) {
812	return 0;
813    }
814    else if (strcmp(errors,"replace") == 0) {
815	**dest = '?';
816	(*dest)++;
817	return 0;
818    }
819    else {
820	PyErr_Format(PyExc_ValueError,
821		     "UTF-8 encoding error; "
822		     "unknown error handling code: %.400s",
823		     errors);
824	return -1;
825    }
826}
827#endif
828
829PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
830			       int size,
831			       const char *errors)
832{
833    PyObject *v;
834    char *p;
835    char *q;
836    Py_UCS4 ch2;
837    unsigned int cbAllocated = 3 * size;
838    unsigned int cbWritten = 0;
839    int i = 0;
840
841    v = PyString_FromStringAndSize(NULL, cbAllocated);
842    if (v == NULL)
843        return NULL;
844    if (size == 0)
845        return v;
846
847    p = q = PyString_AS_STRING(v);
848    while (i < size) {
849        Py_UCS4 ch = s[i++];
850        if (ch < 0x80) {
851            *p++ = (char) ch;
852            cbWritten++;
853        }
854        else if (ch < 0x0800) {
855            *p++ = 0xc0 | (ch >> 6);
856            *p++ = 0x80 | (ch & 0x3f);
857            cbWritten += 2;
858        }
859        else {
860            /* Check for high surrogate */
861            if (0xD800 <= ch && ch <= 0xDBFF) {
862                if (i != size) {
863                    ch2 = s[i];
864                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
865
866                        if (cbWritten >= (cbAllocated - 4)) {
867			    /* Provide enough room for some more
868			       surrogates */
869			    cbAllocated += 4*10;
870                            if (_PyString_Resize(&v, cbAllocated))
871				goto onError;
872                        }
873
874                        /* combine the two values */
875                        ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
876
877                        *p++ = (char)((ch >> 18) | 0xf0);
878                        *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
879                        i++;
880                        cbWritten += 4;
881                    }
882                }
883            }
884            else {
885                *p++ = (char)(0xe0 | (ch >> 12));
886                cbWritten += 3;
887            }
888            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
889            *p++ = (char)(0x80 | (ch & 0x3f));
890        }
891    }
892    *p = '\0';
893    if (_PyString_Resize(&v, p - q))
894	goto onError;
895    return v;
896
897 onError:
898    Py_DECREF(v);
899    return NULL;
900}
901
902PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
903{
904    if (!PyUnicode_Check(unicode)) {
905        PyErr_BadArgument();
906        return NULL;
907    }
908    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
909				PyUnicode_GET_SIZE(unicode),
910				NULL);
911}
912
913/* --- UTF-16 Codec ------------------------------------------------------- */
914
915static
916int utf16_decoding_error(const Py_UNICODE **source,
917			 Py_UNICODE **dest,
918			 const char *errors,
919			 const char *details)
920{
921    if ((errors == NULL) ||
922        (strcmp(errors,"strict") == 0)) {
923        PyErr_Format(PyExc_UnicodeError,
924                     "UTF-16 decoding error: %.400s",
925                     details);
926        return -1;
927    }
928    else if (strcmp(errors,"ignore") == 0) {
929        return 0;
930    }
931    else if (strcmp(errors,"replace") == 0) {
932	if (dest) {
933	    **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
934	    (*dest)++;
935	}
936        return 0;
937    }
938    else {
939        PyErr_Format(PyExc_ValueError,
940                     "UTF-16 decoding error; "
941		     "unknown error handling code: %.400s",
942                     errors);
943        return -1;
944    }
945}
946
947PyObject *PyUnicode_DecodeUTF16(const char *s,
948				int size,
949				const char *errors,
950				int *byteorder)
951{
952    PyUnicodeObject *unicode;
953    Py_UNICODE *p;
954    const Py_UNICODE *q, *e;
955    int bo = 0;
956    const char *errmsg = "";
957
958    /* size should be an even number */
959    if (size % sizeof(Py_UNICODE) != 0) {
960	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
961	    return NULL;
962	/* The remaining input chars are ignored if we fall through
963           here... */
964    }
965
966    /* Note: size will always be longer than the resulting Unicode
967       character count */
968    unicode = _PyUnicode_New(size);
969    if (!unicode)
970        return NULL;
971    if (size == 0)
972        return (PyObject *)unicode;
973
974    /* Unpack UTF-16 encoded data */
975    p = unicode->str;
976    q = (Py_UNICODE *)s;
977    e = q + (size / sizeof(Py_UNICODE));
978
979    if (byteorder)
980	bo = *byteorder;
981
982    while (q < e) {
983	register Py_UNICODE ch = *q++;
984
985	/* Check for BOM marks (U+FEFF) in the input and adjust
986	   current byte order setting accordingly. Swap input
987	   bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
988	   !) */
989#ifdef BYTEORDER_IS_LITTLE_ENDIAN
990	if (ch == 0xFEFF) {
991	    bo = -1;
992	    continue;
993	} else if (ch == 0xFFFE) {
994	    bo = 1;
995	    continue;
996	}
997	if (bo == 1)
998	    ch = (ch >> 8) | (ch << 8);
999#else
1000	if (ch == 0xFEFF) {
1001	    bo = 1;
1002	    continue;
1003	} else if (ch == 0xFFFE) {
1004	    bo = -1;
1005	    continue;
1006	}
1007	if (bo == -1)
1008	    ch = (ch >> 8) | (ch << 8);
1009#endif
1010	if (ch < 0xD800 || ch > 0xDFFF) {
1011	    *p++ = ch;
1012	    continue;
1013	}
1014
1015	/* UTF-16 code pair: */
1016	if (q >= e) {
1017	    errmsg = "unexpected end of data";
1018	    goto utf16Error;
1019	}
1020	if (0xDC00 <= *q && *q <= 0xDFFF) {
1021	    q++;
1022	    if (0xD800 <= *q && *q <= 0xDBFF) {
1023		/* This is valid data (a UTF-16 surrogate pair), but
1024		   we are not able to store this information since our
1025		   Py_UNICODE type only has 16 bits... this might
1026		   change someday, even though it's unlikely. */
1027		errmsg = "code pairs are not supported";
1028		goto utf16Error;
1029	    }
1030	    else
1031		continue;
1032	}
1033	errmsg = "illegal encoding";
1034	/* Fall through to report the error */
1035
1036    utf16Error:
1037	if (utf16_decoding_error(&q, &p, errors, errmsg))
1038	    goto onError;
1039    }
1040
1041    if (byteorder)
1042        *byteorder = bo;
1043
1044    /* Adjust length */
1045    if (_PyUnicode_Resize(unicode, p - unicode->str))
1046        goto onError;
1047
1048    return (PyObject *)unicode;
1049
1050onError:
1051    Py_DECREF(unicode);
1052    return NULL;
1053}
1054
1055#undef UTF16_ERROR
1056
1057PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1058				int size,
1059				const char *errors,
1060				int byteorder)
1061{
1062    PyObject *v;
1063    Py_UNICODE *p;
1064    char *q;
1065
1066    /* We don't create UTF-16 pairs... */
1067    v = PyString_FromStringAndSize(NULL,
1068			sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1069    if (v == NULL)
1070        return NULL;
1071
1072    q = PyString_AS_STRING(v);
1073    p = (Py_UNICODE *)q;
1074    if (byteorder == 0)
1075	*p++ = 0xFEFF;
1076    if (size == 0)
1077        return v;
1078    if (byteorder == 0 ||
1079#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1080	byteorder == -1
1081#else
1082	byteorder == 1
1083#endif
1084	)
1085	memcpy(p, s, size * sizeof(Py_UNICODE));
1086    else
1087	while (size-- > 0) {
1088	    Py_UNICODE ch = *s++;
1089	    *p++ = (ch >> 8) | (ch << 8);
1090	}
1091    return v;
1092}
1093
1094PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1095{
1096    if (!PyUnicode_Check(unicode)) {
1097        PyErr_BadArgument();
1098        return NULL;
1099    }
1100    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1101				 PyUnicode_GET_SIZE(unicode),
1102				 NULL,
1103				 0);
1104}
1105
1106/* --- Unicode Escape Codec ----------------------------------------------- */
1107
1108static
1109int unicodeescape_decoding_error(const char **source,
1110                                 Py_UNICODE *x,
1111                                 const char *errors,
1112                                 const char *details)
1113{
1114    if ((errors == NULL) ||
1115        (strcmp(errors,"strict") == 0)) {
1116        PyErr_Format(PyExc_UnicodeError,
1117                     "Unicode-Escape decoding error: %.400s",
1118                     details);
1119        return -1;
1120    }
1121    else if (strcmp(errors,"ignore") == 0) {
1122        return 0;
1123    }
1124    else if (strcmp(errors,"replace") == 0) {
1125        *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1126        return 0;
1127    }
1128    else {
1129        PyErr_Format(PyExc_ValueError,
1130                     "Unicode-Escape decoding error; "
1131                     "unknown error handling code: %.400s",
1132                     errors);
1133        return -1;
1134    }
1135}
1136
/* Pointer to the character-name hash API taken from the "ucnhashAPI"
   attribute of the ucnhash module.  It stays NULL until
   PyUnicode_DecodeUnicodeEscape() meets its first \N escape and imports
   the module. */
static _Py_UCNHashAPI *pucnHash = NULL;
1138
/* Case-insensitive comparison of at most count characters of s1 and s2.

   Returns 0 when the compared characters are equal ignoring case,
   otherwise the difference between the first pair of lowercased
   characters that differ.  The comparison stops at a NUL terminator
   shared by both strings, so a string shorter than count is never read
   past its end. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1, c2;

    while (count-- > 0) {
        /* The <ctype.h> functions are only defined for values that are
           representable as unsigned char (or EOF), hence the casts. */
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
        if (c1 != c2)
            return c1 - c2;
        if (c1 == '\0')
            break;
    }

    return 0;
}
1158
1159PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1160					int size,
1161					const char *errors)
1162{
1163    PyUnicodeObject *v;
1164    Py_UNICODE *p = NULL, *buf = NULL;
1165    const char *end;
1166
1167    /* Escaped strings will always be longer than the resulting
1168       Unicode string, so we start with size here and then reduce the
1169       length after conversion to the true value. */
1170    v = _PyUnicode_New(size);
1171    if (v == NULL)
1172        goto onError;
1173    if (size == 0)
1174        return (PyObject *)v;
1175    p = buf = PyUnicode_AS_UNICODE(v);
1176    end = s + size;
1177    while (s < end) {
1178        unsigned char c;
1179        Py_UNICODE x;
1180        int i;
1181
1182        /* Non-escape characters are interpreted as Unicode ordinals */
1183        if (*s != '\\') {
1184            *p++ = (unsigned char)*s++;
1185            continue;
1186        }
1187
1188        /* \ - Escapes */
1189        s++;
1190        switch (*s++) {
1191
1192        /* \x escapes */
1193        case '\n': break;
1194        case '\\': *p++ = '\\'; break;
1195        case '\'': *p++ = '\''; break;
1196        case '\"': *p++ = '\"'; break;
1197        case 'b': *p++ = '\b'; break;
1198        case 'f': *p++ = '\014'; break; /* FF */
1199        case 't': *p++ = '\t'; break;
1200        case 'n': *p++ = '\n'; break;
1201        case 'r': *p++ = '\r'; break;
1202        case 'v': *p++ = '\013'; break; /* VT */
1203        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1204
1205        /* \OOO (octal) escapes */
1206        case '0': case '1': case '2': case '3':
1207        case '4': case '5': case '6': case '7':
1208            x = s[-1] - '0';
1209            if ('0' <= *s && *s <= '7') {
1210                x = (x<<3) + *s++ - '0';
1211                if ('0' <= *s && *s <= '7')
1212                    x = (x<<3) + *s++ - '0';
1213            }
1214            *p++ = x;
1215            break;
1216
1217        /* \xXXXX escape with 1-n hex digits.  for compatibility
1218           with 8-bit strings, this code ignores all but the last
1219           two digits */
1220        case 'x':
1221            x = 0;
1222            c = (unsigned char)*s;
1223            if (isxdigit(c)) {
1224                do {
1225                    x = (x<<4) & 0xF0;
1226                    if ('0' <= c && c <= '9')
1227                        x += c - '0';
1228                    else if ('a' <= c && c <= 'f')
1229                        x += 10 + c - 'a';
1230                    else
1231                        x += 10 + c - 'A';
1232                    c = (unsigned char)*++s;
1233                } while (isxdigit(c));
1234                *p++ = (unsigned char) x;
1235            } else {
1236                *p++ = '\\';
1237                *p++ = (unsigned char)s[-1];
1238            }
1239            break;
1240
1241        /* \uXXXX with 4 hex digits */
1242        case 'u':
1243            for (x = 0, i = 0; i < 4; i++) {
1244                c = (unsigned char)s[i];
1245                if (!isxdigit(c)) {
1246                    if (unicodeescape_decoding_error(&s, &x, errors,
1247                                                     "truncated \\uXXXX"))
1248                        goto onError;
1249                    i++;
1250                    break;
1251                }
1252                x = (x<<4) & ~0xF;
1253                if (c >= '0' && c <= '9')
1254                    x += c - '0';
1255                else if (c >= 'a' && c <= 'f')
1256                    x += 10 + c - 'a';
1257                else
1258                    x += 10 + c - 'A';
1259            }
1260            s += i;
1261            *p++ = x;
1262            break;
1263
1264        case 'N':
1265            /* Ok, we need to deal with Unicode Character Names now,
1266             * make sure we've imported the hash table data...
1267             */
1268            if (pucnHash == NULL)
1269            {
1270                PyObject *mod = 0, *v = 0;
1271
1272                mod = PyImport_ImportModule("ucnhash");
1273                if (mod == NULL)
1274                    goto onError;
1275                v = PyObject_GetAttrString(mod,"ucnhashAPI");
1276                Py_DECREF(mod);
1277                if (v == NULL)
1278                {
1279                    goto onError;
1280                }
1281                pucnHash = PyCObject_AsVoidPtr(v);
1282                Py_DECREF(v);
1283                if (pucnHash == NULL)
1284                {
1285                    goto onError;
1286                }
1287            }
1288
1289            if (*s == '{')
1290            {
1291                const char *start = s + 1;
1292                const char *endBrace = start;
1293                Py_UCS4 value;
1294                unsigned long j;
1295
1296                /* look for either the closing brace, or we
1297                 * exceed the maximum length of the unicode character names
1298                 */
1299                while (*endBrace != '}' &&
1300                       (unsigned int)(endBrace - start) <=
1301                           pucnHash->cchMax &&
1302                       endBrace < end)
1303                {
1304                    endBrace++;
1305                }
1306                if (endBrace != end && *endBrace == '}')
1307                {
1308                    j = pucnHash->hash(start, endBrace - start);
1309                    if (j > pucnHash->cKeys ||
1310                        mystrnicmp(
1311                            start,
1312                            ((_Py_UnicodeCharacterName *)
1313                             (pucnHash->getValue(j)))->pszUCN,
1314                            (int)(endBrace - start)) != 0)
1315                    {
1316                        if (unicodeescape_decoding_error(
1317                                &s, &x, errors,
1318                                "Invalid Unicode Character Name"))
1319                        {
1320                            goto onError;
1321                        }
1322                        goto ucnFallthrough;
1323                    }
1324                    value = ((_Py_UnicodeCharacterName *)
1325                               (pucnHash->getValue(j)))->value;
1326                    if (value < 1<<16)
1327                    {
1328                        /* In UCS-2 range, easy solution.. */
1329                        *p++ = value;
1330                    }
1331                    else
1332                    {
1333                        /* Oops, its in UCS-4 space, */
1334                        /*  compute and append the two surrogates: */
1335                        /*  translate from 10000..10FFFF to 0..FFFFF */
1336                        value -= 0x10000;
1337
1338                        /* high surrogate = top 10 bits added to D800 */
1339                        *p++ = 0xD800 + (value >> 10);
1340
1341                        /* low surrogate  = bottom 10 bits added to DC00 */
1342                        *p++ = 0xDC00 + (value & ~0xFC00);
1343                    }
1344                    s = endBrace + 1;
1345                }
1346                else
1347                {
1348                    if (unicodeescape_decoding_error(
1349                            &s, &x, errors,
1350                            "Unicode name missing closing brace"))
1351                        goto onError;
1352                    goto ucnFallthrough;
1353                }
1354                break;
1355            }
1356            if (unicodeescape_decoding_error(
1357                    &s, &x, errors,
1358                    "Missing opening brace for Unicode Character Name escape"))
1359                goto onError;
1360ucnFallthrough:
1361            /* fall through on purpose */
1362		default:
1363            *p++ = '\\';
1364            *p++ = (unsigned char)s[-1];
1365            break;
1366        }
1367    }
1368    if (_PyUnicode_Resize(v, (int)(p - buf)))
1369		goto onError;
1370    return (PyObject *)v;
1371
1372 onError:
1373    Py_XDECREF(v);
1374    return NULL;
1375}
1376
1377/* Return a Unicode-Escape string version of the Unicode object.
1378
1379   If quotes is true, the string is enclosed in u"" or u'' quotes as
1380   appropriate.
1381
1382*/
1383
1384static const Py_UNICODE *findchar(const Py_UNICODE *s,
1385				  int size,
1386				  Py_UNICODE ch);
1387
1388static
1389PyObject *unicodeescape_string(const Py_UNICODE *s,
1390                               int size,
1391                               int quotes)
1392{
1393    PyObject *repr;
1394    char *p;
1395    char *q;
1396
1397    static const char *hexdigit = "0123456789ABCDEF";
1398
1399    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1400    if (repr == NULL)
1401        return NULL;
1402
1403    p = q = PyString_AS_STRING(repr);
1404
1405    if (quotes) {
1406        *p++ = 'u';
1407        *p++ = (findchar(s, size, '\'') &&
1408                !findchar(s, size, '"')) ? '"' : '\'';
1409    }
1410    while (size-- > 0) {
1411        Py_UNICODE ch = *s++;
1412        /* Escape quotes */
1413        if (quotes && (ch == q[1] || ch == '\\')) {
1414            *p++ = '\\';
1415            *p++ = (char) ch;
1416        }
1417        /* Map 16-bit characters to '\uxxxx' */
1418        else if (ch >= 256) {
1419            *p++ = '\\';
1420            *p++ = 'u';
1421            *p++ = hexdigit[(ch >> 12) & 0xf];
1422            *p++ = hexdigit[(ch >> 8) & 0xf];
1423            *p++ = hexdigit[(ch >> 4) & 0xf];
1424            *p++ = hexdigit[ch & 15];
1425        }
1426        /* Map non-printable US ASCII to '\ooo' */
1427        else if (ch < ' ' || ch >= 128) {
1428            *p++ = '\\';
1429            *p++ = hexdigit[(ch >> 6) & 7];
1430            *p++ = hexdigit[(ch >> 3) & 7];
1431            *p++ = hexdigit[ch & 7];
1432        }
1433        /* Copy everything else as-is */
1434        else
1435            *p++ = (char) ch;
1436    }
1437    if (quotes)
1438        *p++ = q[1];
1439
1440    *p = '\0';
1441    if (_PyString_Resize(&repr, p - q))
1442	goto onError;
1443
1444    return repr;
1445
1446 onError:
1447    Py_DECREF(repr);
1448    return NULL;
1449}
1450
1451PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1452					int size)
1453{
1454    return unicodeescape_string(s, size, 0);
1455}
1456
1457PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1458{
1459    if (!PyUnicode_Check(unicode)) {
1460        PyErr_BadArgument();
1461        return NULL;
1462    }
1463    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1464					 PyUnicode_GET_SIZE(unicode));
1465}
1466
1467/* --- Raw Unicode Escape Codec ------------------------------------------- */
1468
1469PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1470					   int size,
1471					   const char *errors)
1472{
1473    PyUnicodeObject *v;
1474    Py_UNICODE *p, *buf;
1475    const char *end;
1476    const char *bs;
1477
1478    /* Escaped strings will always be longer than the resulting
1479       Unicode string, so we start with size here and then reduce the
1480       length after conversion to the true value. */
1481    v = _PyUnicode_New(size);
1482    if (v == NULL)
1483	goto onError;
1484    if (size == 0)
1485	return (PyObject *)v;
1486    p = buf = PyUnicode_AS_UNICODE(v);
1487    end = s + size;
1488    while (s < end) {
1489	unsigned char c;
1490	Py_UNICODE x;
1491	int i;
1492
1493	/* Non-escape characters are interpreted as Unicode ordinals */
1494	if (*s != '\\') {
1495	    *p++ = (unsigned char)*s++;
1496	    continue;
1497	}
1498
1499	/* \u-escapes are only interpreted iff the number of leading
1500	   backslashes if odd */
1501	bs = s;
1502	for (;s < end;) {
1503	    if (*s != '\\')
1504		break;
1505	    *p++ = (unsigned char)*s++;
1506	}
1507	if (((s - bs) & 1) == 0 ||
1508	    s >= end ||
1509	    *s != 'u') {
1510	    continue;
1511	}
1512	p--;
1513	s++;
1514
1515	/* \uXXXX with 4 hex digits */
1516	for (x = 0, i = 0; i < 4; i++) {
1517	    c = (unsigned char)s[i];
1518	    if (!isxdigit(c)) {
1519		if (unicodeescape_decoding_error(&s, &x, errors,
1520						 "truncated \\uXXXX"))
1521		    goto onError;
1522		i++;
1523		break;
1524	    }
1525	    x = (x<<4) & ~0xF;
1526	    if (c >= '0' && c <= '9')
1527		x += c - '0';
1528	    else if (c >= 'a' && c <= 'f')
1529		x += 10 + c - 'a';
1530	    else
1531		x += 10 + c - 'A';
1532	}
1533	s += i;
1534	*p++ = x;
1535    }
1536    if (_PyUnicode_Resize(v, (int)(p - buf)))
1537	goto onError;
1538    return (PyObject *)v;
1539
1540 onError:
1541    Py_XDECREF(v);
1542    return NULL;
1543}
1544
1545PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1546					   int size)
1547{
1548    PyObject *repr;
1549    char *p;
1550    char *q;
1551
1552    static const char *hexdigit = "0123456789ABCDEF";
1553
1554    repr = PyString_FromStringAndSize(NULL, 6 * size);
1555    if (repr == NULL)
1556        return NULL;
1557    if (size == 0)
1558	return repr;
1559
1560    p = q = PyString_AS_STRING(repr);
1561    while (size-- > 0) {
1562        Py_UNICODE ch = *s++;
1563	/* Map 16-bit characters to '\uxxxx' */
1564	if (ch >= 256) {
1565            *p++ = '\\';
1566            *p++ = 'u';
1567            *p++ = hexdigit[(ch >> 12) & 0xf];
1568            *p++ = hexdigit[(ch >> 8) & 0xf];
1569            *p++ = hexdigit[(ch >> 4) & 0xf];
1570            *p++ = hexdigit[ch & 15];
1571        }
1572	/* Copy everything else as-is */
1573	else
1574            *p++ = (char) ch;
1575    }
1576    *p = '\0';
1577    if (_PyString_Resize(&repr, p - q))
1578	goto onError;
1579
1580    return repr;
1581
1582 onError:
1583    Py_DECREF(repr);
1584    return NULL;
1585}
1586
1587PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1588{
1589    if (!PyUnicode_Check(unicode)) {
1590	PyErr_BadArgument();
1591	return NULL;
1592    }
1593    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1594					    PyUnicode_GET_SIZE(unicode));
1595}
1596
1597/* --- Latin-1 Codec ------------------------------------------------------ */
1598
1599PyObject *PyUnicode_DecodeLatin1(const char *s,
1600				 int size,
1601				 const char *errors)
1602{
1603    PyUnicodeObject *v;
1604    Py_UNICODE *p;
1605
1606    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1607    v = _PyUnicode_New(size);
1608    if (v == NULL)
1609	goto onError;
1610    if (size == 0)
1611	return (PyObject *)v;
1612    p = PyUnicode_AS_UNICODE(v);
1613    while (size-- > 0)
1614	*p++ = (unsigned char)*s++;
1615    return (PyObject *)v;
1616
1617 onError:
1618    Py_XDECREF(v);
1619    return NULL;
1620}
1621
1622static
1623int latin1_encoding_error(const Py_UNICODE **source,
1624			  char **dest,
1625			  const char *errors,
1626			  const char *details)
1627{
1628    if ((errors == NULL) ||
1629	(strcmp(errors,"strict") == 0)) {
1630	PyErr_Format(PyExc_UnicodeError,
1631		     "Latin-1 encoding error: %.400s",
1632		     details);
1633	return -1;
1634    }
1635    else if (strcmp(errors,"ignore") == 0) {
1636	return 0;
1637    }
1638    else if (strcmp(errors,"replace") == 0) {
1639	**dest = '?';
1640	(*dest)++;
1641	return 0;
1642    }
1643    else {
1644	PyErr_Format(PyExc_ValueError,
1645		     "Latin-1 encoding error; "
1646		     "unknown error handling code: %.400s",
1647		     errors);
1648	return -1;
1649    }
1650}
1651
1652PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1653				 int size,
1654				 const char *errors)
1655{
1656    PyObject *repr;
1657    char *s, *start;
1658
1659    repr = PyString_FromStringAndSize(NULL, size);
1660    if (repr == NULL)
1661        return NULL;
1662    if (size == 0)
1663	return repr;
1664
1665    s = PyString_AS_STRING(repr);
1666    start = s;
1667    while (size-- > 0) {
1668        Py_UNICODE ch = *p++;
1669	if (ch >= 256) {
1670	    if (latin1_encoding_error(&p, &s, errors,
1671				      "ordinal not in range(256)"))
1672		goto onError;
1673	}
1674	else
1675            *s++ = (char)ch;
1676    }
1677    /* Resize if error handling skipped some characters */
1678    if (s - start < PyString_GET_SIZE(repr))
1679	if (_PyString_Resize(&repr, s - start))
1680	    goto onError;
1681    return repr;
1682
1683 onError:
1684    Py_DECREF(repr);
1685    return NULL;
1686}
1687
1688PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1689{
1690    if (!PyUnicode_Check(unicode)) {
1691	PyErr_BadArgument();
1692	return NULL;
1693    }
1694    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1695				  PyUnicode_GET_SIZE(unicode),
1696				  NULL);
1697}
1698
1699/* --- 7-bit ASCII Codec -------------------------------------------------- */
1700
1701static
1702int ascii_decoding_error(const char **source,
1703			 Py_UNICODE **dest,
1704			 const char *errors,
1705			 const char *details)
1706{
1707    if ((errors == NULL) ||
1708	(strcmp(errors,"strict") == 0)) {
1709	PyErr_Format(PyExc_UnicodeError,
1710		     "ASCII decoding error: %.400s",
1711		     details);
1712	return -1;
1713    }
1714    else if (strcmp(errors,"ignore") == 0) {
1715	return 0;
1716    }
1717    else if (strcmp(errors,"replace") == 0) {
1718	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1719	(*dest)++;
1720	return 0;
1721    }
1722    else {
1723	PyErr_Format(PyExc_ValueError,
1724		     "ASCII decoding error; "
1725		     "unknown error handling code: %.400s",
1726		     errors);
1727	return -1;
1728    }
1729}
1730
1731PyObject *PyUnicode_DecodeASCII(const char *s,
1732				int size,
1733				const char *errors)
1734{
1735    PyUnicodeObject *v;
1736    Py_UNICODE *p;
1737
1738    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1739    v = _PyUnicode_New(size);
1740    if (v == NULL)
1741	goto onError;
1742    if (size == 0)
1743	return (PyObject *)v;
1744    p = PyUnicode_AS_UNICODE(v);
1745    while (size-- > 0) {
1746	register unsigned char c;
1747
1748	c = (unsigned char)*s++;
1749	if (c < 128)
1750	    *p++ = c;
1751	else if (ascii_decoding_error(&s, &p, errors,
1752				      "ordinal not in range(128)"))
1753		goto onError;
1754    }
1755    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1756	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1757	    goto onError;
1758    return (PyObject *)v;
1759
1760 onError:
1761    Py_XDECREF(v);
1762    return NULL;
1763}
1764
1765static
1766int ascii_encoding_error(const Py_UNICODE **source,
1767			 char **dest,
1768			 const char *errors,
1769			 const char *details)
1770{
1771    if ((errors == NULL) ||
1772	(strcmp(errors,"strict") == 0)) {
1773	PyErr_Format(PyExc_UnicodeError,
1774		     "ASCII encoding error: %.400s",
1775		     details);
1776	return -1;
1777    }
1778    else if (strcmp(errors,"ignore") == 0) {
1779	return 0;
1780    }
1781    else if (strcmp(errors,"replace") == 0) {
1782	**dest = '?';
1783	(*dest)++;
1784	return 0;
1785    }
1786    else {
1787	PyErr_Format(PyExc_ValueError,
1788		     "ASCII encoding error; "
1789		     "unknown error handling code: %.400s",
1790		     errors);
1791	return -1;
1792    }
1793}
1794
1795PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1796				int size,
1797				const char *errors)
1798{
1799    PyObject *repr;
1800    char *s, *start;
1801
1802    repr = PyString_FromStringAndSize(NULL, size);
1803    if (repr == NULL)
1804        return NULL;
1805    if (size == 0)
1806	return repr;
1807
1808    s = PyString_AS_STRING(repr);
1809    start = s;
1810    while (size-- > 0) {
1811        Py_UNICODE ch = *p++;
1812	if (ch >= 128) {
1813	    if (ascii_encoding_error(&p, &s, errors,
1814				      "ordinal not in range(128)"))
1815		goto onError;
1816	}
1817	else
1818            *s++ = (char)ch;
1819    }
1820    /* Resize if error handling skipped some characters */
1821    if (s - start < PyString_GET_SIZE(repr))
1822	if (_PyString_Resize(&repr, s - start))
1823	    goto onError;
1824    return repr;
1825
1826 onError:
1827    Py_DECREF(repr);
1828    return NULL;
1829}
1830
1831PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1832{
1833    if (!PyUnicode_Check(unicode)) {
1834	PyErr_BadArgument();
1835	return NULL;
1836    }
1837    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1838				 PyUnicode_GET_SIZE(unicode),
1839				 NULL);
1840}
1841
1842#ifdef MS_WIN32
1843
1844/* --- MBCS codecs for Windows -------------------------------------------- */
1845
1846PyObject *PyUnicode_DecodeMBCS(const char *s,
1847				int size,
1848				const char *errors)
1849{
1850    PyUnicodeObject *v;
1851    Py_UNICODE *p;
1852
1853    /* First get the size of the result */
1854    DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1855    if (size > 0 && usize==0)
1856        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1857
1858    v = _PyUnicode_New(usize);
1859    if (v == NULL)
1860        return NULL;
1861    if (usize == 0)
1862	return (PyObject *)v;
1863    p = PyUnicode_AS_UNICODE(v);
1864    if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1865        Py_DECREF(v);
1866        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1867    }
1868
1869    return (PyObject *)v;
1870}
1871
1872PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1873				int size,
1874				const char *errors)
1875{
1876    PyObject *repr;
1877    char *s;
1878    DWORD mbcssize;
1879
1880    /* If there are no characters, bail now! */
1881    if (size==0)
1882	    return PyString_FromString("");
1883
1884    /* First get the size of the result */
1885    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1886    if (mbcssize==0)
1887        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1888
1889    repr = PyString_FromStringAndSize(NULL, mbcssize);
1890    if (repr == NULL)
1891        return NULL;
1892    if (mbcssize == 0)
1893        return repr;
1894
1895    /* Do the conversion */
1896    s = PyString_AS_STRING(repr);
1897    if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1898        Py_DECREF(repr);
1899        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1900    }
1901    return repr;
1902}
1903
1904#endif /* MS_WIN32 */
1905
1906/* --- Character Mapping Codec -------------------------------------------- */
1907
1908static
1909int charmap_decoding_error(const char **source,
1910			 Py_UNICODE **dest,
1911			 const char *errors,
1912			 const char *details)
1913{
1914    if ((errors == NULL) ||
1915	(strcmp(errors,"strict") == 0)) {
1916	PyErr_Format(PyExc_UnicodeError,
1917		     "charmap decoding error: %.400s",
1918		     details);
1919	return -1;
1920    }
1921    else if (strcmp(errors,"ignore") == 0) {
1922	return 0;
1923    }
1924    else if (strcmp(errors,"replace") == 0) {
1925	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1926	(*dest)++;
1927	return 0;
1928    }
1929    else {
1930	PyErr_Format(PyExc_ValueError,
1931		     "charmap decoding error; "
1932		     "unknown error handling code: %.400s",
1933		     errors);
1934	return -1;
1935    }
1936}
1937
1938PyObject *PyUnicode_DecodeCharmap(const char *s,
1939				  int size,
1940				  PyObject *mapping,
1941				  const char *errors)
1942{
1943    PyUnicodeObject *v;
1944    Py_UNICODE *p;
1945
1946    /* Default to Latin-1 */
1947    if (mapping == NULL)
1948	return PyUnicode_DecodeLatin1(s, size, errors);
1949
1950    v = _PyUnicode_New(size);
1951    if (v == NULL)
1952	goto onError;
1953    if (size == 0)
1954	return (PyObject *)v;
1955    p = PyUnicode_AS_UNICODE(v);
1956    while (size-- > 0) {
1957	unsigned char ch = *s++;
1958	PyObject *w, *x;
1959
1960	/* Get mapping (char ordinal -> integer, Unicode char or None) */
1961	w = PyInt_FromLong((long)ch);
1962	if (w == NULL)
1963	    goto onError;
1964	x = PyObject_GetItem(mapping, w);
1965	Py_DECREF(w);
1966	if (x == NULL) {
1967	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1968		/* No mapping found: default to Latin-1 mapping */
1969		PyErr_Clear();
1970		*p++ = (Py_UNICODE)ch;
1971		continue;
1972	    }
1973	    goto onError;
1974	}
1975
1976	/* Apply mapping */
1977	if (PyInt_Check(x)) {
1978	    long value = PyInt_AS_LONG(x);
1979	    if (value < 0 || value > 65535) {
1980		PyErr_SetString(PyExc_TypeError,
1981				"character mapping must be in range(65536)");
1982		Py_DECREF(x);
1983		goto onError;
1984	    }
1985	    *p++ = (Py_UNICODE)value;
1986	}
1987	else if (x == Py_None) {
1988	    /* undefined mapping */
1989	    if (charmap_decoding_error(&s, &p, errors,
1990				       "character maps to <undefined>")) {
1991		Py_DECREF(x);
1992		goto onError;
1993	    }
1994	}
1995	else if (PyUnicode_Check(x)) {
1996	    if (PyUnicode_GET_SIZE(x) != 1) {
1997		/* 1-n mapping */
1998		PyErr_SetString(PyExc_NotImplementedError,
1999				"1-n mappings are currently not implemented");
2000		Py_DECREF(x);
2001		goto onError;
2002	    }
2003	    *p++ = *PyUnicode_AS_UNICODE(x);
2004	}
2005	else {
2006	    /* wrong return value */
2007	    PyErr_SetString(PyExc_TypeError,
2008		  "character mapping must return integer, None or unicode");
2009	    Py_DECREF(x);
2010	    goto onError;
2011	}
2012	Py_DECREF(x);
2013    }
2014    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2015	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2016	    goto onError;
2017    return (PyObject *)v;
2018
2019 onError:
2020    Py_XDECREF(v);
2021    return NULL;
2022}
2023
2024static
2025int charmap_encoding_error(const Py_UNICODE **source,
2026			   char **dest,
2027			   const char *errors,
2028			   const char *details)
2029{
2030    if ((errors == NULL) ||
2031	(strcmp(errors,"strict") == 0)) {
2032	PyErr_Format(PyExc_UnicodeError,
2033		     "charmap encoding error: %.400s",
2034		     details);
2035	return -1;
2036    }
2037    else if (strcmp(errors,"ignore") == 0) {
2038	return 0;
2039    }
2040    else if (strcmp(errors,"replace") == 0) {
2041	**dest = '?';
2042	(*dest)++;
2043	return 0;
2044    }
2045    else {
2046	PyErr_Format(PyExc_ValueError,
2047		     "charmap encoding error; "
2048		     "unknown error handling code: %.400s",
2049		     errors);
2050	return -1;
2051    }
2052}
2053
2054PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2055				  int size,
2056				  PyObject *mapping,
2057				  const char *errors)
2058{
2059    PyObject *v;
2060    char *s;
2061
2062    /* Default to Latin-1 */
2063    if (mapping == NULL)
2064	return PyUnicode_EncodeLatin1(p, size, errors);
2065
2066    v = PyString_FromStringAndSize(NULL, size);
2067    if (v == NULL)
2068        return NULL;
2069    if (size == 0)
2070	return v;
2071    s = PyString_AS_STRING(v);
2072    while (size-- > 0) {
2073	Py_UNICODE ch = *p++;
2074	PyObject *w, *x;
2075
2076	/* Get mapping (Unicode ordinal -> string char, integer or None) */
2077	w = PyInt_FromLong((long)ch);
2078	if (w == NULL)
2079	    goto onError;
2080	x = PyObject_GetItem(mapping, w);
2081	Py_DECREF(w);
2082	if (x == NULL) {
2083	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2084		/* No mapping found: default to Latin-1 mapping if possible */
2085		PyErr_Clear();
2086		if (ch < 256) {
2087		    *s++ = (char)ch;
2088		    continue;
2089		}
2090		else if (!charmap_encoding_error(&p, &s, errors,
2091				     "missing character mapping"))
2092		    continue;
2093	    }
2094	    goto onError;
2095	}
2096
2097	/* Apply mapping */
2098	if (PyInt_Check(x)) {
2099	    long value = PyInt_AS_LONG(x);
2100	    if (value < 0 || value > 255) {
2101		PyErr_SetString(PyExc_TypeError,
2102				"character mapping must be in range(256)");
2103		Py_DECREF(x);
2104		goto onError;
2105	    }
2106	    *s++ = (char)value;
2107	}
2108	else if (x == Py_None) {
2109	    /* undefined mapping */
2110	    if (charmap_encoding_error(&p, &s, errors,
2111				       "character maps to <undefined>")) {
2112		Py_DECREF(x);
2113		goto onError;
2114	    }
2115	}
2116	else if (PyString_Check(x)) {
2117	    if (PyString_GET_SIZE(x) != 1) {
2118		/* 1-n mapping */
2119		PyErr_SetString(PyExc_NotImplementedError,
2120		      "1-n mappings are currently not implemented");
2121		Py_DECREF(x);
2122		goto onError;
2123	    }
2124	    *s++ = *PyString_AS_STRING(x);
2125	}
2126	else {
2127	    /* wrong return value */
2128	    PyErr_SetString(PyExc_TypeError,
2129		  "character mapping must return integer, None or unicode");
2130	    Py_DECREF(x);
2131	    goto onError;
2132	}
2133	Py_DECREF(x);
2134    }
2135    if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2136	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2137	    goto onError;
2138    return v;
2139
2140 onError:
2141    Py_DECREF(v);
2142    return NULL;
2143}
2144
2145PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2146				    PyObject *mapping)
2147{
2148    if (!PyUnicode_Check(unicode) || mapping == NULL) {
2149	PyErr_BadArgument();
2150	return NULL;
2151    }
2152    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2153				   PyUnicode_GET_SIZE(unicode),
2154				   mapping,
2155				   NULL);
2156}
2157
2158static
2159int translate_error(const Py_UNICODE **source,
2160		    Py_UNICODE **dest,
2161		    const char *errors,
2162		    const char *details)
2163{
2164    if ((errors == NULL) ||
2165	(strcmp(errors,"strict") == 0)) {
2166	PyErr_Format(PyExc_UnicodeError,
2167		     "translate error: %.400s",
2168		     details);
2169	return -1;
2170    }
2171    else if (strcmp(errors,"ignore") == 0) {
2172	return 0;
2173    }
2174    else if (strcmp(errors,"replace") == 0) {
2175	**dest = '?';
2176	(*dest)++;
2177	return 0;
2178    }
2179    else {
2180	PyErr_Format(PyExc_ValueError,
2181		     "translate error; "
2182		     "unknown error handling code: %.400s",
2183		     errors);
2184	return -1;
2185    }
2186}
2187
2188PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2189				     int size,
2190				     PyObject *mapping,
2191				     const char *errors)
2192{
2193    PyUnicodeObject *v;
2194    Py_UNICODE *p;
2195
2196    if (mapping == NULL) {
2197	PyErr_BadArgument();
2198	return NULL;
2199    }
2200
2201    /* Output will never be longer than input */
2202    v = _PyUnicode_New(size);
2203    if (v == NULL)
2204	goto onError;
2205    if (size == 0)
2206	goto done;
2207    p = PyUnicode_AS_UNICODE(v);
2208    while (size-- > 0) {
2209	Py_UNICODE ch = *s++;
2210	PyObject *w, *x;
2211
2212	/* Get mapping */
2213	w = PyInt_FromLong(ch);
2214	if (w == NULL)
2215	    goto onError;
2216	x = PyObject_GetItem(mapping, w);
2217	Py_DECREF(w);
2218	if (x == NULL) {
2219	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2220		/* No mapping found: default to 1-1 mapping */
2221		PyErr_Clear();
2222		*p++ = ch;
2223		continue;
2224	    }
2225	    goto onError;
2226	}
2227
2228	/* Apply mapping */
2229	if (PyInt_Check(x))
2230	    *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2231	else if (x == Py_None) {
2232	    /* undefined mapping */
2233	    if (translate_error(&s, &p, errors,
2234				"character maps to <undefined>")) {
2235		Py_DECREF(x);
2236		goto onError;
2237	    }
2238	}
2239	else if (PyUnicode_Check(x)) {
2240	    if (PyUnicode_GET_SIZE(x) != 1) {
2241		/* 1-n mapping */
2242		PyErr_SetString(PyExc_NotImplementedError,
2243				"1-n mappings are currently not implemented");
2244		Py_DECREF(x);
2245		goto onError;
2246	    }
2247	    *p++ = *PyUnicode_AS_UNICODE(x);
2248	}
2249	else {
2250	    /* wrong return value */
2251	    PyErr_SetString(PyExc_TypeError,
2252		  "translate mapping must return integer, None or unicode");
2253	    Py_DECREF(x);
2254	    goto onError;
2255	}
2256	Py_DECREF(x);
2257    }
2258    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2259	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2260	    goto onError;
2261
2262 done:
2263    return (PyObject *)v;
2264
2265 onError:
2266    Py_XDECREF(v);
2267    return NULL;
2268}
2269
2270PyObject *PyUnicode_Translate(PyObject *str,
2271			      PyObject *mapping,
2272			      const char *errors)
2273{
2274    PyObject *result;
2275
2276    str = PyUnicode_FromObject(str);
2277    if (str == NULL)
2278	goto onError;
2279    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2280					PyUnicode_GET_SIZE(str),
2281					mapping,
2282					errors);
2283    Py_DECREF(str);
2284    return result;
2285
2286 onError:
2287    Py_XDECREF(str);
2288    return NULL;
2289}
2290
2291/* --- Decimal Encoder ---------------------------------------------------- */
2292
2293int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2294			    int length,
2295			    char *output,
2296			    const char *errors)
2297{
2298    Py_UNICODE *p, *end;
2299
2300    if (output == NULL) {
2301	PyErr_BadArgument();
2302	return -1;
2303    }
2304
2305    p = s;
2306    end = s + length;
2307    while (p < end) {
2308	register Py_UNICODE ch = *p++;
2309	int decimal;
2310
2311	if (Py_UNICODE_ISSPACE(ch)) {
2312	    *output++ = ' ';
2313	    continue;
2314	}
2315	decimal = Py_UNICODE_TODECIMAL(ch);
2316	if (decimal >= 0) {
2317	    *output++ = '0' + decimal;
2318	    continue;
2319	}
2320	if (0 < ch && ch < 256) {
2321	    *output++ = (char)ch;
2322	    continue;
2323	}
2324	/* All other characters are considered invalid */
2325	if (errors == NULL || strcmp(errors, "strict") == 0) {
2326	    PyErr_SetString(PyExc_ValueError,
2327			    "invalid decimal Unicode string");
2328	    goto onError;
2329	}
2330	else if (strcmp(errors, "ignore") == 0)
2331	    continue;
2332	else if (strcmp(errors, "replace") == 0) {
2333	    *output++ = '?';
2334	    continue;
2335	}
2336    }
2337    /* 0-terminate the output string */
2338    *output++ = '\0';
2339    return 0;
2340
2341 onError:
2342    return -1;
2343}
2344
2345/* --- Helpers ------------------------------------------------------------ */
2346
2347static
2348int count(PyUnicodeObject *self,
2349	  int start,
2350	  int end,
2351	  PyUnicodeObject *substring)
2352{
2353    int count = 0;
2354
2355    if (substring->length == 0)
2356	return (end - start + 1);
2357
2358    end -= substring->length;
2359
2360    while (start <= end)
2361        if (Py_UNICODE_MATCH(self, start, substring)) {
2362            count++;
2363            start += substring->length;
2364        } else
2365            start++;
2366
2367    return count;
2368}
2369
2370int PyUnicode_Count(PyObject *str,
2371		    PyObject *substr,
2372		    int start,
2373		    int end)
2374{
2375    int result;
2376
2377    str = PyUnicode_FromObject(str);
2378    if (str == NULL)
2379	return -1;
2380    substr = PyUnicode_FromObject(substr);
2381    if (substr == NULL) {
2382	Py_DECREF(str);
2383	return -1;
2384    }
2385
2386    result = count((PyUnicodeObject *)str,
2387		   start, end,
2388		   (PyUnicodeObject *)substr);
2389
2390    Py_DECREF(str);
2391    Py_DECREF(substr);
2392    return result;
2393}
2394
2395static
2396int findstring(PyUnicodeObject *self,
2397	       PyUnicodeObject *substring,
2398	       int start,
2399	       int end,
2400	       int direction)
2401{
2402    if (start < 0)
2403        start += self->length;
2404    if (start < 0)
2405        start = 0;
2406
2407    if (substring->length == 0)
2408        return start;
2409
2410    if (end > self->length)
2411        end = self->length;
2412    if (end < 0)
2413        end += self->length;
2414    if (end < 0)
2415        end = 0;
2416
2417    end -= substring->length;
2418
2419    if (direction < 0) {
2420        for (; end >= start; end--)
2421            if (Py_UNICODE_MATCH(self, end, substring))
2422                return end;
2423    } else {
2424        for (; start <= end; start++)
2425            if (Py_UNICODE_MATCH(self, start, substring))
2426                return start;
2427    }
2428
2429    return -1;
2430}
2431
2432int PyUnicode_Find(PyObject *str,
2433		   PyObject *substr,
2434		   int start,
2435		   int end,
2436		   int direction)
2437{
2438    int result;
2439
2440    str = PyUnicode_FromObject(str);
2441    if (str == NULL)
2442	return -1;
2443    substr = PyUnicode_FromObject(substr);
2444    if (substr == NULL) {
2445	Py_DECREF(substr);
2446	return -1;
2447    }
2448
2449    result = findstring((PyUnicodeObject *)str,
2450			(PyUnicodeObject *)substr,
2451			start, end, direction);
2452    Py_DECREF(str);
2453    Py_DECREF(substr);
2454    return result;
2455}
2456
2457static
2458int tailmatch(PyUnicodeObject *self,
2459	      PyUnicodeObject *substring,
2460	      int start,
2461	      int end,
2462	      int direction)
2463{
2464    if (start < 0)
2465        start += self->length;
2466    if (start < 0)
2467        start = 0;
2468
2469    if (substring->length == 0)
2470        return 1;
2471
2472    if (end > self->length)
2473        end = self->length;
2474    if (end < 0)
2475        end += self->length;
2476    if (end < 0)
2477        end = 0;
2478
2479    end -= substring->length;
2480    if (end < start)
2481	return 0;
2482
2483    if (direction > 0) {
2484	if (Py_UNICODE_MATCH(self, end, substring))
2485	    return 1;
2486    } else {
2487        if (Py_UNICODE_MATCH(self, start, substring))
2488	    return 1;
2489    }
2490
2491    return 0;
2492}
2493
2494int PyUnicode_Tailmatch(PyObject *str,
2495			PyObject *substr,
2496			int start,
2497			int end,
2498			int direction)
2499{
2500    int result;
2501
2502    str = PyUnicode_FromObject(str);
2503    if (str == NULL)
2504	return -1;
2505    substr = PyUnicode_FromObject(substr);
2506    if (substr == NULL) {
2507	Py_DECREF(substr);
2508	return -1;
2509    }
2510
2511    result = tailmatch((PyUnicodeObject *)str,
2512		       (PyUnicodeObject *)substr,
2513		       start, end, direction);
2514    Py_DECREF(str);
2515    Py_DECREF(substr);
2516    return result;
2517}
2518
2519static
2520const Py_UNICODE *findchar(const Py_UNICODE *s,
2521		     int size,
2522		     Py_UNICODE ch)
2523{
2524    /* like wcschr, but doesn't stop at NULL characters */
2525
2526    while (size-- > 0) {
2527        if (*s == ch)
2528            return s;
2529        s++;
2530    }
2531
2532    return NULL;
2533}
2534
2535/* Apply fixfct filter to the Unicode object self and return a
2536   reference to the modified object */
2537
2538static
2539PyObject *fixup(PyUnicodeObject *self,
2540		int (*fixfct)(PyUnicodeObject *s))
2541{
2542
2543    PyUnicodeObject *u;
2544
2545    u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2546						 self->length);
2547    if (u == NULL)
2548	return NULL;
2549    if (!fixfct(u)) {
2550	/* fixfct should return TRUE if it modified the buffer. If
2551	   FALSE, return a reference to the original buffer instead
2552	   (to save space, not time) */
2553	Py_INCREF(self);
2554	Py_DECREF(u);
2555	return (PyObject*) self;
2556    }
2557    return (PyObject*) u;
2558}
2559
2560static
2561int fixupper(PyUnicodeObject *self)
2562{
2563    int len = self->length;
2564    Py_UNICODE *s = self->str;
2565    int status = 0;
2566
2567    while (len-- > 0) {
2568	register Py_UNICODE ch;
2569
2570	ch = Py_UNICODE_TOUPPER(*s);
2571	if (ch != *s) {
2572            status = 1;
2573	    *s = ch;
2574	}
2575        s++;
2576    }
2577
2578    return status;
2579}
2580
2581static
2582int fixlower(PyUnicodeObject *self)
2583{
2584    int len = self->length;
2585    Py_UNICODE *s = self->str;
2586    int status = 0;
2587
2588    while (len-- > 0) {
2589	register Py_UNICODE ch;
2590
2591	ch = Py_UNICODE_TOLOWER(*s);
2592	if (ch != *s) {
2593            status = 1;
2594	    *s = ch;
2595	}
2596        s++;
2597    }
2598
2599    return status;
2600}
2601
2602static
2603int fixswapcase(PyUnicodeObject *self)
2604{
2605    int len = self->length;
2606    Py_UNICODE *s = self->str;
2607    int status = 0;
2608
2609    while (len-- > 0) {
2610        if (Py_UNICODE_ISUPPER(*s)) {
2611            *s = Py_UNICODE_TOLOWER(*s);
2612            status = 1;
2613        } else if (Py_UNICODE_ISLOWER(*s)) {
2614            *s = Py_UNICODE_TOUPPER(*s);
2615            status = 1;
2616        }
2617        s++;
2618    }
2619
2620    return status;
2621}
2622
2623static
2624int fixcapitalize(PyUnicodeObject *self)
2625{
2626    if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2627	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2628	return 1;
2629    }
2630    return 0;
2631}
2632
2633static
2634int fixtitle(PyUnicodeObject *self)
2635{
2636    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2637    register Py_UNICODE *e;
2638    int previous_is_cased;
2639
2640    /* Shortcut for single character strings */
2641    if (PyUnicode_GET_SIZE(self) == 1) {
2642	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2643	if (*p != ch) {
2644	    *p = ch;
2645	    return 1;
2646	}
2647	else
2648	    return 0;
2649    }
2650
2651    e = p + PyUnicode_GET_SIZE(self);
2652    previous_is_cased = 0;
2653    for (; p < e; p++) {
2654	register const Py_UNICODE ch = *p;
2655
2656	if (previous_is_cased)
2657	    *p = Py_UNICODE_TOLOWER(ch);
2658	else
2659	    *p = Py_UNICODE_TOTITLE(ch);
2660
2661	if (Py_UNICODE_ISLOWER(ch) ||
2662	    Py_UNICODE_ISUPPER(ch) ||
2663	    Py_UNICODE_ISTITLE(ch))
2664	    previous_is_cased = 1;
2665	else
2666	    previous_is_cased = 0;
2667    }
2668    return 1;
2669}
2670
2671PyObject *PyUnicode_Join(PyObject *separator,
2672			 PyObject *seq)
2673{
2674    Py_UNICODE *sep;
2675    int seplen;
2676    PyUnicodeObject *res = NULL;
2677    int reslen = 0;
2678    Py_UNICODE *p;
2679    int seqlen = 0;
2680    int sz = 100;
2681    int i;
2682
2683    seqlen = PySequence_Size(seq);
2684    if (seqlen < 0 && PyErr_Occurred())
2685	return NULL;
2686
2687    if (separator == NULL) {
2688	Py_UNICODE blank = ' ';
2689	sep = &blank;
2690	seplen = 1;
2691    }
2692    else {
2693	separator = PyUnicode_FromObject(separator);
2694	if (separator == NULL)
2695	    return NULL;
2696	sep = PyUnicode_AS_UNICODE(separator);
2697	seplen = PyUnicode_GET_SIZE(separator);
2698    }
2699
2700    res = _PyUnicode_New(sz);
2701    if (res == NULL)
2702	goto onError;
2703    p = PyUnicode_AS_UNICODE(res);
2704    reslen = 0;
2705
2706    for (i = 0; i < seqlen; i++) {
2707	int itemlen;
2708	PyObject *item;
2709
2710	item = PySequence_GetItem(seq, i);
2711	if (item == NULL)
2712	    goto onError;
2713	if (!PyUnicode_Check(item)) {
2714	    PyObject *v;
2715	    v = PyUnicode_FromObject(item);
2716	    Py_DECREF(item);
2717	    item = v;
2718	    if (item == NULL)
2719		goto onError;
2720	}
2721	itemlen = PyUnicode_GET_SIZE(item);
2722	while (reslen + itemlen + seplen >= sz) {
2723	    if (_PyUnicode_Resize(res, sz*2))
2724		goto onError;
2725	    sz *= 2;
2726	    p = PyUnicode_AS_UNICODE(res) + reslen;
2727	}
2728	if (i > 0) {
2729	    memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2730	    p += seplen;
2731	    reslen += seplen;
2732	}
2733	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2734	p += itemlen;
2735	reslen += itemlen;
2736	Py_DECREF(item);
2737    }
2738    if (_PyUnicode_Resize(res, reslen))
2739	goto onError;
2740
2741    Py_XDECREF(separator);
2742    return (PyObject *)res;
2743
2744 onError:
2745    Py_XDECREF(separator);
2746    Py_DECREF(res);
2747    return NULL;
2748}
2749
2750static
2751PyUnicodeObject *pad(PyUnicodeObject *self,
2752		     int left,
2753		     int right,
2754		     Py_UNICODE fill)
2755{
2756    PyUnicodeObject *u;
2757
2758    if (left < 0)
2759        left = 0;
2760    if (right < 0)
2761        right = 0;
2762
2763    if (left == 0 && right == 0) {
2764        Py_INCREF(self);
2765        return self;
2766    }
2767
2768    u = _PyUnicode_New(left + self->length + right);
2769    if (u) {
2770        if (left)
2771            Py_UNICODE_FILL(u->str, fill, left);
2772        Py_UNICODE_COPY(u->str + left, self->str, self->length);
2773        if (right)
2774            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2775    }
2776
2777    return u;
2778}
2779
/* Append the slice data[left:right] as a new Unicode object to the
   list being built.  Expects the enclosing function to provide a
   PyObject *str scratch variable, a PyObject *list and an onError
   label, which is jumped to on failure.

   Wrapped in do { } while (0) so the expansion is a single
   statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)					\
    do {								\
	str = PyUnicode_FromUnicode((data) + (left),			\
				    (right) - (left));			\
	if (!str)							\
	    goto onError;						\
	if (PyList_Append(list, str)) {					\
	    Py_DECREF(str);						\
	    goto onError;						\
	}								\
	Py_DECREF(str);							\
    } while (0)
2790
2791static
2792PyObject *split_whitespace(PyUnicodeObject *self,
2793			   PyObject *list,
2794			   int maxcount)
2795{
2796    register int i;
2797    register int j;
2798    int len = self->length;
2799    PyObject *str;
2800
2801    for (i = j = 0; i < len; ) {
2802	/* find a token */
2803	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2804	    i++;
2805	j = i;
2806	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2807	    i++;
2808	if (j < i) {
2809	    if (maxcount-- <= 0)
2810		break;
2811	    SPLIT_APPEND(self->str, j, i);
2812	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2813		i++;
2814	    j = i;
2815	}
2816    }
2817    if (j < len) {
2818	SPLIT_APPEND(self->str, j, len);
2819    }
2820    return list;
2821
2822 onError:
2823    Py_DECREF(list);
2824    return NULL;
2825}
2826
2827PyObject *PyUnicode_Splitlines(PyObject *string,
2828			       int keepends)
2829{
2830    register int i;
2831    register int j;
2832    int len;
2833    PyObject *list;
2834    PyObject *str;
2835    Py_UNICODE *data;
2836
2837    string = PyUnicode_FromObject(string);
2838    if (string == NULL)
2839	return NULL;
2840    data = PyUnicode_AS_UNICODE(string);
2841    len = PyUnicode_GET_SIZE(string);
2842
2843    list = PyList_New(0);
2844    if (!list)
2845        goto onError;
2846
2847    for (i = j = 0; i < len; ) {
2848	int eol;
2849
2850	/* Find a line and append it */
2851	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2852	    i++;
2853
2854	/* Skip the line break reading CRLF as one line break */
2855	eol = i;
2856	if (i < len) {
2857	    if (data[i] == '\r' && i + 1 < len &&
2858		data[i+1] == '\n')
2859		i += 2;
2860	    else
2861		i++;
2862	    if (keepends)
2863		eol = i;
2864	}
2865	SPLIT_APPEND(data, j, eol);
2866	j = i;
2867    }
2868    if (j < len) {
2869	SPLIT_APPEND(data, j, len);
2870    }
2871
2872    Py_DECREF(string);
2873    return list;
2874
2875 onError:
2876    Py_DECREF(list);
2877    Py_DECREF(string);
2878    return NULL;
2879}
2880
2881static
2882PyObject *split_char(PyUnicodeObject *self,
2883		     PyObject *list,
2884		     Py_UNICODE ch,
2885		     int maxcount)
2886{
2887    register int i;
2888    register int j;
2889    int len = self->length;
2890    PyObject *str;
2891
2892    for (i = j = 0; i < len; ) {
2893	if (self->str[i] == ch) {
2894	    if (maxcount-- <= 0)
2895		break;
2896	    SPLIT_APPEND(self->str, j, i);
2897	    i = j = i + 1;
2898	} else
2899	    i++;
2900    }
2901    if (j <= len) {
2902	SPLIT_APPEND(self->str, j, len);
2903    }
2904    return list;
2905
2906 onError:
2907    Py_DECREF(list);
2908    return NULL;
2909}
2910
2911static
2912PyObject *split_substring(PyUnicodeObject *self,
2913			  PyObject *list,
2914			  PyUnicodeObject *substring,
2915			  int maxcount)
2916{
2917    register int i;
2918    register int j;
2919    int len = self->length;
2920    int sublen = substring->length;
2921    PyObject *str;
2922
2923    for (i = j = 0; i < len - sublen; ) {
2924	if (Py_UNICODE_MATCH(self, i, substring)) {
2925	    if (maxcount-- <= 0)
2926		break;
2927	    SPLIT_APPEND(self->str, j, i);
2928	    i = j = i + sublen;
2929	} else
2930	    i++;
2931    }
2932    if (j <= len) {
2933	SPLIT_APPEND(self->str, j, len);
2934    }
2935    return list;
2936
2937 onError:
2938    Py_DECREF(list);
2939    return NULL;
2940}
2941
2942#undef SPLIT_APPEND
2943
2944static
2945PyObject *split(PyUnicodeObject *self,
2946		PyUnicodeObject *substring,
2947		int maxcount)
2948{
2949    PyObject *list;
2950
2951    if (maxcount < 0)
2952        maxcount = INT_MAX;
2953
2954    list = PyList_New(0);
2955    if (!list)
2956        return NULL;
2957
2958    if (substring == NULL)
2959	return split_whitespace(self,list,maxcount);
2960
2961    else if (substring->length == 1)
2962	return split_char(self,list,substring->str[0],maxcount);
2963
2964    else if (substring->length == 0) {
2965	Py_DECREF(list);
2966	PyErr_SetString(PyExc_ValueError, "empty separator");
2967	return NULL;
2968    }
2969    else
2970	return split_substring(self,list,substring,maxcount);
2971}
2972
2973static
2974PyObject *strip(PyUnicodeObject *self,
2975		int left,
2976		int right)
2977{
2978    Py_UNICODE *p = self->str;
2979    int start = 0;
2980    int end = self->length;
2981
2982    if (left)
2983        while (start < end && Py_UNICODE_ISSPACE(p[start]))
2984            start++;
2985
2986    if (right)
2987        while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2988            end--;
2989
2990    if (start == 0 && end == self->length) {
2991        /* couldn't strip anything off, return original string */
2992        Py_INCREF(self);
2993        return (PyObject*) self;
2994    }
2995
2996    return (PyObject*) PyUnicode_FromUnicode(
2997        self->str + start,
2998        end - start
2999        );
3000}
3001
3002static
3003PyObject *replace(PyUnicodeObject *self,
3004		  PyUnicodeObject *str1,
3005		  PyUnicodeObject *str2,
3006		  int maxcount)
3007{
3008    PyUnicodeObject *u;
3009
3010    if (maxcount < 0)
3011	maxcount = INT_MAX;
3012
3013    if (str1->length == 1 && str2->length == 1) {
3014        int i;
3015
3016        /* replace characters */
3017        if (!findchar(self->str, self->length, str1->str[0])) {
3018            /* nothing to replace, return original string */
3019            Py_INCREF(self);
3020            u = self;
3021        } else {
3022	    Py_UNICODE u1 = str1->str[0];
3023	    Py_UNICODE u2 = str2->str[0];
3024
3025            u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3026                self->str,
3027                self->length
3028                );
3029            if (u)
3030                for (i = 0; i < u->length; i++)
3031                    if (u->str[i] == u1) {
3032                        if (--maxcount < 0)
3033                            break;
3034                        u->str[i] = u2;
3035                    }
3036        }
3037
3038    } else {
3039        int n, i;
3040        Py_UNICODE *p;
3041
3042        /* replace strings */
3043        n = count(self, 0, self->length, str1);
3044        if (n > maxcount)
3045            n = maxcount;
3046        if (n == 0) {
3047            /* nothing to replace, return original string */
3048            Py_INCREF(self);
3049            u = self;
3050        } else {
3051            u = _PyUnicode_New(
3052                self->length + n * (str2->length - str1->length));
3053            if (u) {
3054                i = 0;
3055                p = u->str;
3056                while (i <= self->length - str1->length)
3057                    if (Py_UNICODE_MATCH(self, i, str1)) {
3058                        /* replace string segment */
3059                        Py_UNICODE_COPY(p, str2->str, str2->length);
3060                        p += str2->length;
3061                        i += str1->length;
3062                        if (--n <= 0) {
3063                            /* copy remaining part */
3064                            Py_UNICODE_COPY(p, self->str+i, self->length-i);
3065                            break;
3066                        }
3067                    } else
3068                        *p++ = self->str[i++];
3069            }
3070        }
3071    }
3072
3073    return (PyObject *) u;
3074}
3075
3076/* --- Unicode Object Methods --------------------------------------------- */
3077
3078static char title__doc__[] =
3079"S.title() -> unicode\n\
3080\n\
3081Return a titlecased version of S, i.e. words start with title case\n\
3082characters, all remaining cased characters have lower case.";
3083
3084static PyObject*
3085unicode_title(PyUnicodeObject *self, PyObject *args)
3086{
3087    if (!PyArg_NoArgs(args))
3088        return NULL;
3089    return fixup(self, fixtitle);
3090}
3091
3092static char capitalize__doc__[] =
3093"S.capitalize() -> unicode\n\
3094\n\
3095Return a capitalized version of S, i.e. make the first character\n\
3096have upper case.";
3097
3098static PyObject*
3099unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3100{
3101    if (!PyArg_NoArgs(args))
3102        return NULL;
3103    return fixup(self, fixcapitalize);
3104}
3105
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split self at
   whitespace, capitalize every word and join the words again with
   single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *old = PyList_GET_ITEM(words, i);
        PyObject *fixed = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (fixed == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, i, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
3146
3147static char center__doc__[] =
3148"S.center(width) -> unicode\n\
3149\n\
3150Return S centered in a Unicode string of length width. Padding is done\n\
3151using spaces.";
3152
3153static PyObject *
3154unicode_center(PyUnicodeObject *self, PyObject *args)
3155{
3156    int marg, left;
3157    int width;
3158
3159    if (!PyArg_ParseTuple(args, "i:center", &width))
3160        return NULL;
3161
3162    if (self->length >= width) {
3163        Py_INCREF(self);
3164        return (PyObject*) self;
3165    }
3166
3167    marg = width - self->length;
3168    left = marg / 2 + (marg & width & 1);
3169
3170    return (PyObject*) pad(self, left, marg - left, ' ');
3171}
3172
3173#if 0
3174
3175/* This code should go into some future Unicode collation support
3176   module. The basic comparison should compare ordinals on a naive
3177   basis (this is what Java does and thus JPython too). */
3178
3179/* speedy UTF-16 code point order comparison */
3180/* gleaned from: */
3181/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3182
3183static short utf16Fixup[32] =
3184{
3185    0, 0, 0, 0, 0, 0, 0, 0,
3186    0, 0, 0, 0, 0, 0, 0, 0,
3187    0, 0, 0, 0, 0, 0, 0, 0,
3188    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3189};
3190
3191static int
3192unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3193{
3194    int len1, len2;
3195
3196    Py_UNICODE *s1 = str1->str;
3197    Py_UNICODE *s2 = str2->str;
3198
3199    len1 = str1->length;
3200    len2 = str2->length;
3201
3202    while (len1 > 0 && len2 > 0) {
3203        Py_UNICODE c1, c2;
3204	long diff;
3205
3206        c1 = *s1++;
3207        c2 = *s2++;
3208	if (c1 > (1<<11) * 26)
3209	    c1 += utf16Fixup[c1>>11];
3210	if (c2 > (1<<11) * 26)
3211            c2 += utf16Fixup[c2>>11];
3212
3213        /* now c1 and c2 are in UTF-32-compatible order */
3214        diff = (long)c1 - (long)c2;
3215        if (diff)
3216            return (diff < 0) ? -1 : (diff != 0);
3217        len1--; len2--;
3218    }
3219
3220    return (len1 < len2) ? -1 : (len1 != len2);
3221}
3222
3223#else
3224
3225static int
3226unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3227{
3228    register int len1, len2;
3229
3230    Py_UNICODE *s1 = str1->str;
3231    Py_UNICODE *s2 = str2->str;
3232
3233    len1 = str1->length;
3234    len2 = str2->length;
3235
3236    while (len1 > 0 && len2 > 0) {
3237	register long diff;
3238
3239        diff = (long)*s1++ - (long)*s2++;
3240        if (diff)
3241            return (diff < 0) ? -1 : (diff != 0);
3242        len1--; len2--;
3243    }
3244
3245    return (len1 < len2) ? -1 : (len1 != len2);
3246}
3247
3248#endif
3249
3250int PyUnicode_Compare(PyObject *left,
3251		      PyObject *right)
3252{
3253    PyUnicodeObject *u = NULL, *v = NULL;
3254    int result;
3255
3256    /* Coerce the two arguments */
3257    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3258    if (u == NULL)
3259	goto onError;
3260    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3261    if (v == NULL)
3262	goto onError;
3263
3264    /* Shortcut for empty or interned objects */
3265    if (v == u) {
3266	Py_DECREF(u);
3267	Py_DECREF(v);
3268	return 0;
3269    }
3270
3271    result = unicode_compare(u, v);
3272
3273    Py_DECREF(u);
3274    Py_DECREF(v);
3275    return result;
3276
3277onError:
3278    Py_XDECREF(u);
3279    Py_XDECREF(v);
3280    return -1;
3281}
3282
3283int PyUnicode_Contains(PyObject *container,
3284		       PyObject *element)
3285{
3286    PyUnicodeObject *u = NULL, *v = NULL;
3287    int result;
3288    register const Py_UNICODE *p, *e;
3289    register Py_UNICODE ch;
3290
3291    /* Coerce the two arguments */
3292    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3293    if (v == NULL) {
3294	PyErr_SetString(PyExc_TypeError,
3295	    "'in <string>' requires character as left operand");
3296	goto onError;
3297    }
3298    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3299    if (u == NULL) {
3300	Py_DECREF(v);
3301	goto onError;
3302    }
3303
3304    /* Check v in u */
3305    if (PyUnicode_GET_SIZE(v) != 1) {
3306	PyErr_SetString(PyExc_TypeError,
3307	    "'in <string>' requires character as left operand");
3308	goto onError;
3309    }
3310    ch = *PyUnicode_AS_UNICODE(v);
3311    p = PyUnicode_AS_UNICODE(u);
3312    e = p + PyUnicode_GET_SIZE(u);
3313    result = 0;
3314    while (p < e) {
3315	if (*p++ == ch) {
3316	    result = 1;
3317	    break;
3318	}
3319    }
3320
3321    Py_DECREF(u);
3322    Py_DECREF(v);
3323    return result;
3324
3325onError:
3326    Py_XDECREF(u);
3327    Py_XDECREF(v);
3328    return -1;
3329}
3330
3331/* Concat to string or Unicode object giving a new Unicode object. */
3332
3333PyObject *PyUnicode_Concat(PyObject *left,
3334			   PyObject *right)
3335{
3336    PyUnicodeObject *u = NULL, *v = NULL, *w;
3337
3338    /* Coerce the two arguments */
3339    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3340    if (u == NULL)
3341	goto onError;
3342    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3343    if (v == NULL)
3344	goto onError;
3345
3346    /* Shortcuts */
3347    if (v == unicode_empty) {
3348	Py_DECREF(v);
3349	return (PyObject *)u;
3350    }
3351    if (u == unicode_empty) {
3352	Py_DECREF(u);
3353	return (PyObject *)v;
3354    }
3355
3356    /* Concat the two Unicode strings */
3357    w = _PyUnicode_New(u->length + v->length);
3358    if (w == NULL)
3359	goto onError;
3360    Py_UNICODE_COPY(w->str, u->str, u->length);
3361    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3362
3363    Py_DECREF(u);
3364    Py_DECREF(v);
3365    return (PyObject *)w;
3366
3367onError:
3368    Py_XDECREF(u);
3369    Py_XDECREF(v);
3370    return NULL;
3371}
3372
3373static char count__doc__[] =
3374"S.count(sub[, start[, end]]) -> int\n\
3375\n\
3376Return the number of occurrences of substring sub in Unicode string\n\
3377S[start:end].  Optional arguments start and end are\n\
3378interpreted as in slice notation.";
3379
3380static PyObject *
3381unicode_count(PyUnicodeObject *self, PyObject *args)
3382{
3383    PyUnicodeObject *substring;
3384    int start = 0;
3385    int end = INT_MAX;
3386    PyObject *result;
3387
3388    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3389		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3390        return NULL;
3391
3392    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3393						(PyObject *)substring);
3394    if (substring == NULL)
3395	return NULL;
3396
3397    if (start < 0)
3398        start += self->length;
3399    if (start < 0)
3400        start = 0;
3401    if (end > self->length)
3402        end = self->length;
3403    if (end < 0)
3404        end += self->length;
3405    if (end < 0)
3406        end = 0;
3407
3408    result = PyInt_FromLong((long) count(self, start, end, substring));
3409
3410    Py_DECREF(substring);
3411    return result;
3412}
3413
3414static char encode__doc__[] =
3415"S.encode([encoding[,errors]]) -> string\n\
3416\n\
3417Return an encoded string version of S. Default encoding is the current\n\
3418default string encoding. errors may be given to set a different error\n\
3419handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3420a ValueError. Other possible values are 'ignore' and 'replace'.";
3421
3422static PyObject *
3423unicode_encode(PyUnicodeObject *self, PyObject *args)
3424{
3425    char *encoding = NULL;
3426    char *errors = NULL;
3427    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3428        return NULL;
3429    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3430}
3431
3432static char expandtabs__doc__[] =
3433"S.expandtabs([tabsize]) -> unicode\n\
3434\n\
3435Return a copy of S where all tab characters are expanded using spaces.\n\
3436If tabsize is not given, a tab size of 8 characters is assumed.";
3437
3438static PyObject*
3439unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3440{
3441    Py_UNICODE *e;
3442    Py_UNICODE *p;
3443    Py_UNICODE *q;
3444    int i, j;
3445    PyUnicodeObject *u;
3446    int tabsize = 8;
3447
3448    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3449	return NULL;
3450
3451    /* First pass: determine size of output string */
3452    i = j = 0;
3453    e = self->str + self->length;
3454    for (p = self->str; p < e; p++)
3455        if (*p == '\t') {
3456	    if (tabsize > 0)
3457		j += tabsize - (j % tabsize);
3458	}
3459        else {
3460            j++;
3461            if (*p == '\n' || *p == '\r') {
3462                i += j;
3463                j = 0;
3464            }
3465        }
3466
3467    /* Second pass: create output string and fill it */
3468    u = _PyUnicode_New(i + j);
3469    if (!u)
3470        return NULL;
3471
3472    j = 0;
3473    q = u->str;
3474
3475    for (p = self->str; p < e; p++)
3476        if (*p == '\t') {
3477	    if (tabsize > 0) {
3478		i = tabsize - (j % tabsize);
3479		j += i;
3480		while (i--)
3481		    *q++ = ' ';
3482	    }
3483	}
3484	else {
3485            j++;
3486	    *q++ = *p;
3487            if (*p == '\n' || *p == '\r')
3488                j = 0;
3489        }
3490
3491    return (PyObject*) u;
3492}
3493
3494static char find__doc__[] =
3495"S.find(sub [,start [,end]]) -> int\n\
3496\n\
3497Return the lowest index in S where substring sub is found,\n\
3498such that sub is contained within s[start,end].  Optional\n\
3499arguments start and end are interpreted as in slice notation.\n\
3500\n\
3501Return -1 on failure.";
3502
3503static PyObject *
3504unicode_find(PyUnicodeObject *self, PyObject *args)
3505{
3506    PyUnicodeObject *substring;
3507    int start = 0;
3508    int end = INT_MAX;
3509    PyObject *result;
3510
3511    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3512		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3513        return NULL;
3514    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3515						(PyObject *)substring);
3516    if (substring == NULL)
3517	return NULL;
3518
3519    result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3520
3521    Py_DECREF(substring);
3522    return result;
3523}
3524
3525static PyObject *
3526unicode_getitem(PyUnicodeObject *self, int index)
3527{
3528    if (index < 0 || index >= self->length) {
3529        PyErr_SetString(PyExc_IndexError, "string index out of range");
3530        return NULL;
3531    }
3532
3533    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3534}
3535
3536static long
3537unicode_hash(PyUnicodeObject *self)
3538{
3539    /* Since Unicode objects compare equal to their ASCII string
3540       counterparts, they should use the individual character values
3541       as basis for their hash value.  This is needed to assure that
3542       strings and Unicode objects behave in the same way as
3543       dictionary keys. */
3544
3545    register int len;
3546    register Py_UNICODE *p;
3547    register long x;
3548
3549    if (self->hash != -1)
3550	return self->hash;
3551    len = PyUnicode_GET_SIZE(self);
3552    p = PyUnicode_AS_UNICODE(self);
3553    x = *p << 7;
3554    while (--len >= 0)
3555	x = (1000003*x) ^ *p++;
3556    x ^= PyUnicode_GET_SIZE(self);
3557    if (x == -1)
3558	x = -2;
3559    self->hash = x;
3560    return x;
3561}
3562
3563static char index__doc__[] =
3564"S.index(sub [,start [,end]]) -> int\n\
3565\n\
3566Like S.find() but raise ValueError when the substring is not found.";
3567
3568static PyObject *
3569unicode_index(PyUnicodeObject *self, PyObject *args)
3570{
3571    int result;
3572    PyUnicodeObject *substring;
3573    int start = 0;
3574    int end = INT_MAX;
3575
3576    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3577		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3578        return NULL;
3579
3580    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3581						(PyObject *)substring);
3582    if (substring == NULL)
3583	return NULL;
3584
3585    result = findstring(self, substring, start, end, 1);
3586
3587    Py_DECREF(substring);
3588    if (result < 0) {
3589        PyErr_SetString(PyExc_ValueError, "substring not found");
3590        return NULL;
3591    }
3592    return PyInt_FromLong(result);
3593}
3594
3595static char islower__doc__[] =
3596"S.islower() -> int\n\
3597\n\
3598Return 1 if  all cased characters in S are lowercase and there is\n\
3599at least one cased character in S, 0 otherwise.";
3600
3601static PyObject*
3602unicode_islower(PyUnicodeObject *self, PyObject *args)
3603{
3604    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3605    register const Py_UNICODE *e;
3606    int cased;
3607
3608    if (!PyArg_NoArgs(args))
3609        return NULL;
3610
3611    /* Shortcut for single character strings */
3612    if (PyUnicode_GET_SIZE(self) == 1)
3613	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3614
3615    /* Special case for empty strings */
3616    if (PyString_GET_SIZE(self) == 0)
3617	return PyInt_FromLong(0);
3618
3619    e = p + PyUnicode_GET_SIZE(self);
3620    cased = 0;
3621    for (; p < e; p++) {
3622	register const Py_UNICODE ch = *p;
3623
3624	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3625	    return PyInt_FromLong(0);
3626	else if (!cased && Py_UNICODE_ISLOWER(ch))
3627	    cased = 1;
3628    }
3629    return PyInt_FromLong(cased);
3630}
3631
3632static char isupper__doc__[] =
3633"S.isupper() -> int\n\
3634\n\
3635Return 1 if  all cased characters in S are uppercase and there is\n\
3636at least one cased character in S, 0 otherwise.";
3637
3638static PyObject*
3639unicode_isupper(PyUnicodeObject *self, PyObject *args)
3640{
3641    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3642    register const Py_UNICODE *e;
3643    int cased;
3644
3645    if (!PyArg_NoArgs(args))
3646        return NULL;
3647
3648    /* Shortcut for single character strings */
3649    if (PyUnicode_GET_SIZE(self) == 1)
3650	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3651
3652    /* Special case for empty strings */
3653    if (PyString_GET_SIZE(self) == 0)
3654	return PyInt_FromLong(0);
3655
3656    e = p + PyUnicode_GET_SIZE(self);
3657    cased = 0;
3658    for (; p < e; p++) {
3659	register const Py_UNICODE ch = *p;
3660
3661	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3662	    return PyInt_FromLong(0);
3663	else if (!cased && Py_UNICODE_ISUPPER(ch))
3664	    cased = 1;
3665    }
3666    return PyInt_FromLong(cased);
3667}
3668
3669static char istitle__doc__[] =
3670"S.istitle() -> int\n\
3671\n\
3672Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3673may only follow uncased characters and lowercase characters only cased\n\
3674ones. Return 0 otherwise.";
3675
3676static PyObject*
3677unicode_istitle(PyUnicodeObject *self, PyObject *args)
3678{
3679    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3680    register const Py_UNICODE *e;
3681    int cased, previous_is_cased;
3682
3683    if (!PyArg_NoArgs(args))
3684        return NULL;
3685
3686    /* Shortcut for single character strings */
3687    if (PyUnicode_GET_SIZE(self) == 1)
3688	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3689			      (Py_UNICODE_ISUPPER(*p) != 0));
3690
3691    /* Special case for empty strings */
3692    if (PyString_GET_SIZE(self) == 0)
3693	return PyInt_FromLong(0);
3694
3695    e = p + PyUnicode_GET_SIZE(self);
3696    cased = 0;
3697    previous_is_cased = 0;
3698    for (; p < e; p++) {
3699	register const Py_UNICODE ch = *p;
3700
3701	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3702	    if (previous_is_cased)
3703		return PyInt_FromLong(0);
3704	    previous_is_cased = 1;
3705	    cased = 1;
3706	}
3707	else if (Py_UNICODE_ISLOWER(ch)) {
3708	    if (!previous_is_cased)
3709		return PyInt_FromLong(0);
3710	    previous_is_cased = 1;
3711	    cased = 1;
3712	}
3713	else
3714	    previous_is_cased = 0;
3715    }
3716    return PyInt_FromLong(cased);
3717}
3718
3719static char isspace__doc__[] =
3720"S.isspace() -> int\n\
3721\n\
3722Return 1 if there are only whitespace characters in S,\n\
37230 otherwise.";
3724
3725static PyObject*
3726unicode_isspace(PyUnicodeObject *self, PyObject *args)
3727{
3728    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3729    register const Py_UNICODE *e;
3730
3731    if (!PyArg_NoArgs(args))
3732        return NULL;
3733
3734    /* Shortcut for single character strings */
3735    if (PyUnicode_GET_SIZE(self) == 1 &&
3736	Py_UNICODE_ISSPACE(*p))
3737	return PyInt_FromLong(1);
3738
3739    /* Special case for empty strings */
3740    if (PyString_GET_SIZE(self) == 0)
3741	return PyInt_FromLong(0);
3742
3743    e = p + PyUnicode_GET_SIZE(self);
3744    for (; p < e; p++) {
3745	if (!Py_UNICODE_ISSPACE(*p))
3746	    return PyInt_FromLong(0);
3747    }
3748    return PyInt_FromLong(1);
3749}
3750
3751static char isalpha__doc__[] =
3752"S.isalpha() -> int\n\
3753\n\
3754Return 1 if  all characters in S are alphabetic\n\
3755and there is at least one character in S, 0 otherwise.";
3756
3757static PyObject*
3758unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3759{
3760    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3761    register const Py_UNICODE *e;
3762
3763    if (!PyArg_NoArgs(args))
3764        return NULL;
3765
3766    /* Shortcut for single character strings */
3767    if (PyUnicode_GET_SIZE(self) == 1 &&
3768	Py_UNICODE_ISALPHA(*p))
3769	return PyInt_FromLong(1);
3770
3771    /* Special case for empty strings */
3772    if (PyString_GET_SIZE(self) == 0)
3773	return PyInt_FromLong(0);
3774
3775    e = p + PyUnicode_GET_SIZE(self);
3776    for (; p < e; p++) {
3777	if (!Py_UNICODE_ISALPHA(*p))
3778	    return PyInt_FromLong(0);
3779    }
3780    return PyInt_FromLong(1);
3781}
3782
3783static char isalnum__doc__[] =
3784"S.isalnum() -> int\n\
3785\n\
3786Return 1 if  all characters in S are alphanumeric\n\
3787and there is at least one character in S, 0 otherwise.";
3788
3789static PyObject*
3790unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3791{
3792    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3793    register const Py_UNICODE *e;
3794
3795    if (!PyArg_NoArgs(args))
3796        return NULL;
3797
3798    /* Shortcut for single character strings */
3799    if (PyUnicode_GET_SIZE(self) == 1 &&
3800	Py_UNICODE_ISALNUM(*p))
3801	return PyInt_FromLong(1);
3802
3803    /* Special case for empty strings */
3804    if (PyString_GET_SIZE(self) == 0)
3805	return PyInt_FromLong(0);
3806
3807    e = p + PyUnicode_GET_SIZE(self);
3808    for (; p < e; p++) {
3809	if (!Py_UNICODE_ISALNUM(*p))
3810	    return PyInt_FromLong(0);
3811    }
3812    return PyInt_FromLong(1);
3813}
3814
3815static char isdecimal__doc__[] =
3816"S.isdecimal() -> int\n\
3817\n\
3818Return 1 if there are only decimal characters in S,\n\
38190 otherwise.";
3820
3821static PyObject*
3822unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3823{
3824    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3825    register const Py_UNICODE *e;
3826
3827    if (!PyArg_NoArgs(args))
3828        return NULL;
3829
3830    /* Shortcut for single character strings */
3831    if (PyUnicode_GET_SIZE(self) == 1 &&
3832	Py_UNICODE_ISDECIMAL(*p))
3833	return PyInt_FromLong(1);
3834
3835    /* Special case for empty strings */
3836    if (PyString_GET_SIZE(self) == 0)
3837	return PyInt_FromLong(0);
3838
3839    e = p + PyUnicode_GET_SIZE(self);
3840    for (; p < e; p++) {
3841	if (!Py_UNICODE_ISDECIMAL(*p))
3842	    return PyInt_FromLong(0);
3843    }
3844    return PyInt_FromLong(1);
3845}
3846
3847static char isdigit__doc__[] =
3848"S.isdigit() -> int\n\
3849\n\
3850Return 1 if there are only digit characters in S,\n\
38510 otherwise.";
3852
3853static PyObject*
3854unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3855{
3856    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3857    register const Py_UNICODE *e;
3858
3859    if (!PyArg_NoArgs(args))
3860        return NULL;
3861
3862    /* Shortcut for single character strings */
3863    if (PyUnicode_GET_SIZE(self) == 1 &&
3864	Py_UNICODE_ISDIGIT(*p))
3865	return PyInt_FromLong(1);
3866
3867    /* Special case for empty strings */
3868    if (PyString_GET_SIZE(self) == 0)
3869	return PyInt_FromLong(0);
3870
3871    e = p + PyUnicode_GET_SIZE(self);
3872    for (; p < e; p++) {
3873	if (!Py_UNICODE_ISDIGIT(*p))
3874	    return PyInt_FromLong(0);
3875    }
3876    return PyInt_FromLong(1);
3877}
3878
3879static char isnumeric__doc__[] =
3880"S.isnumeric() -> int\n\
3881\n\
3882Return 1 if there are only numeric characters in S,\n\
38830 otherwise.";
3884
3885static PyObject*
3886unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3887{
3888    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3889    register const Py_UNICODE *e;
3890
3891    if (!PyArg_NoArgs(args))
3892        return NULL;
3893
3894    /* Shortcut for single character strings */
3895    if (PyUnicode_GET_SIZE(self) == 1 &&
3896	Py_UNICODE_ISNUMERIC(*p))
3897	return PyInt_FromLong(1);
3898
3899    /* Special case for empty strings */
3900    if (PyString_GET_SIZE(self) == 0)
3901	return PyInt_FromLong(0);
3902
3903    e = p + PyUnicode_GET_SIZE(self);
3904    for (; p < e; p++) {
3905	if (!Py_UNICODE_ISNUMERIC(*p))
3906	    return PyInt_FromLong(0);
3907    }
3908    return PyInt_FromLong(1);
3909}
3910
3911static char join__doc__[] =
3912"S.join(sequence) -> unicode\n\
3913\n\
3914Return a string which is the concatenation of the strings in the\n\
3915sequence.  The separator between elements is S.";
3916
3917static PyObject*
3918unicode_join(PyUnicodeObject *self, PyObject *args)
3919{
3920    PyObject *data;
3921    if (!PyArg_ParseTuple(args, "O:join", &data))
3922        return NULL;
3923
3924    return PyUnicode_Join((PyObject *)self, data);
3925}
3926
3927static int
3928unicode_length(PyUnicodeObject *self)
3929{
3930    return self->length;
3931}
3932
3933static char ljust__doc__[] =
3934"S.ljust(width) -> unicode\n\
3935\n\
3936Return S left justified in a Unicode string of length width. Padding is\n\
3937done using spaces.";
3938
3939static PyObject *
3940unicode_ljust(PyUnicodeObject *self, PyObject *args)
3941{
3942    int width;
3943    if (!PyArg_ParseTuple(args, "i:ljust", &width))
3944        return NULL;
3945
3946    if (self->length >= width) {
3947        Py_INCREF(self);
3948        return (PyObject*) self;
3949    }
3950
3951    return (PyObject*) pad(self, 0, width - self->length, ' ');
3952}
3953
3954static char lower__doc__[] =
3955"S.lower() -> unicode\n\
3956\n\
3957Return a copy of the string S converted to lowercase.";
3958
3959static PyObject*
3960unicode_lower(PyUnicodeObject *self, PyObject *args)
3961{
3962    if (!PyArg_NoArgs(args))
3963        return NULL;
3964    return fixup(self, fixlower);
3965}
3966
3967static char lstrip__doc__[] =
3968"S.lstrip() -> unicode\n\
3969\n\
3970Return a copy of the string S with leading whitespace removed.";
3971
3972static PyObject *
3973unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3974{
3975    if (!PyArg_NoArgs(args))
3976        return NULL;
3977    return strip(self, 1, 0);
3978}
3979
3980static PyObject*
3981unicode_repeat(PyUnicodeObject *str, int len)
3982{
3983    PyUnicodeObject *u;
3984    Py_UNICODE *p;
3985
3986    if (len < 0)
3987        len = 0;
3988
3989    if (len == 1) {
3990        /* no repeat, return original string */
3991        Py_INCREF(str);
3992        return (PyObject*) str;
3993    }
3994
3995    u = _PyUnicode_New(len * str->length);
3996    if (!u)
3997        return NULL;
3998
3999    p = u->str;
4000
4001    while (len-- > 0) {
4002        Py_UNICODE_COPY(p, str->str, str->length);
4003        p += str->length;
4004    }
4005
4006    return (PyObject*) u;
4007}
4008
4009PyObject *PyUnicode_Replace(PyObject *obj,
4010			    PyObject *subobj,
4011			    PyObject *replobj,
4012			    int maxcount)
4013{
4014    PyObject *self;
4015    PyObject *str1;
4016    PyObject *str2;
4017    PyObject *result;
4018
4019    self = PyUnicode_FromObject(obj);
4020    if (self == NULL)
4021	return NULL;
4022    str1 = PyUnicode_FromObject(subobj);
4023    if (str1 == NULL) {
4024	Py_DECREF(self);
4025	return NULL;
4026    }
4027    str2 = PyUnicode_FromObject(replobj);
4028    if (str2 == NULL) {
4029	Py_DECREF(self);
4030	Py_DECREF(str1);
4031	return NULL;
4032    }
4033    result = replace((PyUnicodeObject *)self,
4034		     (PyUnicodeObject *)str1,
4035		     (PyUnicodeObject *)str2,
4036		     maxcount);
4037    Py_DECREF(self);
4038    Py_DECREF(str1);
4039    Py_DECREF(str2);
4040    return result;
4041}
4042
4043static char replace__doc__[] =
4044"S.replace (old, new[, maxsplit]) -> unicode\n\
4045\n\
4046Return a copy of S with all occurrences of substring\n\
4047old replaced by new.  If the optional argument maxsplit is\n\
4048given, only the first maxsplit occurrences are replaced.";
4049
4050static PyObject*
4051unicode_replace(PyUnicodeObject *self, PyObject *args)
4052{
4053    PyUnicodeObject *str1;
4054    PyUnicodeObject *str2;
4055    int maxcount = -1;
4056    PyObject *result;
4057
4058    if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4059        return NULL;
4060    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4061    if (str1 == NULL)
4062	return NULL;
4063    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4064    if (str2 == NULL)
4065	return NULL;
4066
4067    result = replace(self, str1, str2, maxcount);
4068
4069    Py_DECREF(str1);
4070    Py_DECREF(str2);
4071    return result;
4072}
4073
4074static
4075PyObject *unicode_repr(PyObject *unicode)
4076{
4077    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4078				PyUnicode_GET_SIZE(unicode),
4079				1);
4080}
4081
4082static char rfind__doc__[] =
4083"S.rfind(sub [,start [,end]]) -> int\n\
4084\n\
4085Return the highest index in S where substring sub is found,\n\
4086such that sub is contained within s[start,end].  Optional\n\
4087arguments start and end are interpreted as in slice notation.\n\
4088\n\
4089Return -1 on failure.";
4090
4091static PyObject *
4092unicode_rfind(PyUnicodeObject *self, PyObject *args)
4093{
4094    PyUnicodeObject *substring;
4095    int start = 0;
4096    int end = INT_MAX;
4097    PyObject *result;
4098
4099    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4100		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4101        return NULL;
4102    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4103						(PyObject *)substring);
4104    if (substring == NULL)
4105	return NULL;
4106
4107    result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4108
4109    Py_DECREF(substring);
4110    return result;
4111}
4112
4113static char rindex__doc__[] =
4114"S.rindex(sub [,start [,end]]) -> int\n\
4115\n\
4116Like S.rfind() but raise ValueError when the substring is not found.";
4117
4118static PyObject *
4119unicode_rindex(PyUnicodeObject *self, PyObject *args)
4120{
4121    int result;
4122    PyUnicodeObject *substring;
4123    int start = 0;
4124    int end = INT_MAX;
4125
4126    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4127		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4128        return NULL;
4129    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4130						(PyObject *)substring);
4131    if (substring == NULL)
4132	return NULL;
4133
4134    result = findstring(self, substring, start, end, -1);
4135
4136    Py_DECREF(substring);
4137    if (result < 0) {
4138        PyErr_SetString(PyExc_ValueError, "substring not found");
4139        return NULL;
4140    }
4141    return PyInt_FromLong(result);
4142}
4143
4144static char rjust__doc__[] =
4145"S.rjust(width) -> unicode\n\
4146\n\
4147Return S right justified in a Unicode string of length width. Padding is\n\
4148done using spaces.";
4149
4150static PyObject *
4151unicode_rjust(PyUnicodeObject *self, PyObject *args)
4152{
4153    int width;
4154    if (!PyArg_ParseTuple(args, "i:rjust", &width))
4155        return NULL;
4156
4157    if (self->length >= width) {
4158        Py_INCREF(self);
4159        return (PyObject*) self;
4160    }
4161
4162    return (PyObject*) pad(self, width - self->length, 0, ' ');
4163}
4164
4165static char rstrip__doc__[] =
4166"S.rstrip() -> unicode\n\
4167\n\
4168Return a copy of the string S with trailing whitespace removed.";
4169
4170static PyObject *
4171unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4172{
4173    if (!PyArg_NoArgs(args))
4174        return NULL;
4175    return strip(self, 0, 1);
4176}
4177
4178static PyObject*
4179unicode_slice(PyUnicodeObject *self, int start, int end)
4180{
4181    /* standard clamping */
4182    if (start < 0)
4183        start = 0;
4184    if (end < 0)
4185        end = 0;
4186    if (end > self->length)
4187        end = self->length;
4188    if (start == 0 && end == self->length) {
4189        /* full slice, return original string */
4190        Py_INCREF(self);
4191        return (PyObject*) self;
4192    }
4193    if (start > end)
4194        start = end;
4195    /* copy slice */
4196    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4197					     end - start);
4198}
4199
4200PyObject *PyUnicode_Split(PyObject *s,
4201			  PyObject *sep,
4202			  int maxsplit)
4203{
4204    PyObject *result;
4205
4206    s = PyUnicode_FromObject(s);
4207    if (s == NULL)
4208	return NULL;
4209    if (sep != NULL) {
4210	sep = PyUnicode_FromObject(sep);
4211	if (sep == NULL) {
4212	    Py_DECREF(s);
4213	    return NULL;
4214	}
4215    }
4216
4217    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4218
4219    Py_DECREF(s);
4220    Py_XDECREF(sep);
4221    return result;
4222}
4223
4224static char split__doc__[] =
4225"S.split([sep [,maxsplit]]) -> list of strings\n\
4226\n\
4227Return a list of the words in S, using sep as the\n\
4228delimiter string.  If maxsplit is given, at most maxsplit\n\
4229splits are done. If sep is not specified, any whitespace string\n\
4230is a separator.";
4231
4232static PyObject*
4233unicode_split(PyUnicodeObject *self, PyObject *args)
4234{
4235    PyObject *substring = Py_None;
4236    int maxcount = -1;
4237
4238    if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4239        return NULL;
4240
4241    if (substring == Py_None)
4242	return split(self, NULL, maxcount);
4243    else if (PyUnicode_Check(substring))
4244	return split(self, (PyUnicodeObject *)substring, maxcount);
4245    else
4246	return PyUnicode_Split((PyObject *)self, substring, maxcount);
4247}
4248
4249static char splitlines__doc__[] =
4250"S.splitlines([keepends]]) -> list of strings\n\
4251\n\
4252Return a list of the lines in S, breaking at line boundaries.\n\
4253Line breaks are not included in the resulting list unless keepends\n\
4254is given and true.";
4255
4256static PyObject*
4257unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4258{
4259    int keepends = 0;
4260
4261    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4262        return NULL;
4263
4264    return PyUnicode_Splitlines((PyObject *)self, keepends);
4265}
4266
4267static
4268PyObject *unicode_str(PyUnicodeObject *self)
4269{
4270    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4271}
4272
4273static char strip__doc__[] =
4274"S.strip() -> unicode\n\
4275\n\
4276Return a copy of S with leading and trailing whitespace removed.";
4277
4278static PyObject *
4279unicode_strip(PyUnicodeObject *self, PyObject *args)
4280{
4281    if (!PyArg_NoArgs(args))
4282        return NULL;
4283    return strip(self, 1, 1);
4284}
4285
4286static char swapcase__doc__[] =
4287"S.swapcase() -> unicode\n\
4288\n\
4289Return a copy of S with uppercase characters converted to lowercase\n\
4290and vice versa.";
4291
4292static PyObject*
4293unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4294{
4295    if (!PyArg_NoArgs(args))
4296        return NULL;
4297    return fixup(self, fixswapcase);
4298}
4299
4300static char translate__doc__[] =
4301"S.translate(table) -> unicode\n\
4302\n\
4303Return a copy of the string S, where all characters have been mapped\n\
4304through the given translation table, which must be a mapping of\n\
4305Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4306are left untouched. Characters mapped to None are deleted.";
4307
4308static PyObject*
4309unicode_translate(PyUnicodeObject *self, PyObject *args)
4310{
4311    PyObject *table;
4312
4313    if (!PyArg_ParseTuple(args, "O:translate", &table))
4314	return NULL;
4315    return PyUnicode_TranslateCharmap(self->str,
4316				      self->length,
4317				      table,
4318				      "ignore");
4319}
4320
4321static char upper__doc__[] =
4322"S.upper() -> unicode\n\
4323\n\
4324Return a copy of S converted to uppercase.";
4325
4326static PyObject*
4327unicode_upper(PyUnicodeObject *self, PyObject *args)
4328{
4329    if (!PyArg_NoArgs(args))
4330        return NULL;
4331    return fixup(self, fixupper);
4332}
4333
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Implements S.zfill(width) (currently compiled out).

   If S is already at least width long it is returned unchanged with a
   new reference.  Otherwise S is left-padded with '0' via pad(); if
   the first character of the original string was '+' or '-', the sign
   is moved in front of the padding.  Returns NULL when argument
   parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad()'s result is dereferenced below, so guard against failure */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4369
#if 0
/* Debugging aid (compiled out): returns the current value of
   unicode_freelist_size as a Python int.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);

    return NULL;
}
#endif
4379
4380static char startswith__doc__[] =
4381"S.startswith(prefix[, start[, end]]) -> int\n\
4382\n\
4383Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4384optional start, test S beginning at that position.  With optional end, stop\n\
4385comparing S at that position.";
4386
4387static PyObject *
4388unicode_startswith(PyUnicodeObject *self,
4389		   PyObject *args)
4390{
4391    PyUnicodeObject *substring;
4392    int start = 0;
4393    int end = INT_MAX;
4394    PyObject *result;
4395
4396    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4397		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4398	return NULL;
4399    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4400						(PyObject *)substring);
4401    if (substring == NULL)
4402	return NULL;
4403
4404    result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4405
4406    Py_DECREF(substring);
4407    return result;
4408}
4409
4410
4411static char endswith__doc__[] =
4412"S.endswith(suffix[, start[, end]]) -> int\n\
4413\n\
4414Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4415optional start, test S beginning at that position.  With optional end, stop\n\
4416comparing S at that position.";
4417
4418static PyObject *
4419unicode_endswith(PyUnicodeObject *self,
4420		 PyObject *args)
4421{
4422    PyUnicodeObject *substring;
4423    int start = 0;
4424    int end = INT_MAX;
4425    PyObject *result;
4426
4427    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4428		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4429	return NULL;
4430    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4431						(PyObject *)substring);
4432    if (substring == NULL)
4433	return NULL;
4434
4435    result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4436
4437    Py_DECREF(substring);
4438    return result;
4439}
4440
4441
/* Method table for Unicode objects, searched by unicode_getattr().

   Each entry is {name, C function, flag, docstring}.  In this file the
   functions registered with flag 1 parse their arguments with
   PyArg_ParseTuple(), while those registered with flag 0 check them
   with PyArg_NoArgs().  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
#if 0
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4494
4495static PyObject *
4496unicode_getattr(PyUnicodeObject *self, char *name)
4497{
4498    return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4499}
4500
/* Sequence protocol slots for Unicode objects.  The two assignment
   slots are left 0; no item or slice assignment function is provided
   here. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, 		/* sq_length */
    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
    (intargfunc) unicode_repeat, 	/* sq_repeat */
    (intargfunc) unicode_getitem, 	/* sq_item */
    (intintargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
};
4511
4512static int
4513unicode_buffer_getreadbuf(PyUnicodeObject *self,
4514			  int index,
4515			  const void **ptr)
4516{
4517    if (index != 0) {
4518        PyErr_SetString(PyExc_SystemError,
4519			"accessing non-existent unicode segment");
4520        return -1;
4521    }
4522    *ptr = (void *) self->str;
4523    return PyUnicode_GET_DATA_SIZE(self);
4524}
4525
4526static int
4527unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4528			   const void **ptr)
4529{
4530    PyErr_SetString(PyExc_TypeError,
4531		    "cannot use unicode as modifyable buffer");
4532    return -1;
4533}
4534
4535static int
4536unicode_buffer_getsegcount(PyUnicodeObject *self,
4537			   int *lenp)
4538{
4539    if (lenp)
4540        *lenp = PyUnicode_GET_DATA_SIZE(self);
4541    return 1;
4542}
4543
4544static int
4545unicode_buffer_getcharbuf(PyUnicodeObject *self,
4546			  int index,
4547			  const void **ptr)
4548{
4549    PyObject *str;
4550
4551    if (index != 0) {
4552        PyErr_SetString(PyExc_SystemError,
4553			"accessing non-existent unicode segment");
4554        return -1;
4555    }
4556    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4557    if (str == NULL)
4558	return -1;
4559    *ptr = (void *) PyString_AS_STRING(str);
4560    return PyString_GET_SIZE(str);
4561}
4562
4563/* Helpers for PyUnicode_Format() */
4564
4565static PyObject *
4566getnextarg(PyObject *args, int arglen, int *p_argidx)
4567{
4568    int argidx = *p_argidx;
4569    if (argidx < arglen) {
4570	(*p_argidx)++;
4571	if (arglen < 0)
4572	    return args;
4573	else
4574	    return PyTuple_GetItem(args, argidx);
4575    }
4576    PyErr_SetString(PyExc_TypeError,
4577		    "not enough arguments for format string");
4578    return NULL;
4579}
4580
/* Bit flags collected by PyUnicode_Format() while it parses the flag
   characters of a single format specifier. */

/* '-' flag (also set for a negative '*' width): pad on the right */
#define F_LJUST (1<<0)
/* '+' flag: emit '+' in front of non-negative signed conversions */
#define F_SIGN	(1<<1)
/* ' ' flag: emit a blank in front of non-negative signed conversions */
#define F_BLANK (1<<2)
/* '#' flag: passed on as '#' in the C format built by
   formatint()/formatfloat() */
#define F_ALT	(1<<3)
/* '0' flag: numeric conversions are padded with '0' instead of ' ' */
#define F_ZERO	(1<<4)
4586
4587static
4588int usprintf(register Py_UNICODE *buffer, char *format, ...)
4589{
4590    register int i;
4591    int len;
4592    va_list va;
4593    char *charbuffer;
4594    va_start(va, format);
4595
4596    /* First, format the string as char array, then expand to Py_UNICODE
4597       array. */
4598    charbuffer = (char *)buffer;
4599    len = vsprintf(charbuffer, format, va);
4600    for (i = len - 1; i >= 0; i--)
4601	buffer[i] = (Py_UNICODE) charbuffer[i];
4602
4603    va_end(va);
4604    return len;
4605}
4606
4607static int
4608formatfloat(Py_UNICODE *buf,
4609	    size_t buflen,
4610	    int flags,
4611	    int prec,
4612	    int type,
4613	    PyObject *v)
4614{
4615    /* fmt = '%#.' + `prec` + `type`
4616       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4617    char fmt[20];
4618    double x;
4619
4620    x = PyFloat_AsDouble(v);
4621    if (x == -1.0 && PyErr_Occurred())
4622	return -1;
4623    if (prec < 0)
4624	prec = 6;
4625    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4626	type = 'g';
4627    sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4628    /* worst case length calc to ensure no buffer overrun:
4629         fmt = %#.<prec>g
4630         buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4631            for any double rep.)
4632         len = 1 + prec + 1 + 2 + 5 = 9 + prec
4633       If prec=0 the effective precision is 1 (the leading digit is
4634       always given), therefore increase by one to 10+prec. */
4635    if (buflen <= (size_t)10 + (size_t)prec) {
4636	PyErr_SetString(PyExc_OverflowError,
4637	    "formatted float is too long (precision too long?)");
4638	return -1;
4639    }
4640    return usprintf(buf, fmt, x);
4641}
4642
4643static int
4644formatint(Py_UNICODE *buf,
4645	  size_t buflen,
4646	  int flags,
4647	  int prec,
4648	  int type,
4649	  PyObject *v)
4650{
4651    /* fmt = '%#.' + `prec` + 'l' + `type`
4652       worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
4653    char fmt[20];
4654    long x;
4655
4656    x = PyInt_AsLong(v);
4657    if (x == -1 && PyErr_Occurred())
4658	return -1;
4659    if (prec < 0)
4660	prec = 1;
4661    /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4662       worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4663    if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4664        PyErr_SetString(PyExc_OverflowError,
4665            "formatted integer is too long (precision too long?)");
4666        return -1;
4667    }
4668    sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4669    return usprintf(buf, fmt, x);
4670}
4671
4672static int
4673formatchar(Py_UNICODE *buf,
4674           size_t buflen,
4675           PyObject *v)
4676{
4677    /* presume that the buffer is at least 2 characters long */
4678    if (PyUnicode_Check(v)) {
4679	if (PyUnicode_GET_SIZE(v) != 1)
4680	    goto onError;
4681	buf[0] = PyUnicode_AS_UNICODE(v)[0];
4682    }
4683
4684    else if (PyString_Check(v)) {
4685	if (PyString_GET_SIZE(v) != 1)
4686	    goto onError;
4687	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4688    }
4689
4690    else {
4691	/* Integer input truncated to a character */
4692        long x;
4693	x = PyInt_AsLong(v);
4694	if (x == -1 && PyErr_Occurred())
4695	    goto onError;
4696	buf[0] = (char) x;
4697    }
4698    buf[1] = '\0';
4699    return 1;
4700
4701 onError:
4702    PyErr_SetString(PyExc_TypeError,
4703		    "%c requires int or char");
4704    return -1;
4705}
4706
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the buffer in
   which the floats, ints, & chars are formatted. XXX This is a magic
   number. Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format. For now, the current solution is
   sufficient.

   The expansion is fully parenthesized so that the macro behaves like
   a single primary expression wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
4716
4717PyObject *PyUnicode_Format(PyObject *format,
4718			   PyObject *args)
4719{
4720    Py_UNICODE *fmt, *res;
4721    int fmtcnt, rescnt, reslen, arglen, argidx;
4722    int args_owned = 0;
4723    PyUnicodeObject *result = NULL;
4724    PyObject *dict = NULL;
4725    PyObject *uformat;
4726
4727    if (format == NULL || args == NULL) {
4728	PyErr_BadInternalCall();
4729	return NULL;
4730    }
4731    uformat = PyUnicode_FromObject(format);
4732    if (uformat == NULL)
4733	return NULL;
4734    fmt = PyUnicode_AS_UNICODE(uformat);
4735    fmtcnt = PyUnicode_GET_SIZE(uformat);
4736
4737    reslen = rescnt = fmtcnt + 100;
4738    result = _PyUnicode_New(reslen);
4739    if (result == NULL)
4740	goto onError;
4741    res = PyUnicode_AS_UNICODE(result);
4742
4743    if (PyTuple_Check(args)) {
4744	arglen = PyTuple_Size(args);
4745	argidx = 0;
4746    }
4747    else {
4748	arglen = -1;
4749	argidx = -2;
4750    }
4751    if (args->ob_type->tp_as_mapping)
4752	dict = args;
4753
4754    while (--fmtcnt >= 0) {
4755	if (*fmt != '%') {
4756	    if (--rescnt < 0) {
4757		rescnt = fmtcnt + 100;
4758		reslen += rescnt;
4759		if (_PyUnicode_Resize(result, reslen) < 0)
4760		    return NULL;
4761		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4762		--rescnt;
4763	    }
4764	    *res++ = *fmt++;
4765	}
4766	else {
4767	    /* Got a format specifier */
4768	    int flags = 0;
4769	    int width = -1;
4770	    int prec = -1;
4771	    int size = 0;
4772	    Py_UNICODE c = '\0';
4773	    Py_UNICODE fill;
4774	    PyObject *v = NULL;
4775	    PyObject *temp = NULL;
4776	    Py_UNICODE *pbuf;
4777	    Py_UNICODE sign;
4778	    int len;
4779	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
4780
4781	    fmt++;
4782	    if (*fmt == '(') {
4783		Py_UNICODE *keystart;
4784		int keylen;
4785		PyObject *key;
4786		int pcount = 1;
4787
4788		if (dict == NULL) {
4789		    PyErr_SetString(PyExc_TypeError,
4790				    "format requires a mapping");
4791		    goto onError;
4792		}
4793		++fmt;
4794		--fmtcnt;
4795		keystart = fmt;
4796		/* Skip over balanced parentheses */
4797		while (pcount > 0 && --fmtcnt >= 0) {
4798		    if (*fmt == ')')
4799			--pcount;
4800		    else if (*fmt == '(')
4801			++pcount;
4802		    fmt++;
4803		}
4804		keylen = fmt - keystart - 1;
4805		if (fmtcnt < 0 || pcount > 0) {
4806		    PyErr_SetString(PyExc_ValueError,
4807				    "incomplete format key");
4808		    goto onError;
4809		}
4810		/* keys are converted to strings using UTF-8 and
4811		   then looked up since Python uses strings to hold
4812		   variables names etc. in its namespaces and we
4813		   wouldn't want to break common idioms. */
4814		key = PyUnicode_EncodeUTF8(keystart,
4815					   keylen,
4816					   NULL);
4817		if (key == NULL)
4818		    goto onError;
4819		if (args_owned) {
4820		    Py_DECREF(args);
4821		    args_owned = 0;
4822		}
4823		args = PyObject_GetItem(dict, key);
4824		Py_DECREF(key);
4825		if (args == NULL) {
4826		    goto onError;
4827		}
4828		args_owned = 1;
4829		arglen = -1;
4830		argidx = -2;
4831	    }
4832	    while (--fmtcnt >= 0) {
4833		switch (c = *fmt++) {
4834		case '-': flags |= F_LJUST; continue;
4835		case '+': flags |= F_SIGN; continue;
4836		case ' ': flags |= F_BLANK; continue;
4837		case '#': flags |= F_ALT; continue;
4838		case '0': flags |= F_ZERO; continue;
4839		}
4840		break;
4841	    }
4842	    if (c == '*') {
4843		v = getnextarg(args, arglen, &argidx);
4844		if (v == NULL)
4845		    goto onError;
4846		if (!PyInt_Check(v)) {
4847		    PyErr_SetString(PyExc_TypeError,
4848				    "* wants int");
4849		    goto onError;
4850		}
4851		width = PyInt_AsLong(v);
4852		if (width < 0) {
4853		    flags |= F_LJUST;
4854		    width = -width;
4855		}
4856		if (--fmtcnt >= 0)
4857		    c = *fmt++;
4858	    }
4859	    else if (c >= '0' && c <= '9') {
4860		width = c - '0';
4861		while (--fmtcnt >= 0) {
4862		    c = *fmt++;
4863		    if (c < '0' || c > '9')
4864			break;
4865		    if ((width*10) / 10 != width) {
4866			PyErr_SetString(PyExc_ValueError,
4867					"width too big");
4868			goto onError;
4869		    }
4870		    width = width*10 + (c - '0');
4871		}
4872	    }
4873	    if (c == '.') {
4874		prec = 0;
4875		if (--fmtcnt >= 0)
4876		    c = *fmt++;
4877		if (c == '*') {
4878		    v = getnextarg(args, arglen, &argidx);
4879		    if (v == NULL)
4880			goto onError;
4881		    if (!PyInt_Check(v)) {
4882			PyErr_SetString(PyExc_TypeError,
4883					"* wants int");
4884			goto onError;
4885		    }
4886		    prec = PyInt_AsLong(v);
4887		    if (prec < 0)
4888			prec = 0;
4889		    if (--fmtcnt >= 0)
4890			c = *fmt++;
4891		}
4892		else if (c >= '0' && c <= '9') {
4893		    prec = c - '0';
4894		    while (--fmtcnt >= 0) {
4895			c = Py_CHARMASK(*fmt++);
4896			if (c < '0' || c > '9')
4897			    break;
4898			if ((prec*10) / 10 != prec) {
4899			    PyErr_SetString(PyExc_ValueError,
4900					    "prec too big");
4901			    goto onError;
4902			}
4903			prec = prec*10 + (c - '0');
4904		    }
4905		}
4906	    } /* prec */
4907	    if (fmtcnt >= 0) {
4908		if (c == 'h' || c == 'l' || c == 'L') {
4909		    size = c;
4910		    if (--fmtcnt >= 0)
4911			c = *fmt++;
4912		}
4913	    }
4914	    if (fmtcnt < 0) {
4915		PyErr_SetString(PyExc_ValueError,
4916				"incomplete format");
4917		goto onError;
4918	    }
4919	    if (c != '%') {
4920		v = getnextarg(args, arglen, &argidx);
4921		if (v == NULL)
4922		    goto onError;
4923	    }
4924	    sign = 0;
4925	    fill = ' ';
4926	    switch (c) {
4927
4928	    case '%':
4929		pbuf = formatbuf;
4930		/* presume that buffer length is at least 1 */
4931		pbuf[0] = '%';
4932		len = 1;
4933		break;
4934
4935	    case 's':
4936	    case 'r':
4937		if (PyUnicode_Check(v) && c == 's') {
4938		    temp = v;
4939		    Py_INCREF(temp);
4940		}
4941		else {
4942		    PyObject *unicode;
4943		    if (c == 's')
4944			temp = PyObject_Str(v);
4945		    else
4946			temp = PyObject_Repr(v);
4947		    if (temp == NULL)
4948			goto onError;
4949		    if (!PyString_Check(temp)) {
4950			/* XXX Note: this should never happen, since
4951   			       PyObject_Repr() and PyObject_Str() assure
4952			       this */
4953			Py_DECREF(temp);
4954			PyErr_SetString(PyExc_TypeError,
4955					"%s argument has non-string str()");
4956			goto onError;
4957		    }
4958		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
4959						   PyString_GET_SIZE(temp),
4960					       NULL,
4961						   "strict");
4962		    Py_DECREF(temp);
4963		    temp = unicode;
4964		    if (temp == NULL)
4965			goto onError;
4966		}
4967		pbuf = PyUnicode_AS_UNICODE(temp);
4968		len = PyUnicode_GET_SIZE(temp);
4969		if (prec >= 0 && len > prec)
4970		    len = prec;
4971		break;
4972
4973	    case 'i':
4974	    case 'd':
4975	    case 'u':
4976	    case 'o':
4977	    case 'x':
4978	    case 'X':
4979		if (c == 'i')
4980		    c = 'd';
4981		pbuf = formatbuf;
4982		len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4983			flags, prec, c, v);
4984		if (len < 0)
4985		    goto onError;
4986		sign = (c == 'd');
4987		if (flags & F_ZERO) {
4988		    fill = '0';
4989		    if ((flags&F_ALT) &&
4990			(c == 'x' || c == 'X') &&
4991			pbuf[0] == '0' && pbuf[1] == c) {
4992			*res++ = *pbuf++;
4993			*res++ = *pbuf++;
4994			rescnt -= 2;
4995			len -= 2;
4996			width -= 2;
4997			if (width < 0)
4998			    width = 0;
4999		    }
5000		}
5001		break;
5002
5003	    case 'e':
5004	    case 'E':
5005	    case 'f':
5006	    case 'g':
5007	    case 'G':
5008		pbuf = formatbuf;
5009		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5010			flags, prec, c, v);
5011		if (len < 0)
5012		    goto onError;
5013		sign = 1;
5014		if (flags&F_ZERO)
5015		    fill = '0';
5016		break;
5017
5018	    case 'c':
5019		pbuf = formatbuf;
5020		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5021		if (len < 0)
5022		    goto onError;
5023		break;
5024
5025	    default:
5026		PyErr_Format(PyExc_ValueError,
5027			     "unsupported format character '%c' (0x%x)",
5028			     c, c);
5029		goto onError;
5030	    }
5031	    if (sign) {
5032		if (*pbuf == '-' || *pbuf == '+') {
5033		    sign = *pbuf++;
5034		    len--;
5035		}
5036		else if (flags & F_SIGN)
5037		    sign = '+';
5038		else if (flags & F_BLANK)
5039		    sign = ' ';
5040		else
5041		    sign = 0;
5042	    }
5043	    if (width < len)
5044		width = len;
5045	    if (rescnt < width + (sign != 0)) {
5046		reslen -= rescnt;
5047		rescnt = width + fmtcnt + 100;
5048		reslen += rescnt;
5049		if (_PyUnicode_Resize(result, reslen) < 0)
5050		    return NULL;
5051		res = PyUnicode_AS_UNICODE(result)
5052		    + reslen - rescnt;
5053	    }
5054	    if (sign) {
5055		if (fill != ' ')
5056		    *res++ = sign;
5057		rescnt--;
5058		if (width > len)
5059		    width--;
5060	    }
5061	    if (width > len && !(flags & F_LJUST)) {
5062		do {
5063		    --rescnt;
5064		    *res++ = fill;
5065		} while (--width > len);
5066	    }
5067	    if (sign && fill == ' ')
5068		*res++ = sign;
5069	    memcpy(res, pbuf, len * sizeof(Py_UNICODE));
5070	    res += len;
5071	    rescnt -= len;
5072	    while (--width >= len) {
5073		--rescnt;
5074		*res++ = ' ';
5075	    }
5076	    if (dict && (argidx < arglen) && c != '%') {
5077		PyErr_SetString(PyExc_TypeError,
5078				"not all arguments converted");
5079		goto onError;
5080	    }
5081	    Py_XDECREF(temp);
5082	} /* '%' */
5083    } /* until end */
5084    if (argidx < arglen && !dict) {
5085	PyErr_SetString(PyExc_TypeError,
5086			"not all arguments converted");
5087	goto onError;
5088    }
5089
5090    if (args_owned) {
5091	Py_DECREF(args);
5092    }
5093    Py_DECREF(uformat);
5094    if (_PyUnicode_Resize(result, reslen - rescnt))
5095	goto onError;
5096    return (PyObject *)result;
5097
5098 onError:
5099    Py_XDECREF(result);
5100    Py_DECREF(uformat);
5101    if (args_owned) {
5102	Py_DECREF(args);
5103    }
5104    return NULL;
5105}
5106
/* Buffer interface table for unicode objects; installed as the
   tp_as_buffer entry of PyUnicode_Type.  The entries are, in order, the
   read-buffer, write-buffer, segment-count and character-buffer
   handlers.  Each handler is cast to the slot's function pointer type
   because it takes a PyUnicodeObject * rather than a PyObject *. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
5113
/* Type object for unicode strings.

   Sequence and buffer behaviour come from the unicode_as_sequence and
   unicode_as_buffer tables.  The number and mapping tables are left
   empty; in particular PyUnicode_Format() therefore never treats a
   unicode argument as a mapping.  Entries after tp_flags are not
   listed and are thus zero-initialized. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0, 					/* ob_size */
    "unicode", 				/* tp_name */
    sizeof(PyUnicodeObject), 		/* tp_basicsize */
    0, 					/* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free, 	/* tp_dealloc */
    0, 					/* tp_print */
    (getattrfunc)unicode_getattr, 	/* tp_getattr */
    0, 					/* tp_setattr */
    (cmpfunc) unicode_compare, 		/* tp_compare */
    (reprfunc) unicode_repr, 		/* tp_repr */
    0, 					/* tp_as_number */
    &unicode_as_sequence, 		/* tp_as_sequence */
    0, 					/* tp_as_mapping */
    (hashfunc) unicode_hash, 		/* tp_hash */
    0, 					/* tp_call */
    (reprfunc) unicode_str,	 	/* tp_str */
    (getattrofunc) NULL, 		/* tp_getattro */
    (setattrofunc) NULL, 		/* tp_setattro */
    &unicode_as_buffer,			/* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,			/* tp_flags */
};
5138
5139/* Initialize the Unicode implementation */
5140
5141void _PyUnicode_Init(void)
5142{
5143    /* Doublecheck the configuration... */
5144    if (sizeof(Py_UNICODE) != 2)
5145        Py_FatalError("Unicode configuration error: "
5146		      "sizeof(Py_UNICODE) != 2 bytes");
5147
5148    /* Init the implementation */
5149    unicode_freelist = NULL;
5150    unicode_freelist_size = 0;
5151    unicode_empty = _PyUnicode_New(0);
5152    strcpy(unicode_default_encoding, "ascii");
5153}
5154
5155/* Finalize the Unicode implementation */
5156
5157void
5158_PyUnicode_Fini(void)
5159{
5160    PyUnicodeObject *u = unicode_freelist;
5161
5162    while (u != NULL) {
5163	PyUnicodeObject *v = u;
5164	u = *(PyUnicodeObject **)u;
5165	if (v->str)
5166	    PyMem_DEL(v->str);
5167	Py_XDECREF(v->defenc);
5168	PyObject_DEL(v);
5169    }
5170    unicode_freelist = NULL;
5171    unicode_freelist_size = 0;
5172    Py_XDECREF(unicode_empty);
5173    unicode_empty = NULL;
5174}
5175