unicodeobject.h revision 004d64f362eb0bd0d3e2f257b2b7721fecba87af
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
10(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
11
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python.  This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
61/* Set these flags if the platform has "wchar.h", "wctype.h" and the
62   wchar_t type is a 16-bit unsigned type */
63/* #define HAVE_WCHAR_H */
64/* #define HAVE_USABLE_WCHAR_T */
65
66/* Defaults for various platforms */
67#ifndef HAVE_USABLE_WCHAR_T
68
69/* Windows has a usable wchar_t type */
70# if defined(MS_WIN32)
71#  define HAVE_USABLE_WCHAR_T
72# endif
73
74#endif
75
76/* If the compiler provides a wchar_t type we try to support it
77   through the interface functions PyUnicode_FromWideChar() and
78   PyUnicode_AsWideChar(). */
79
80#ifdef HAVE_USABLE_WCHAR_T
81# define HAVE_WCHAR_H
82#endif
83
84#ifdef HAVE_WCHAR_H
85/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
86# ifdef _HAVE_BSDI
87#  include <time.h>
88# endif
89# include "wchar.h"
90#endif
91
/* Select the element type used for all Unicode buffers in this header. */
92#ifdef HAVE_USABLE_WCHAR_T
93
94/* If the compiler defines wchar_t as a 16-bit unsigned type we can
95   use the compiler type directly.  Works fine with all modern Windows
96   platforms. */
97
98typedef wchar_t Py_UNICODE;
99
100#else /* !HAVE_USABLE_WCHAR_T */
101
102/* Use if you have a standard ANSI compiler, without wchar_t support.
103   If a short is not 16 bits on your platform, you have to fix the
104   typedef below, or the module initialization code will complain. */
105
106typedef unsigned short Py_UNICODE;
107
108#endif /* HAVE_USABLE_WCHAR_T */
109
110/* --- Internal Unicode Operations ---------------------------------------- */
111
112/* If you want Python to use the compiler's wctype.h functions instead
113   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
114   configure Python using --with-ctype-functions.  This reduces the
115   interpreter's code size.

   The wctype.h variants are only selected when HAVE_USABLE_WCHAR_T is
   defined as well.  Even then only the whitespace test, the lower/upper
   case tests and the lower/upper case conversions map to the
   iswXXX()/towXXX() functions; the titlecase, linebreak and all
   decimal/digit/numeric macros expand to the _PyUnicode_XXX() helpers
   in both branches. */
116
117#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
118
119#include "wctype.h"
120
121#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
122
123#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
124#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
125#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
126#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
127
128#define Py_UNICODE_TOLOWER(ch) towlower(ch)
129#define Py_UNICODE_TOUPPER(ch) towupper(ch)
130#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
131
132#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
133#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
134#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
135
136#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
137#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
138#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
139
140#else /* use the character type functions supplied with Python */
141
142#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
143
144#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
145#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
146#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
147#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
148
149#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
150#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
151#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
152
153#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
154#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
155#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
156
157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
160
161#endif /* HAVE_USABLE_WCHAR_T && WANT_WCTYPE_FUNCTIONS */
162
/* Copy length Py_UNICODE elements from source to target.  The macro
   expands to a single memcpy() call whose byte count is
   length * sizeof(Py_UNICODE), so the two buffers must not overlap.
   The value of the expression is memcpy()'s return value.
   NOTE(review): this header does not include <string.h> itself;
   presumably the including file declares memcpy() -- confirm. */
163#define Py_UNICODE_COPY(target, source, length)\
164    (memcpy((target), (source), (length)*sizeof(Py_UNICODE)))
165
/* Store value into the first length elements of the Py_UNICODE buffer
   target.

   target, value and length are each evaluated exactly once, before the
   loop starts, so argument expressions with side effects behave as
   they would in a function call.  The macro's locals carry a fill_
   prefix and a trailing underscore so that an argument expression
   which names a caller variable (for example a length held in "i") is
   not captured by the macro's own loop counter.  A length of zero or
   less stores nothing.  The do { } while (0) wrapper makes the
   expansion a single statement. */
#define Py_UNICODE_FILL(target, value, length) do\
    {Py_UNICODE *fill_t_ = (target); const Py_UNICODE fill_v_ = (value);\
     const int fill_n_ = (length); int fill_i_;\
     for (fill_i_ = 0; fill_i_ < fill_n_; fill_i_++)\
         fill_t_[fill_i_] = fill_v_;}\
    while (0)
169
/* Evaluate to non-zero when the contents of substring equal the
   elements of string starting at index offset.

   Both string and substring are accessed through ->str and ->length,
   so they must be pointers to PyUnicodeObject (or a compatible
   layout).  The comparison is a raw memcmp() over
   substring->length * sizeof(Py_UNICODE) bytes; the macro itself does
   not check that offset + substring->length stays inside string's
   buffer.  substring is evaluated twice.
   NOTE(review): callers presumably guarantee the bound -- confirm. */
170#define Py_UNICODE_MATCH(string, offset, substring)\
171    (!memcmp((string)->str + (offset), (substring)->str,\
172             (substring)->length*sizeof(Py_UNICODE)))
173
174#ifdef __cplusplus
175extern "C" {
176#endif
177
178/* --- Unicode Type ------------------------------------------------------- */
179
/* In-memory layout of a Unicode object.  PyObject_HEAD is defined
   elsewhere (presumably the common object header -- confirm); the
   remaining fields describe the character buffer and values derived
   from it. */
180typedef struct {
181    PyObject_HEAD
182    int length;			/* Number of Py_UNICODE elements in str */
183    Py_UNICODE *str;		/* Raw Unicode buffer */
184    long hash;			/* Hash value; -1 if not set */
185    PyObject *utf8str;		/* UTF-8 encoded version as Python string,
186				   or NULL */
187} PyUnicodeObject;
188
189extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
190
/* Non-zero if op's ob_type pointer is exactly &PyUnicode_Type.  op is
   dereferenced without a NULL check and is evaluated once. */
191#define PyUnicode_Check(op) (((op)->ob_type == &PyUnicode_Type))
192
193/* Fast access macros.  Each one casts op to PyUnicodeObject * and reads
   a field directly; no type or NULL check is performed on op. */
/* The length field: number of Py_UNICODE elements. */
194#define PyUnicode_GET_SIZE(op) \
195        (((PyUnicodeObject *)(op))->length)
/* Size of the buffer contents in bytes: length * sizeof(Py_UNICODE). */
196#define PyUnicode_GET_DATA_SIZE(op) \
197        (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
/* The str field: pointer to the object's Py_UNICODE buffer. */
198#define PyUnicode_AS_UNICODE(op) \
199        (((PyUnicodeObject *)(op))->str)
/* The same buffer pointer, cast to const char *. */
200#define PyUnicode_AS_DATA(op) \
201        ((const char *)((PyUnicodeObject *)(op))->str)
202
203/* --- Constants ---------------------------------------------------------- */
204
205/* This Unicode character will be used as replacement character during
206   decoding if the errors argument is set to "replace". Note: the
207   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
208   Unicode 3.0.  The macro expands to the value 0xFFFD cast to
   Py_UNICODE. */
209
210#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
211
212/* === Public API ========================================================= */
213
214/* --- Plain Py_UNICODE --------------------------------------------------- */
215
216/* Create a Unicode Object from the Py_UNICODE buffer u of the given
217   size. u may be NULL which causes the contents to be undefined. It
218   is the user's responsibility to fill in the needed data.
219
220   The buffer is copied into the new object. */
221
222extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
223    const Py_UNICODE *u,        /* Unicode buffer */
224    int size                    /* size of buffer */
225    );
226
227/* Return a read-only pointer to the Unicode object's internal
228   Py_UNICODE buffer. */
229
230extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
231    PyObject *unicode	 	/* Unicode object */
232    );
233
234/* Get the length of the Unicode object. */
235
236extern DL_IMPORT(int) PyUnicode_GetSize(
237    PyObject *unicode	 	/* Unicode object */
238    );
239
240/* Resize an already allocated Unicode object to the new size length.
241
242   *unicode is modified to point to the new (resized) object and 0
243   returned on success.
244
245   This API may only be called by the function which also called the
246   Unicode constructor. The refcount on the object must be 1. Otherwise,
247   an error is returned.
248
249   Error handling is implemented as follows: an exception is set, -1
250   is returned and *unicode left untouched.
251
252*/
253
254extern DL_IMPORT(int) PyUnicode_Resize(
255    PyObject **unicode,		/* Pointer to the Unicode object */
256    int length			/* New length */
257    );
258
259/* Coerce obj to a Unicode object and return a reference with
260   *incremented* refcount.
261
262   Coercion is done in the following way:
263
264   1. Unicode objects are passed back as-is with incremented
265      refcount.
266
267   2. String and other char buffer compatible objects are decoded
268      under the assumptions that they contain UTF-8 data. Decoding
269      is done in "strict" mode.
270
271   3. All other objects raise an exception.
272
273   The API returns NULL in case of an error. The caller is responsible
274   for decref'ing the returned objects.
275
276*/
277
278extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
279    register PyObject *obj 	/* Object */
280    );
281
282/* --- wchar_t support for platforms which support it --------------------- */
283
284#ifdef HAVE_WCHAR_H
285
286/* Create a Unicode Object from the wchar_t buffer w of the given
287   size.
288
289   The buffer is copied into the new object. */
290
291extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
292    register const wchar_t *w,  /* wchar_t buffer */
293    int size                    /* size of buffer */
294    );
295
296/* Copies the Unicode Object contents into the wchar_t buffer w.  At
297   most size wchar_t characters are copied.
298
299   Returns the number of wchar_t characters copied or -1 in case of an
300   error. */
301
302extern DL_IMPORT(int) PyUnicode_AsWideChar(
303    PyUnicodeObject *unicode,   /* Unicode object */
304    register wchar_t *w,        /* wchar_t buffer */
305    int size                    /* size of buffer */
306    );
307
308#endif
309
310/* === Builtin Codecs =====================================================
311
312   Many of these APIs take two arguments encoding and errors. These
313   parameters encoding and errors have the same semantics as the ones
314   of the builtin unicode() API.
315
316   Setting encoding to NULL causes the default encoding to be used
317   which is UTF-8.
318
319   Error handling is set by errors which may also be set to NULL
320   meaning to use the default handling defined for the codec. Default
321   error handling for all builtin codecs is "strict" (ValueErrors are
322   raised).
323
324   The codecs all use a similar interface. Only deviations from the
325   generic ones are documented.
326
327*/
328
329/* --- Generic Codecs ----------------------------------------------------- */
330
331/* Create a Unicode object by decoding the encoded string s of the
332   given size. */
333
334extern DL_IMPORT(PyObject*) PyUnicode_Decode(
335    const char *s,              /* encoded string */
336    int size,                   /* size of buffer */
337    const char *encoding,       /* encoding */
338    const char *errors          /* error handling */
339    );
340
341/* Encodes a Py_UNICODE buffer of the given size and returns a
342   Python string object. */
343
344extern DL_IMPORT(PyObject*) PyUnicode_Encode(
345    const Py_UNICODE *s,        /* Unicode char buffer */
346    int size,                   /* number of Py_UNICODE chars to encode */
347    const char *encoding,       /* encoding */
348    const char *errors          /* error handling */
349    );
350
351/* Encodes a Unicode object and returns the result as Python string
352   object. */
353
354extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
355    PyObject *unicode,	 	/* Unicode object */
356    const char *encoding,	/* encoding */
357    const char *errors		/* error handling */
358    );
359
360/* --- UTF-8 Codecs ------------------------------------------------------- */
361
362extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
363    const char *string, 	/* UTF-8 encoded string */
364    int length,	 		/* size of string */
365    const char *errors		/* error handling */
366    );
367
368extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
369    PyObject *unicode	 	/* Unicode object */
370    );
371
372extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
373    const Py_UNICODE *data, 	/* Unicode char buffer */
374    int length,	 		/* number of Py_UNICODE chars to encode */
375    const char *errors		/* error handling */
376    );
377
378/* --- UTF-16 Codecs ------------------------------------------------------ */
379
380/* Decodes length bytes from a UTF-16 encoded buffer string and returns
381   the corresponding Unicode object.
382
383   errors (if non-NULL) defines the error handling. It defaults
384   to "strict".
385
386   If byteorder is non-NULL, the decoder starts decoding using the
387   given byte order:
388
389	*byteorder == -1: little endian
390	*byteorder == 0:  native order
391	*byteorder == 1:  big endian
392
393   and then switches according to all BOM marks it finds in the input
394   data. BOM marks are not copied into the resulting Unicode string.
395   After completion, *byteorder is set to the current byte order at
396   the end of input data.
397
398   If byteorder is NULL, the codec starts in native order mode.
399
400*/
401
402extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
403    const char *string, 	/* UTF-16 encoded string */
404    int length,	 		/* size of string */
405    const char *errors,		/* error handling */
406    int *byteorder		/* pointer to byteorder to use
407				   0=native;-1=LE,1=BE; updated on
408				   exit */
409    );
410
411/* Returns a Python string using the UTF-16 encoding in native byte
412   order. The string always starts with a BOM mark.  */
413
414extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
415    PyObject *unicode	 	/* Unicode object */
416    );
417
418/* Returns a Python string object holding the UTF-16 encoded value of
419   the Unicode data.
420
421   If byteorder is not 0, output is written according to the following
422   byte order:
423
424   byteorder == -1: little endian
425   byteorder == 0:  native byte order (writes a BOM mark)
426   byteorder == 1:  big endian
427
428   If byteorder is 0, the output string will always start with the
429   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
430   prepended.
431
432   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
433   UCS-2. This trick makes it possible to add full UTF-16 capabilities
434   at a later point without compromising the APIs.
435
436*/
437
438extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
439    const Py_UNICODE *data, 	/* Unicode char buffer */
440    int length,	 		/* number of Py_UNICODE chars to encode */
441    const char *errors,		/* error handling */
442    int byteorder		/* byteorder to use 0=BOM+native;-1=LE,1=BE */
443    );
444
445/* --- Unicode-Escape Codecs ---------------------------------------------- */
446
447extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
448    const char *string, 	/* Unicode-Escape encoded string */
449    int length,	 		/* size of string */
450    const char *errors		/* error handling */
451    );
452
453extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
454    PyObject *unicode	 	/* Unicode object */
455    );
456
457extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
458    const Py_UNICODE *data, 	/* Unicode char buffer */
459    int length	 		/* Number of Py_UNICODE chars to encode */
460    );
461
462/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
463
464extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
465    const char *string, 	/* Raw-Unicode-Escape encoded string */
466    int length,	 		/* size of string */
467    const char *errors		/* error handling */
468    );
469
470extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
471    PyObject *unicode	 	/* Unicode object */
472    );
473
474extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
475    const Py_UNICODE *data, 	/* Unicode char buffer */
476    int length	 		/* Number of Py_UNICODE chars to encode */
477    );
478
479/* --- Latin-1 Codecs -----------------------------------------------------
480
481   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
482
483*/
484
485extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
486    const char *string, 	/* Latin-1 encoded string */
487    int length,	 		/* size of string */
488    const char *errors		/* error handling */
489    );
490
491extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
492    PyObject *unicode	 	/* Unicode object */
493    );
494
495extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
496    const Py_UNICODE *data, 	/* Unicode char buffer */
497    int length,	 		/* Number of Py_UNICODE chars to encode */
498    const char *errors		/* error handling */
499    );
500
501/* --- ASCII Codecs -------------------------------------------------------
502
503   Only 7-bit ASCII data is accepted. All other codes generate errors.
504
505*/
506
507extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
508    const char *string, 	/* ASCII encoded string */
509    int length,	 		/* size of string */
510    const char *errors		/* error handling */
511    );
512
513extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
514    PyObject *unicode	 	/* Unicode object */
515    );
516
517extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
518    const Py_UNICODE *data, 	/* Unicode char buffer */
519    int length,	 		/* Number of Py_UNICODE chars to encode */
520    const char *errors		/* error handling */
521    );
522
523/* --- Character Map Codecs -----------------------------------------------
524
525   This codec uses mappings to encode and decode characters.
526
527   Decoding mappings must map single string characters to single
528   Unicode characters, integers (which are then interpreted as Unicode
529   ordinals) or None (meaning "undefined mapping" and causing an
530   error).
531
532   Encoding mappings must map single Unicode characters to single
533   string characters, integers (which are then interpreted as Latin-1
534   ordinals) or None (meaning "undefined mapping" and causing an
535   error).
536
537   If a character lookup fails with a LookupError, the character is
538   copied as-is meaning that its ordinal value will be interpreted as
539   Unicode or Latin-1 ordinal resp. Because of this mappings only need
540   to contain those mappings which map characters to different code
541   points.
542
543*/
544
545extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
546    const char *string, 	/* Encoded string */
547    int length,	 		/* size of string */
548    PyObject *mapping,		/* character mapping
549				   (char ordinal -> unicode ordinal) */
550    const char *errors		/* error handling */
551    );
552
553extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
554    PyObject *unicode,	 	/* Unicode object */
555    PyObject *mapping		/* character mapping
556				   (unicode ordinal -> char ordinal) */
557    );
558
559extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
560    const Py_UNICODE *data, 	/* Unicode char buffer */
561    int length,	 		/* Number of Py_UNICODE chars to encode */
562    PyObject *mapping,		/* character mapping
563				   (unicode ordinal -> char ordinal) */
564    const char *errors		/* error handling */
565    );
566
567/* Translate a Py_UNICODE buffer of the given length by applying a
568   character mapping table to it and return the resulting Unicode
569   object.
570
571   The mapping table must map Unicode ordinal integers to Unicode
572   ordinal integers or None (causing deletion of the character).
573
574   Mapping tables may be dictionaries or sequences. Unmapped character
575   ordinals (ones which cause a LookupError) are left untouched and
576   are copied as-is.
577
578*/
579
580extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
581    const Py_UNICODE *data, 	/* Unicode char buffer */
582    int length,	 		/* Number of Py_UNICODE chars to encode */
583    PyObject *table,		/* Translate table */
584    const char *errors		/* error handling */
585    );
586
587#ifdef MS_WIN32
588
589/* --- MBCS codecs for Windows -------------------------------------------- */
590
591extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
592    const char *string,         /* MBCS encoded string */
593    int length,                 /* size of string */
594    const char *errors          /* error handling */
595    );
596
597extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
598    PyObject *unicode           /* Unicode object */
599    );
600
601extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
602    const Py_UNICODE *data,     /* Unicode char buffer */
603    int length,                 /* Number of Py_UNICODE chars to encode */
604    const char *errors          /* error handling */
605    );
606
607#endif /* MS_WIN32 */
608
609/* --- Decimal Encoder ---------------------------------------------------- */
610
611/* Takes a Unicode string holding a decimal value and writes it into
612   an output buffer using standard ASCII digit codes.
613
614   The output buffer has to provide at least length+1 bytes of storage
615   area. The output string is 0-terminated.
616
617   The encoder converts whitespace to ' ', decimal characters to their
618   corresponding ASCII digit and all other Latin-1 characters except
619   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
620   are treated as errors. This includes embedded NUL characters.
621
622   Error handling is defined by the errors argument:
623
624      NULL or "strict": raise a ValueError
625      "ignore": ignore the wrong characters (these are not copied to the
626		output buffer)
627      "replace": replaces illegal characters with '?'
628
629   Returns 0 on success, -1 on failure.
630
631*/
632
633extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
634    Py_UNICODE *s,		/* Unicode buffer */
635    int length,			/* Number of Py_UNICODE chars to encode */
636    char *output,		/* Output buffer; must have size >= length+1
				   (room for the terminating 0 byte) */
637    const char *errors		/* error handling */
638    );
639
640/* --- Methods & Slots ----------------------------------------------------
641
642   These are capable of handling Unicode objects and strings on input
643   (we refer to them as strings in the descriptions) and return
644   Unicode objects or integers as appropriate. */
645
646/* Concat two strings giving a new Unicode string. */
647
648extern DL_IMPORT(PyObject*) PyUnicode_Concat(
649    PyObject *left,	 	/* Left string */
650    PyObject *right	 	/* Right string */
651    );
652
653/* Split a string giving a list of Unicode strings.
654
655   If sep is NULL, splitting will be done at all whitespace
656   substrings. Otherwise, splits occur at the given separator.
657
658   At most maxsplit splits will be done. If negative, no limit is set.
659
660   Separators are not included in the resulting list.
661
662*/
663
664extern DL_IMPORT(PyObject*) PyUnicode_Split(
665    PyObject *s,		/* String to split */
666    PyObject *sep,		/* String separator */
667    int maxsplit		/* Maxsplit count */
668    );
669
670/* Ditto, but split at line breaks.
671
672   CRLF is considered to be one line break. Line breaks are not
673   included in the resulting list unless keepends is true. */
674
675extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
676    PyObject *s,		/* String to split */
677    int keepends		/* If true, line end markers are included */
678    );
679
680/* Translate a string by applying a character mapping table to it and
681   return the resulting Unicode object.
682
683   The mapping table must map Unicode ordinal integers to Unicode
684   ordinal integers or None (causing deletion of the character).
685
686   Mapping tables may be dictionaries or sequences. Unmapped character
687   ordinals (ones which cause a LookupError) are left untouched and
688   are copied as-is.
689
690*/
691
692extern DL_IMPORT(PyObject *) PyUnicode_Translate(
693    PyObject *str,		/* String */
694    PyObject *table,		/* Translate table */
695    const char *errors		/* error handling */
696    );
697
698/* Join a sequence of strings using the given separator and return
699   the resulting Unicode string. */
700
701extern DL_IMPORT(PyObject*) PyUnicode_Join(
702    PyObject *separator, 	/* Separator string */
703    PyObject *seq	 	/* Sequence object */
704    );
705
706/* Return 1 if substr matches str[start:end] at the given tail end, 0
707   otherwise. */
708
709extern DL_IMPORT(int) PyUnicode_Tailmatch(
710    PyObject *str,		/* String */
711    PyObject *substr,		/* Prefix or Suffix string */
712    int start,			/* Start index */
713    int end,			/* Stop index */
714    int direction		/* Tail end: -1 prefix, +1 suffix */
715    );
716
717/* Return the first position of substr in str[start:end] using the
718   given search direction or -1 if not found. */
719
720extern DL_IMPORT(int) PyUnicode_Find(
721    PyObject *str,		/* String */
722    PyObject *substr,		/* Substring to find */
723    int start,			/* Start index */
724    int end,			/* Stop index */
725    int direction		/* Find direction: +1 forward, -1 backward */
726    );
727
728/* Count the number of occurrences of substr in str[start:end]. */
729
730extern DL_IMPORT(int) PyUnicode_Count(
731    PyObject *str,		/* String */
732    PyObject *substr,		/* Substring to count */
733    int start,			/* Start index */
734    int end			/* Stop index */
735    );
736
737/* Replace at most maxcount occurrences of substr in str with replstr
738   and return the resulting Unicode object. */
739
740extern DL_IMPORT(PyObject *) PyUnicode_Replace(
741    PyObject *str,		/* String */
742    PyObject *substr,		/* Substring to find */
743    PyObject *replstr,		/* Substring to replace */
744    int maxcount		/* Max. number of replacements to apply;
745				   -1 = all */
746    );
747
748/* Compare two strings and return -1, 0, 1 for less than, equal,
749   greater than resp. */
750
751extern DL_IMPORT(int) PyUnicode_Compare(
752    PyObject *left,		/* Left string */
753    PyObject *right		/* Right string */
754    );
755
756/* Apply an argument tuple or dictionary to a format string and return
757   the resulting Unicode string. */
758
759extern DL_IMPORT(PyObject *) PyUnicode_Format(
760    PyObject *format,		/* Format string */
761    PyObject *args		/* Argument tuple or dictionary */
762    );
763
764/* Checks whether element is contained in container and returns 1/0
765   accordingly.
766
767   element has to coerce to a one-element Unicode string. -1 is
768   returned in case of an error. */
769
770extern DL_IMPORT(int) PyUnicode_Contains(
771    PyObject *container,	/* Container string */
772    PyObject *element		/* Element string */
773    );
774
775/* === Characters Type APIs =============================================== */
776
777/* These should not be used directly. Use the Py_UNICODE_IS* and
778   Py_UNICODE_TO* macros instead.
779
780   These APIs are implemented in Objects/unicodectype.c.
781
782*/
783
784extern DL_IMPORT(int) _PyUnicode_IsLowercase(
785    register const Py_UNICODE ch 	/* Unicode character */
786    );
787
788extern DL_IMPORT(int) _PyUnicode_IsUppercase(
789    register const Py_UNICODE ch 	/* Unicode character */
790    );
791
792extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
793    register const Py_UNICODE ch 	/* Unicode character */
794    );
795
796extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
797    register const Py_UNICODE ch 	/* Unicode character */
798    );
799
800extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
801    register const Py_UNICODE ch 	/* Unicode character */
802    );
803
804extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
805    register const Py_UNICODE ch 	/* Unicode character */
806    );
807
808extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
809    register const Py_UNICODE ch 	/* Unicode character */
810    );
811
812extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
813    register const Py_UNICODE ch 	/* Unicode character */
814    );
815
816extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
817    register const Py_UNICODE ch 	/* Unicode character */
818    );
819
820extern DL_IMPORT(int) _PyUnicode_ToDigit(
821    register const Py_UNICODE ch 	/* Unicode character */
822    );
823
824extern DL_IMPORT(double) _PyUnicode_ToNumeric(
825    register const Py_UNICODE ch 	/* Unicode character */
826    );
827
828extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
829    register const Py_UNICODE ch 	/* Unicode character */
830    );
831
832extern DL_IMPORT(int) _PyUnicode_IsDigit(
833    register const Py_UNICODE ch 	/* Unicode character */
834    );
835
836extern DL_IMPORT(int) _PyUnicode_IsNumeric(
837    register const Py_UNICODE ch 	/* Unicode character */
838    );
839
840#ifdef __cplusplus
841}
842#endif
843#endif /* !Py_UNICODEOBJECT_H */
844