unicodeobject.h revision 90db9c47dca4d105835386fc57d46472b0836820
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>
5
6/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
12
13Copyright (c) Corporation for National Research Initiatives.
14
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python.  This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
32 *
33 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
35 *
36 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
39 *
40 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
48 *
49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
58#include <ctype.h>
59
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
64/* Python 3.x requires unicode */
65#define Py_USING_UNICODE
66
67#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
69#endif
70
71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74   Otherwise, Unicode strings are stored as UCS-2 (with limited support
75   for UTF-16) */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
79#endif
80
81/* Set these flags if the platform has "wchar.h" and the
82   wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE used to be the native Unicode storage format (code unit) of
   Python, representing a single Unicode element in the Unicode type.
   Since PEP 393 it is deprecated and is merely a typedef for wchar_t.
   Neither name is exposed to the limited API. */

#if !defined(Py_LIMITED_API)
typedef wchar_t Py_UNICODE;
#define PY_UNICODE_TYPE wchar_t
#endif
95
/* When the compiler provides a wchar_t type, it is supported through the
   interface functions PyUnicode_FromWideChar(), PyUnicode_AsWideChar()
   and PyUnicode_AsWideCharString(). */

/* HAVE_USABLE_WCHAR_T forces HAVE_WCHAR_H on if it is not set already. */
#if defined(HAVE_USABLE_WCHAR_T) && !defined(HAVE_WCHAR_H)
#  define HAVE_WCHAR_H
#endif

/* MS_WINDOWS builds get HAVE_MBCS. */
#if defined(MS_WINDOWS)
#  define HAVE_MBCS
#endif

#if defined(HAVE_WCHAR_H)
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
#  if defined(_HAVE_BSDI)
#    include <time.h>
#  endif
#  include <wchar.h>
#endif
117
118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
119   unicode representations. */
120#if SIZEOF_INT == 4
121typedef unsigned int Py_UCS4;
122#elif SIZEOF_LONG == 4
123typedef unsigned long Py_UCS4;
124#else
125#error "Could not find a proper typedef for Py_UCS4"
126#endif
127
128#if SIZEOF_SHORT == 2
129typedef unsigned short Py_UCS2;
130#else
131#error "Could not find a proper typedef for Py_UCS2"
132#endif
133
134typedef unsigned char Py_UCS1;
135
136/* --- Internal Unicode Operations ---------------------------------------- */
137
138/* Since splitting on whitespace is an important use case, and
139   whitespace in most situations is solely ASCII whitespace, we
140   optimize for the common case by using a quick look-up table
141   _Py_ascii_whitespace (see below) with an inlined check.
142
143 */
144#ifndef Py_LIMITED_API
/* Code points below 128 are answered from the _Py_ascii_whitespace table;
   everything else is forwarded to _PyUnicode_IsWhitespace(). */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

/* Character class predicates: thin aliases for the _PyUnicode_Is*()
   helpers. */
#define Py_UNICODE_ISLOWER(ch)     _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)     _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)     _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
#define Py_UNICODE_ISDECIMAL(ch)   _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)     _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)   _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
#define Py_UNICODE_ISALPHA(ch)     _PyUnicode_IsAlpha(ch)

/* Case conversions: aliases for the _PyUnicode_To*case() helpers. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric value conversions: aliases for the _PyUnicode_To*() helpers. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)   _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

/* Alphanumeric means alphabetic, decimal, digit or numeric.  The tests
   short-circuit in that order, so `ch` may be evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch)       \
    (Py_UNICODE_ISALPHA(ch)   ||     \
     Py_UNICODE_ISDECIMAL(ch) ||     \
     Py_UNICODE_ISDIGIT(ch)   ||     \
     Py_UNICODE_ISNUMERIC(ch))
173
/* Copy `length` Py_UNICODE code units from `source` to `target` by handing
   the byte count to Py_MEMCPY. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))

/* Store `value` into the first `length` code units of `target`.
   Every argument is evaluated exactly once, in the order target, value,
   length, so an argument with side effects is safe.  The bound is captured
   in a local instead of being re-evaluated by the loop condition. */
#define Py_UNICODE_FILL(target, value, length) \
    do { \
        Py_UNICODE *fill_dst_ = (target); \
        const Py_UNICODE fill_val_ = (value); \
        const Py_ssize_t fill_len_ = (length); \
        Py_ssize_t fill_idx_; \
        for (fill_idx_ = 0; fill_idx_ < fill_len_; fill_idx_++) \
            fill_dst_[fill_idx_] = fill_val_; \
    } while (0)
181
/* Macros to work with UTF-16 surrogates.

   The range tests parenthesize `ch`, so an argument that is itself an
   expression (for example `c & 0xFFFF`) is compared as a whole.  `ch` is
   still evaluated twice; do not pass expressions with side effects. */

/* True if ch lies in the surrogate range U+D800..U+DFFF. */
#define Py_UNICODE_IS_SURROGATE(ch) \
    (0xD800 <= (ch) && (ch) <= 0xDFFF)
/* True if ch is a high (leading) surrogate, U+D800..U+DBFF. */
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) \
    (0xD800 <= (ch) && (ch) <= 0xDBFF)
/* True if ch is a low (trailing) surrogate, U+DC00..U+DFFF. */
#define Py_UNICODE_IS_LOW_SURROGATE(ch) \
    (0xDC00 <= (ch) && (ch) <= 0xDFFF)

/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

/* high surrogate = top 10 bits added to D800 */
#define Py_UNICODE_HIGH_SURROGATE(ch) \
    (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) \
    (0xDC00 + ((ch) & 0x3FF))
194
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The comparison runs on the wchar_t (wstr) representation.  The wstr
   pointer lives in PyASCIIObject while wstr_length lives in
   PyCompactUnicodeObject, so the fields are reached through explicit casts
   and PyUnicode_WSTR_LENGTH() instead of being dereferenced directly on the
   argument.  This macro never creates the wstr buffer: both objects must
   already have it.  Arguments are evaluated several times. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*(((PyASCIIObject *)(string))->wstr + (offset)) == \
      *(((PyASCIIObject *)(substring))->wstr)) && \
     (*(((PyASCIIObject *)(string))->wstr + (offset) + \
        PyUnicode_WSTR_LENGTH((substring)) - 1) == \
      *(((PyASCIIObject *)(substring))->wstr + \
        PyUnicode_WSTR_LENGTH((substring)) - 1)) && \
     !memcmp(((PyASCIIObject *)(string))->wstr + (offset), \
             ((PyASCIIObject *)(substring))->wstr, \
             PyUnicode_WSTR_LENGTH((substring)) * sizeof(Py_UNICODE)))
202
203#endif /* Py_LIMITED_API */
204
205#ifdef __cplusplus
206extern "C" {
207#endif
208
209/* --- Unicode Type ------------------------------------------------------- */
210
211#ifndef Py_LIMITED_API
212
213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
214   structure. state.ascii and state.compact are set, and the data
215   immediately follow the structure. utf8_length and wstr_length can be found
216   in the length field; the utf8 pointer is equal to the data pointer. */
217typedef struct {
218    /* There are 4 forms of Unicode strings:
219
220       - compact ascii:
221
222         * structure = PyASCIIObject
223         * test: PyUnicode_IS_COMPACT_ASCII(op)
224         * kind = PyUnicode_1BYTE_KIND
225         * compact = 1
226         * ascii = 1
227         * ready = 1
228         * (length is the length of the utf8 and wstr strings)
229         * (data starts just after the structure)
230         * (since ASCII is decoded from UTF-8, the utf8 string are the data)
231
232       - compact:
233
234         * structure = PyCompactUnicodeObject
235         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
236         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
237           PyUnicode_4BYTE_KIND
238         * compact = 1
239         * ready = 1
240         * ascii = 0
241         * utf8 is not shared with data
242         * utf8_length = 0 if utf8 is NULL
243         * wstr is shared with data and wstr_length=length
244           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
245           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
246         * wstr_length = 0 if wstr is NULL
247         * (data starts just after the structure)
248
249       - legacy string, not ready:
250
251         * structure = PyUnicodeObject
252         * test: kind == PyUnicode_WCHAR_KIND
253         * length = 0 (use wstr_length)
254         * hash = -1
255         * kind = PyUnicode_WCHAR_KIND
256         * compact = 0
257         * ascii = 0
258         * ready = 0
259         * interned = SSTATE_NOT_INTERNED
260         * wstr is not NULL
261         * data.any is NULL
262         * utf8 is NULL
263         * utf8_length = 0
264
265       - legacy string, ready:
266
267         * structure = PyUnicodeObject structure
268         * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
269         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
270           PyUnicode_4BYTE_KIND
271         * compact = 0
272         * ready = 1
273         * data.any is not NULL
274         * utf8 is shared and utf8_length = length with data.any if ascii = 1
275         * utf8_length = 0 if utf8 is NULL
276         * wstr is shared with data.any and wstr_length = length
277           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
278           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
279         * wstr_length = 0 if wstr is NULL
280
281       Compact strings use only one memory block (structure + characters),
282       whereas legacy strings use one block for the structure and one block
283       for characters.
284
285       Legacy strings are created by PyUnicode_FromUnicode() and
286       PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
287       when PyUnicode_READY() is called.
288
289       See also _PyUnicode_CheckConsistency().
290    */
291    PyObject_HEAD
292    Py_ssize_t length;          /* Number of code points in the string */
293    Py_hash_t hash;             /* Hash value; -1 if not set */
294    struct {
295        /*
296           SSTATE_NOT_INTERNED (0)
297           SSTATE_INTERNED_MORTAL (1)
298           SSTATE_INTERNED_IMMORTAL (2)
299
300           If interned != SSTATE_NOT_INTERNED, the two references from the
301           dictionary to this object are *not* counted in ob_refcnt.
302         */
303        unsigned int interned:2;
304        /* Character size:
305
306           - PyUnicode_WCHAR_KIND (0):
307
308             * character type = wchar_t (16 or 32 bits, depending on the
309               platform)
310
311           - PyUnicode_1BYTE_KIND (1):
312
313             * character type = Py_UCS1 (8 bits, unsigned)
314             * all characters are in the range U+0000-U+00FF (latin1)
315             * if ascii is set, all characters are in the range U+0000-U+007F
316               (ASCII), otherwise at least one character is in the range
317               U+0080-U+00FF
318
319           - PyUnicode_2BYTE_KIND (2):
320
321             * character type = Py_UCS2 (16 bits, unsigned)
322             * all characters are in the range U+0000-U+FFFF (BMP)
323             * at least one character is in the range U+0100-U+FFFF
324
325           - PyUnicode_4BYTE_KIND (4):
326
327             * character type = Py_UCS4 (32 bits, unsigned)
328             * all characters are in the range U+0000-U+10FFFF
329             * at least one character is in the range U+10000-U+10FFFF
330         */
331        unsigned int kind:3;
332        /* Compact is with respect to the allocation scheme. Compact unicode
333           objects only require one memory block while non-compact objects use
334           one block for the PyUnicodeObject struct and another for its data
335           buffer. */
336        unsigned int compact:1;
337        /* The string only contains characters in the range U+0000-U+007F (ASCII)
338           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
339           set, use the PyASCIIObject structure. */
340        unsigned int ascii:1;
341        /* The ready flag indicates whether the object layout is initialized
342           completely. This means that this is either a compact object, or
343           the data pointer is filled out. The bit is redundant, and helps
344           to minimize the test in PyUnicode_IS_READY(). */
345        unsigned int ready:1;
346    } state;
347    wchar_t *wstr;              /* wchar_t representation (null-terminated) */
348} PyASCIIObject;
349
350/* Non-ASCII strings allocated through PyUnicode_New use the
351   PyCompactUnicodeObject structure. state.compact is set, and the data
352   immediately follow the structure. */
353typedef struct {
354    PyASCIIObject _base;
355    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
356                                 * terminating \0. */
357    char *utf8;                 /* UTF-8 representation (null-terminated) */
358    Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
359                                 * surrogates count as two code points. */
360} PyCompactUnicodeObject;
361
362/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
363   PyUnicodeObject structure. The actual string data is initially in the wstr
364   block, and copied into the data block using _PyUnicode_Ready. */
365typedef struct {
366    PyCompactUnicodeObject _base;
367    union {
368        void *any;
369        Py_UCS1 *latin1;
370        Py_UCS2 *ucs2;
371        Py_UCS4 *ucs4;
372    } data;                     /* Canonical, smallest-form Unicode buffer */
373} PyUnicodeObject;
374#endif
375
376PyAPI_DATA(PyTypeObject) PyUnicode_Type;
377PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
378
379#define PyUnicode_Check(op) \
380                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
381#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
382
383/* Fast access macros */
384#ifndef Py_LIMITED_API
385
/* Length of the wstr representation.  Compact ASCII objects are laid out as
   PyASCIIObject and have no wstr_length member, so their length field is
   used; every other object is read through PyCompactUnicodeObject.
   `op` is parenthesized wherever it is cast, so any pointer expression can
   be passed.  `op` is evaluated more than once. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII((op)) ?                \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)

/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request.  Use PyUnicode_GET_LENGTH() for the length in code points. */

#define PyUnicode_GET_SIZE(op)                       \
    (assert(PyUnicode_Check(op)),                    \
     (((PyASCIIObject *)(op))->wstr) ?               \
      PyUnicode_WSTR_LENGTH(op) :                    \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)),  \
       assert(((PyASCIIObject *)(op))->wstr),        \
       PyUnicode_WSTR_LENGTH(op)))

/* Size of the Py_UNICODE representation in bytes: the number of code units
   multiplied by Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)

/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ(). */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode((PyObject *)(op)))

/* The PyUnicode_AS_UNICODE() buffer viewed as const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
419
420
421/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
422
/* Values for PyASCIIObject.state: */

/* Interning state. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2

/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
   ready.  `op` is parenthesized at every use so that any pointer expression
   can be passed. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY((op))),           \
     ((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
447
/* Storage kinds of a Unicode object.  The 1/2/4 BYTE values are the ones
   returned by the PyUnicode_KIND() macro. */
enum PyUnicode_Kind {
    /* String contains only wstr byte characters.  This is only possible
       when the string was created with a legacy API and _PyUnicode_Ready()
       has not been called yet. */
    PyUnicode_WCHAR_KIND = 0,

    PyUnicode_1BYTE_KIND = 1,   /* characters stored as Py_UCS1 */
    PyUnicode_2BYTE_KIND = 2,   /* characters stored as Py_UCS2 */
    PyUnicode_4BYTE_KIND = 4    /* characters stored as Py_UCS4 */
};
458
/* Return one of the PyUnicode_*_KIND values defined above.  The object has
   to be ready. */
#define PyUnicode_KIND(op)                      \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)

/* Data pointer of a compact object: the characters start right behind the
   header, which is PyASCIIObject for ASCII strings and
   PyCompactUnicodeObject otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_ASCII(op) ?                           \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Data pointer of a non-compact object: the separately allocated data.any
   buffer, asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Return a void pointer to the raw unicode buffer. */
#define PyUnicode_DATA(op)                                      \
    (assert(PyUnicode_Check(op)),                               \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))

/* Return pointers to the canonical representation cast to unsigned char,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_KIND() before to ensure
   these will work correctly. */
#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
488
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Store the code point `value` at position `index` (counted from 0) of the
   canonical buffer `data`, whose element width is selected by `kind`.
   No sanity checks are done: the macro is meant for loops in which the
   caller has cached the kind and data pointer obtained from other macro
   calls.  Any kind other than 1BYTE or 2BYTE is asserted to be
   PyUnicode_4BYTE_KIND. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
515
/* Fetch the code point at `index` from the canonical buffer `data`, whose
   element width is given by `kind`; the result is a Py_UCS4.  No checks or
   ready calls are performed. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
     ((kind) == PyUnicode_1BYTE_KIND ? ((const Py_UCS1 *)(data))[(index)] : \
      (kind) == PyUnicode_2BYTE_KIND ? ((const Py_UCS2 *)(data))[(index)] : \
                                       ((const Py_UCS4 *)(data))[(index)]))

/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  Use it for single
   reads; for multiple consecutive reads, cache the kind and data pointer
   and use PyUnicode_READ() instead. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY(unicode)),       \
     PyUnicode_READ(PyUnicode_KIND((unicode)),  \
                    PyUnicode_DATA((unicode)),  \
                    (index)))
543
/* Returns the length field of the unicode string.  The caller has to make
   sure that the string has its canonical representation set before calling
   this macro; PyUnicode_READY() can be used to ensure that. */
#define PyUnicode_GET_LENGTH(op)        \
    (assert(PyUnicode_Check(op)),       \
     assert(PyUnicode_IS_READY(op)),    \
     ((PyASCIIObject *)(op))->length)
551
552
/* Fast check to determine whether an object is ready.  Equivalent to
   PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any.
   `op` is parenthesized before the cast so that any pointer expression can
   be passed. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)

/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case.  If the canonical representation is not yet set, it will still call
   _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     (PyUnicode_IS_READY((op)) ?                        \
      0 : _PyUnicode_Ready((PyObject *)(op))))
566
/* Return a maximum character value which is suitable for creating another
   string based on op.  The value is derived from the ascii flag and the
   kind only: 0x7f for ASCII, otherwise 0xff, 0xffff or 0x10ffff for the
   1-, 2- and 4-byte kinds.  This is always an approximation but more
   efficient than iterating over the string. */
#define PyUnicode_MAX_CHAR_VALUE(op)                                    \
    (assert(PyUnicode_IS_READY(op)),                                    \
     (PyUnicode_IS_ASCII(op)                     ? (0x7f) :             \
      PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? (0xffU) :            \
      PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? (0xffffU) :          \
                                                   (0x10ffffU)))
579
580#endif
581
582/* --- Constants ---------------------------------------------------------- */
583
/* Replacement character used during decoding when the errors argument is
   set to "replace".  Note: U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4)0xFFFD)
590
591/* === Public API ========================================================= */
592
593/* --- Plain Py_UNICODE --------------------------------------------------- */
594
595/* With PEP 393, this is the recommended way to allocate a new unicode object.
596   This function will allocate the object and its buffer in a single memory
597   block.  Objects created using this function are not resizable. */
598#ifndef Py_LIMITED_API
599PyAPI_FUNC(PyObject*) PyUnicode_New(
600    Py_ssize_t size,            /* Number of code points in the new string */
601    Py_UCS4 maxchar             /* maximum code point value in the string */
602    );
603#endif
604
605/* Initializes the canonical string representation from the deprecated
606   wstr/Py_UNICODE representation. This function is used to convert Unicode
607   objects which were created using the old API to the new flexible format
608   introduced with PEP 393.
609
610   Don't call this function directly, use the public PyUnicode_READY() macro
611   instead. */
612#ifndef Py_LIMITED_API
613PyAPI_FUNC(int) _PyUnicode_Ready(
614    PyObject *unicode           /* Unicode object */
615    );
616#endif
617
618/* Get a copy of a Unicode string. */
619#ifndef Py_LIMITED_API
620PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
621    PyObject *unicode
622    );
623#endif
624
625/* Copy characters from one unicode object into another. This function performs
626   character conversion when necessary and falls back to memcpy() if possible.
627
628   Fail if to is too small (smaller than *how_many* or smaller than
629   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
630   kind(to), or if *to* has more than 1 reference.
631
632   Return the number of characters written, or return -1 and raise an exception
633   on error.
634
635   Pseudo-code:
636
637       how_many = min(how_many, len(from) - from_start)
638       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
639       return how_many
640
641   Note: The function doesn't write a terminating null character.
642   */
643#ifndef Py_LIMITED_API
644PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
645    PyObject *to,
646    Py_ssize_t to_start,
647    PyObject *from,
648    Py_ssize_t from_start,
649    Py_ssize_t how_many
650    );
651
652/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
653   may crash if parameters are invalid (e.g. if the output string
654   is too short). */
655PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
656    PyObject *to,
657    Py_ssize_t to_start,
658    PyObject *from,
659    Py_ssize_t from_start,
660    Py_ssize_t how_many
661    );
662#endif
663
664#ifndef Py_LIMITED_API
665/* Fill a string with a character: write fill_char into
666   unicode[start:start+length].
667
668   Fail if fill_char is bigger than the string maximum character, or if the
669   string has more than 1 reference.
670
671   Return the number of characters written, or return -1 and raise an exception
672   on error. */
673PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
674    PyObject *unicode,
675    Py_ssize_t start,
676    Py_ssize_t length,
677    Py_UCS4 fill_char
678    );
679
680/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
681   if parameters are invalid (e.g. if length is longer than the string). */
682PyAPI_FUNC(void) _PyUnicode_FastFill(
683    PyObject *unicode,
684    Py_ssize_t start,
685    Py_ssize_t length,
686    Py_UCS4 fill_char
687    );
688#endif
689
690/* Create a Unicode Object from the Py_UNICODE buffer u of the given
691   size.
692
693   u may be NULL which causes the contents to be undefined. It is the
694   user's responsibility to fill in the needed data afterwards. Note
695   that modifying the Unicode object contents after construction is
696   only allowed if u was set to NULL.
697
698   The buffer is copied into the new object. */
699
700#ifndef Py_LIMITED_API
701PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
702    const Py_UNICODE *u,        /* Unicode buffer */
703    Py_ssize_t size             /* size of buffer */
704    );
705#endif
706
707/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
708PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
709    const char *u,             /* UTF-8 encoded string */
710    Py_ssize_t size            /* size of buffer */
711    );
712
713/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
714   UTF-8 encoded bytes.  The size is determined with strlen(). */
715PyAPI_FUNC(PyObject*) PyUnicode_FromString(
716    const char *u              /* UTF-8 encoded string */
717    );
718
719#ifndef Py_LIMITED_API
720/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
721   Scan the string to find the maximum character. */
722PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
723    int kind,
724    const void *buffer,
725    Py_ssize_t size);
726
727/* Create a new string from a buffer of ASCII characters.
728   WARNING: Don't check if the string contains any non-ASCII character. */
729PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
730    const char *buffer,
731    Py_ssize_t size);
732#endif
733
734PyAPI_FUNC(PyObject*) PyUnicode_Substring(
735    PyObject *str,
736    Py_ssize_t start,
737    Py_ssize_t end);
738
739#ifndef Py_LIMITED_API
740/* Compute the maximum character of the substring unicode[start:end].
741   Return 127 for an empty string. */
742PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
743    PyObject *unicode,
744    Py_ssize_t start,
745    Py_ssize_t end);
746#endif
747
748/* Copy the string into a UCS4 buffer including the null character if copy_null
749   is set. Return NULL and raise an exception on error. Raise a ValueError if
750   the buffer is smaller than the string. Return buffer on success.
751
752   buflen is the length of the buffer in (Py_UCS4) characters. */
753PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
754    PyObject *unicode,
755    Py_UCS4* buffer,
756    Py_ssize_t buflen,
757    int copy_null);
758
759/* Copy the string into a UCS4 buffer. A new buffer is allocated using
760 * PyMem_Malloc; if this fails, NULL is returned with a memory error
761   exception set. */
762PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
763
764/* Return a read-only pointer to the Unicode object's internal
765   Py_UNICODE buffer.
766   If the wchar_t/Py_UNICODE representation is not yet available, this
767   function will calculate it. */
768
769#ifndef Py_LIMITED_API
770PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
771    PyObject *unicode           /* Unicode object */
772    );
773#endif
774
775/* Return a read-only pointer to the Unicode object's internal
776   Py_UNICODE buffer and save the length at size.
777   If the wchar_t/Py_UNICODE representation is not yet available, this
778   function will calculate it. */
779
780#ifndef Py_LIMITED_API
781PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
782    PyObject *unicode,          /* Unicode object */
783    Py_ssize_t *size            /* location where to save the length */
784    );
785#endif
786
787/* Get the length of the Unicode object. */
788
789PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
790    PyObject *unicode
791);
792
793/* Get the number of Py_UNICODE units in the
794   string representation. */
795
796PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
797    PyObject *unicode           /* Unicode object */
798    );
799
800/* Read a character from the string. */
801
802PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
803    PyObject *unicode,
804    Py_ssize_t index
805    );
806
807/* Write a character to the string. The string must have been created through
808   PyUnicode_New, must not be shared, and must not have been hashed yet.
809
810   Return 0 on success, -1 on error. */
811
812PyAPI_FUNC(int) PyUnicode_WriteChar(
813    PyObject *unicode,
814    Py_ssize_t index,
815    Py_UCS4 character
816    );
817
818#ifndef Py_LIMITED_API
819/* Get the maximum ordinal for a Unicode character. */
820PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
821#endif
822
823/* Resize a Unicode object. The length is the number of characters, except
824   if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
825   is the number of Py_UNICODE characters.
826
827   *unicode is modified to point to the new (resized) object and 0
828   returned on success.
829
830   Try to resize the string in place (which is usually faster than allocating
831   a new string and copying characters), or create a new string.
832
833   Error handling is implemented as follows: an exception is set, -1
834   is returned and *unicode left untouched.
835
836   WARNING: The function doesn't check string content, the result may not be a
837            string in canonical representation. */
838
839PyAPI_FUNC(int) PyUnicode_Resize(
840    PyObject **unicode,         /* Pointer to the Unicode object */
841    Py_ssize_t length           /* New length */
842    );
843
844/* Coerce obj to a Unicode object and return a reference with
845   *incremented* refcount.
846
847   Coercion is done in the following way:
848
849   1. bytes, bytearray and other char buffer compatible objects are decoded
850      using encoding and errors; passing NULL for either selects the
851      default (UTF-8 encoding, "strict" error handling).
852
853   2. All other objects (including Unicode objects) raise an
854      exception.
855
856   The API returns NULL in case of an error. The caller is responsible
857   for decref'ing the returned object.
858
859*/
860
861PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
862    PyObject *obj,              /* Object */
863    const char *encoding,       /* encoding */
864    const char *errors          /* error handling */
865    );
866
866/* Coerce obj to a Unicode object and return a reference with
867   *incremented* refcount.
868
869   Unicode objects are passed back as-is (subclasses are converted to
870   true Unicode objects), all other objects are delegated to
871   PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
872   using UTF-8 encoding as basis for decoding the object.
873
874   The API returns NULL in case of an error. The caller is responsible
875   for decref'ing the returned object.
876
877*/
878
879PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
880    PyObject *obj               /* Object */
881    );
883
884PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
885    const char *format,   /* ASCII-encoded string  */
886    va_list vargs
887    );
888PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
889    const char *format,   /* ASCII-encoded string  */
890    ...
891    );
892
893#ifndef Py_LIMITED_API
/* State passed to the _PyUnicodeWriter_*() functions and to the
   _PyUnicodeWriter_Prepare() macro declared below. */
894typedef struct {
895    PyObject *buffer;           /* string being written; see readonly below */
896    void *data;                 /* NOTE(review): presumably the character data of buffer -- confirm */
897    enum PyUnicode_Kind kind;   /* NOTE(review): presumably the kind of buffer -- confirm */
898    Py_UCS4 maxchar;            /* _PyUnicodeWriter_Prepare() takes its fast path only if MAXCHAR <= maxchar */
899    Py_ssize_t size;            /* _PyUnicodeWriter_Prepare() compares LENGTH against size - pos, */
900    Py_ssize_t pos;             /* i.e. the room left between pos and size */
901    /* minimum length of the buffer when overallocation is enabled,
902       see _PyUnicodeWriter_Init() */
903    Py_ssize_t min_length;
904    unsigned char overallocate; /* NOTE(review): presumably non-zero enables overallocation -- confirm */
905    /* If readonly is 1, buffer is a shared string (cannot be modified)
906       and size is set to 0. */
907    unsigned char readonly;
908} _PyUnicodeWriter ;
909
910/* Initialize a Unicode writer.
911
912   If min_length is greater than zero, _PyUnicodeWriter_Prepare()
913   overallocates the buffer and min_length is the minimum length in characters
914   of the buffer. */
915PyAPI_FUNC(void)
916_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length);
917
918/* Prepare the buffer to write 'length' characters
919   with the specified maximum character.
920
921   Return 0 on success, raise an exception and return -1 on error.

   The macro evaluates to 0 without calling
   _PyUnicodeWriter_PrepareInternal() when MAXCHAR <= writer->maxchar and
   LENGTH <= writer->size - writer->pos, or when LENGTH is 0; otherwise it
   evaluates to the result of _PyUnicodeWriter_PrepareInternal().

   WRITER, LENGTH and MAXCHAR each appear several times in the expansion,
   so the arguments must not have side effects. */
922#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
923    (((MAXCHAR) <= (WRITER)->maxchar                                  \
924      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
925     ? 0                                                              \
926     : (((LENGTH) == 0)                                               \
927        ? 0                                                           \
928        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
929
930/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
931   instead. */
932PyAPI_FUNC(int)
933_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
934                                 Py_ssize_t length, Py_UCS4 maxchar);
935
936PyAPI_FUNC(int)
937_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str);
938
939PyAPI_FUNC(PyObject *)
940_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
941
942PyAPI_FUNC(void)
943_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
944#endif
945
946#ifndef Py_LIMITED_API
947/* Format the object based on the format_spec, as defined in PEP 3101
948   (Advanced String Formatting). */
949PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
950    _PyUnicodeWriter *writer,
951    PyObject *obj,
952    PyObject *format_spec,
953    Py_ssize_t start,
954    Py_ssize_t end);
955#endif
956
957PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
958PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
959PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
960    const char *u              /* UTF-8 encoded string */
961    );
962#ifndef Py_LIMITED_API
963PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
964#endif
965
966/* Read the state.interned field of op. op is cast to PyASCIIObject *
   without any type check, so use only if you know it's a string. */
967#define PyUnicode_CHECK_INTERNED(op) \
968    (((PyASCIIObject *)(op))->state.interned)
969
970/* --- wchar_t support for platforms which support it --------------------- */
971
972#ifdef HAVE_WCHAR_H
973
974/* Create a Unicode Object from the wchar_t buffer w of the given
975   size.
976
977   The buffer is copied into the new object. */
978
979PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
980    const wchar_t *w,           /* wchar_t buffer */
981    Py_ssize_t size             /* size of buffer */
982    );
983
984/* Copies the Unicode Object contents into the wchar_t buffer w.  At
985   most size wchar_t characters are copied.
986
987   Note that the resulting wchar_t string may or may not be
988   0-terminated.  It is the responsibility of the caller to make sure
989   that the wchar_t string is 0-terminated in case this is required by
990   the application.
991
992   Returns the number of wchar_t characters copied (excluding a
993   possibly trailing 0-termination character) or -1 in case of an
994   error. */
995
996PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
997    PyObject *unicode,          /* Unicode object */
998    wchar_t *w,                 /* wchar_t buffer */
999    Py_ssize_t size             /* size of buffer */
1000    );
1001
1002/* Convert the Unicode object to a wide character string. The output string
1003   always ends with a null character. If size is not NULL, write the number of
1004   wide characters (excluding the null character) into *size.
1005
1006   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
1007   on success. On error, raises a MemoryError and returns NULL; *size is
1008   undefined. */
1009
1010PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
1011    PyObject *unicode,          /* Unicode object */
1012    Py_ssize_t *size            /* number of characters of the result */
1013    );
1014
1015#ifndef Py_LIMITED_API
1016PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
1017#endif
1018
1019#endif
1020
1021/* --- Unicode ordinals --------------------------------------------------- */
1022
1023/* Create a Unicode Object from the given Unicode code point ordinal.
1024
1025   The ordinal must be in range(0x10000) on narrow Python builds
1026   (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
1027   raised in case it is not.
1028
1029*/
1030
1031PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
1032
1033/* --- Free-list management ----------------------------------------------- */
1034
1035/* Clear the free list used by the Unicode implementation.
1036
1037   This can be used to release memory used for objects on the free
1038   list back to the Python memory allocator.
1039
1040*/
1041
1042PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
1043
1044/* === Builtin Codecs =====================================================
1045
1046   Many of these APIs take two arguments encoding and errors. These
1047   parameters encoding and errors have the same semantics as the ones
1048   of the builtin str() API.
1049
1050   Setting encoding to NULL causes the default encoding (UTF-8) to be used.
1051
1052   Error handling is set by errors which may also be set to NULL
1053   meaning to use the default handling defined for the codec. Default
1054   error handling for all builtin codecs is "strict" (ValueErrors are
1055   raised).
1056
1057   The codecs all use a similar interface. Only deviations from the
1058   generic ones are documented.
1059
1060*/
1061
1062/* --- Manage the default encoding ---------------------------------------- */
1063
1064/* Returns a pointer to the default encoding (UTF-8) of the
1065   Unicode object unicode and the size of the encoded representation
1066   in bytes stored in *size.
1067
1068   In case of an error, *size is not set.
1069
1070   This function caches the UTF-8 encoded string in the unicodeobject
1071   and subsequent calls will return the same string.  The memory is released
1072   when the unicodeobject is deallocated.
1073
1074   _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
1075   support the previous internal function with the same behaviour.
1076
1077   *** This API is for interpreter INTERNAL USE ONLY and will likely
1078   *** be removed or changed in the future.
1079
1080   *** If you need to access the Unicode object as UTF-8 bytes string,
1081   *** please use PyUnicode_AsUTF8String() instead.
1082*/
1083
1084#ifndef Py_LIMITED_API
1085PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
1086    PyObject *unicode,
1087    Py_ssize_t *size);
1088#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
1089#endif
1090
1091/* Returns a pointer to the default encoding (UTF-8) of the
1092   Unicode object unicode.
1093
1094   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
1095   in the unicodeobject.
1096
1097   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
1098   support the previous internal function with the same behaviour.
1099
1100   Use of this API is DEPRECATED since no size information can be
1101   extracted from the returned data.
1102
1103   *** This API is for interpreter INTERNAL USE ONLY and will likely
1104   *** be removed or changed in the future.
1105
1106   *** If you need to access the Unicode object as UTF-8 bytes string,
1107   *** please use PyUnicode_AsUTF8String() instead.
1108
1109*/
1110
1111#ifndef Py_LIMITED_API
1112PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
1113#define _PyUnicode_AsString PyUnicode_AsUTF8
1114#endif
1115
1116/* Returns "utf-8".  */
1117
1118PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
1119
1120/* --- Generic Codecs ----------------------------------------------------- */
1121
1122/* Create a Unicode object by decoding the encoded string s of the
1123   given size. */
1124
1125PyAPI_FUNC(PyObject*) PyUnicode_Decode(
1126    const char *s,              /* encoded string */
1127    Py_ssize_t size,            /* size of buffer */
1128    const char *encoding,       /* encoding */
1129    const char *errors          /* error handling */
1130    );
1131
1132/* Decode a Unicode object unicode and return the result as Python
1133   object. */
1134
1135PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
1136    PyObject *unicode,          /* Unicode object */
1137    const char *encoding,       /* encoding */
1138    const char *errors          /* error handling */
1139    );
1140
1141/* Decode a Unicode object unicode and return the result as Unicode
1142   object. */
1143
1144PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
1145    PyObject *unicode,          /* Unicode object */
1146    const char *encoding,       /* encoding */
1147    const char *errors          /* error handling */
1148    );
1149
1150/* Encodes a Py_UNICODE buffer of the given size and returns a
1151   Python string object. */
1152
1153#ifndef Py_LIMITED_API
1154PyAPI_FUNC(PyObject*) PyUnicode_Encode(
1155    const Py_UNICODE *s,        /* Unicode char buffer */
1156    Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
1157    const char *encoding,       /* encoding */
1158    const char *errors          /* error handling */
1159    );
1160#endif
1161
1162/* Encodes a Unicode object and returns the result as Python
1163   object. */
1164
1165PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
1166    PyObject *unicode,          /* Unicode object */
1167    const char *encoding,       /* encoding */
1168    const char *errors          /* error handling */
1169    );
1170
1171/* Encodes a Unicode object and returns the result as Python string
1172   object. */
1173
1174PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
1175    PyObject *unicode,          /* Unicode object */
1176    const char *encoding,       /* encoding */
1177    const char *errors          /* error handling */
1178    );
1179
1180/* Encodes a Unicode object and returns the result as Unicode
1181   object. */
1182
1183PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
1184    PyObject *unicode,          /* Unicode object */
1185    const char *encoding,       /* encoding */
1186    const char *errors          /* error handling */
1187    );
1188
1189/* Build an encoding map. */
1190
1191PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1192    PyObject* string            /* 256 character map */
1193   );
1194
1195/* --- UTF-7 Codecs ------------------------------------------------------- */
1196
1197PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
1198    const char *string,         /* UTF-7 encoded string */
1199    Py_ssize_t length,          /* size of string */
1200    const char *errors          /* error handling */
1201    );
1202
1203PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
1204    const char *string,         /* UTF-7 encoded string */
1205    Py_ssize_t length,          /* size of string */
1206    const char *errors,         /* error handling */
1207    Py_ssize_t *consumed        /* bytes consumed */
1208    );
1209
1210#ifndef Py_LIMITED_API
1211PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
1212    const Py_UNICODE *data,     /* Unicode char buffer */
1213    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1214    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1215    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1216    const char *errors          /* error handling */
1217    );
1218PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1219    PyObject *unicode,          /* Unicode object */
1220    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1221    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1222    const char *errors          /* error handling */
1223    );
1224#endif
1225
1226/* --- UTF-8 Codecs ------------------------------------------------------- */
1227
1228PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
1229    const char *string,         /* UTF-8 encoded string */
1230    Py_ssize_t length,          /* size of string */
1231    const char *errors          /* error handling */
1232    );
1233
1234PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
1235    const char *string,         /* UTF-8 encoded string */
1236    Py_ssize_t length,          /* size of string */
1237    const char *errors,         /* error handling */
1238    Py_ssize_t *consumed        /* bytes consumed */
1239    );
1240
1241PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
1242    PyObject *unicode           /* Unicode object */
1243    );
1244
1245#ifndef Py_LIMITED_API
1246PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1247    PyObject *unicode,
1248    const char *errors);
1249
1250PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
1251    const Py_UNICODE *data,     /* Unicode char buffer */
1252    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1253    const char *errors          /* error handling */
1254    );
1255#endif
1256
1257/* --- UTF-32 Codecs ------------------------------------------------------ */
1258
1259/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1260   the corresponding Unicode object.
1261
1262   errors (if non-NULL) defines the error handling. It defaults
1263   to "strict".
1264
1265   If byteorder is non-NULL, the decoder starts decoding using the
1266   given byte order:
1267
1268    *byteorder == -1: little endian
1269    *byteorder == 0:  native order
1270    *byteorder == 1:  big endian
1271
1272   In native mode, the first four bytes of the stream are checked for a
1273   BOM mark. If found, the BOM mark is analysed, the byte order
1274   adjusted and the BOM skipped.  In the other modes, no BOM mark
1275   interpretation is done. After completion, *byteorder is set to the
1276   current byte order at the end of input data.
1277
1278   If byteorder is NULL, the codec starts in native order mode.
1279
1280*/
1281
1282PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
1283    const char *string,         /* UTF-32 encoded string */
1284    Py_ssize_t length,          /* size of string */
1285    const char *errors,         /* error handling */
1286    int *byteorder              /* pointer to byteorder to use
1287                                   0=native;-1=LE,1=BE; updated on
1288                                   exit */
1289    );
1290
1291PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
1292    const char *string,         /* UTF-32 encoded string */
1293    Py_ssize_t length,          /* size of string */
1294    const char *errors,         /* error handling */
1295    int *byteorder,             /* pointer to byteorder to use
1296                                   0=native;-1=LE,1=BE; updated on
1297                                   exit */
1298    Py_ssize_t *consumed        /* bytes consumed */
1299    );
1300
1301/* Returns a Python string using the UTF-32 encoding in native byte
1302   order. The string always starts with a BOM mark.  */
1303
1304PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
1305    PyObject *unicode           /* Unicode object */
1306    );
1307
1308/* Returns a Python string object holding the UTF-32 encoded value of
1309   the Unicode data.
1310
1311   The output is written according to the value of byteorder, as
1312   follows:
1313
1314   byteorder == -1: little endian
1315   byteorder == 0:  native byte order (writes a BOM mark)
1316   byteorder == 1:  big endian
1317
1318   If byteorder is 0, the output string will always start with the
1319   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1320   prepended.
1321
1322*/
1323
1324#ifndef Py_LIMITED_API
1325PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
1326    const Py_UNICODE *data,     /* Unicode char buffer */
1327    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1328    const char *errors,         /* error handling */
1329    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1330    );
1331PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1332    PyObject *object,           /* Unicode object */
1333    const char *errors,         /* error handling */
1334    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1335    );
1336#endif
1337
1338/* --- UTF-16 Codecs ------------------------------------------------------ */
1339
1340/* Decodes length bytes from a UTF-16 encoded buffer string and returns
1341   the corresponding Unicode object.
1342
1343   errors (if non-NULL) defines the error handling. It defaults
1344   to "strict".
1345
1346   If byteorder is non-NULL, the decoder starts decoding using the
1347   given byte order:
1348
1349    *byteorder == -1: little endian
1350    *byteorder == 0:  native order
1351    *byteorder == 1:  big endian
1352
1353   In native mode, the first two bytes of the stream are checked for a
1354   BOM mark. If found, the BOM mark is analysed, the byte order
1355   adjusted and the BOM skipped.  In the other modes, no BOM mark
1356   interpretation is done. After completion, *byteorder is set to the
1357   current byte order at the end of input data.
1358
1359   If byteorder is NULL, the codec starts in native order mode.
1360
1361*/
1362
1363PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
1364    const char *string,         /* UTF-16 encoded string */
1365    Py_ssize_t length,          /* size of string */
1366    const char *errors,         /* error handling */
1367    int *byteorder              /* pointer to byteorder to use
1368                                   0=native;-1=LE,1=BE; updated on
1369                                   exit */
1370    );
1371
1372PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
1373    const char *string,         /* UTF-16 encoded string */
1374    Py_ssize_t length,          /* size of string */
1375    const char *errors,         /* error handling */
1376    int *byteorder,             /* pointer to byteorder to use
1377                                   0=native;-1=LE,1=BE; updated on
1378                                   exit */
1379    Py_ssize_t *consumed        /* bytes consumed */
1380    );
1381
1382/* Returns a Python string using the UTF-16 encoding in native byte
1383   order. The string always starts with a BOM mark.  */
1384
1385PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
1386    PyObject *unicode           /* Unicode object */
1387    );
1388
1389/* Returns a Python string object holding the UTF-16 encoded value of
1390   the Unicode data.
1391
1392   The output is written according to the value of byteorder, as
1393   follows:
1394
1395   byteorder == -1: little endian
1396   byteorder == 0:  native byte order (writes a BOM mark)
1397   byteorder == 1:  big endian
1398
1399   If byteorder is 0, the output string will always start with the
1400   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1401   prepended.
1402
1403   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1404   UCS-2. This trick makes it possible to add full UTF-16 capabilities
1405   at a later point without compromising the APIs.
1406
1407*/
1408
1409#ifndef Py_LIMITED_API
1410PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
1411    const Py_UNICODE *data,     /* Unicode char buffer */
1412    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1413    const char *errors,         /* error handling */
1414    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1415    );
1416PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1417    PyObject* unicode,          /* Unicode object */
1418    const char *errors,         /* error handling */
1419    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1420    );
1421#endif
1422
1423/* --- Unicode-Escape Codecs ---------------------------------------------- */
1424
1425PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
1426    const char *string,         /* Unicode-Escape encoded string */
1427    Py_ssize_t length,          /* size of string */
1428    const char *errors          /* error handling */
1429    );
1430
1431PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
1432    PyObject *unicode           /* Unicode object */
1433    );
1434
1435#ifndef Py_LIMITED_API
1436PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
1437    const Py_UNICODE *data,     /* Unicode char buffer */
1438    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1439    );
1440#endif
1441
1442/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1443
1444PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
1445    const char *string,         /* Raw-Unicode-Escape encoded string */
1446    Py_ssize_t length,          /* size of string */
1447    const char *errors          /* error handling */
1448    );
1449
1450PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
1451    PyObject *unicode           /* Unicode object */
1452    );
1453
1454#ifndef Py_LIMITED_API
1455PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
1456    const Py_UNICODE *data,     /* Unicode char buffer */
1457    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1458    );
1459#endif
1460
1461/* --- Unicode Internal Codec ---------------------------------------------
1462
1463    Only for internal use in _codecsmodule.c */
1464
1465#ifndef Py_LIMITED_API
1466PyObject *_PyUnicode_DecodeUnicodeInternal(
1467    const char *string,
1468    Py_ssize_t length,
1469    const char *errors
1470    );
1471#endif
1472
1473/* --- Latin-1 Codecs -----------------------------------------------------
1474
1475   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1476
1477*/
1478
1479PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
1480    const char *string,         /* Latin-1 encoded string */
1481    Py_ssize_t length,          /* size of string */
1482    const char *errors          /* error handling */
1483    );
1484
1485PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
1486    PyObject *unicode           /* Unicode object */
1487    );
1488
1489#ifndef Py_LIMITED_API
1490PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1491    PyObject* unicode,
1492    const char* errors);
1493
1494PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
1495    const Py_UNICODE *data,     /* Unicode char buffer */
1496    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1497    const char *errors          /* error handling */
1498    );
1499#endif
1500
1501/* --- ASCII Codecs -------------------------------------------------------
1502
1503   Only 7-bit ASCII data is accepted. All other codes generate errors.
1504
1505*/
1506
1507PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1508    const char *string,         /* ASCII encoded string */
1509    Py_ssize_t length,          /* size of string */
1510    const char *errors          /* error handling */
1511    );
1512
1513PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1514    PyObject *unicode           /* Unicode object */
1515    );
1516
1517#ifndef Py_LIMITED_API
1518PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1519    PyObject* unicode,
1520    const char* errors);
1521
1522PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1523    const Py_UNICODE *data,     /* Unicode char buffer */
1524    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1525    const char *errors          /* error handling */
1526    );
1527#endif
1528
1529/* --- Character Map Codecs -----------------------------------------------
1530
1531   This codec uses mappings to encode and decode characters.
1532
1533   Decoding mappings must map single string characters to single
1534   Unicode characters, integers (which are then interpreted as Unicode
1535   ordinals) or None (meaning "undefined mapping" and causing an
1536   error).
1537
1538   Encoding mappings must map single Unicode characters to single
1539   string characters, integers (which are then interpreted as Latin-1
1540   ordinals) or None (meaning "undefined mapping" and causing an
1541   error).
1542
1543   If a character lookup fails with a LookupError, the character is
1544   copied as-is meaning that its ordinal value will be interpreted as
1545   Unicode or Latin-1 ordinal resp. Because of this mappings only need
1546   to contain those mappings which map characters to different code
1547   points.
1548
1549*/
1550
1551PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1552    const char *string,         /* Encoded string */
1553    Py_ssize_t length,          /* size of string */
1554    PyObject *mapping,          /* character mapping
1555                                   (char ordinal -> unicode ordinal) */
1556    const char *errors          /* error handling */
1557    );
1558
1559PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1560    PyObject *unicode,          /* Unicode object */
1561    PyObject *mapping           /* character mapping
1562                                   (unicode ordinal -> char ordinal) */
1563    );
1564
1565#ifndef Py_LIMITED_API
1566PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1567    const Py_UNICODE *data,     /* Unicode char buffer */
1568    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1569    PyObject *mapping,          /* character mapping
1570                                   (unicode ordinal -> char ordinal) */
1571    const char *errors          /* error handling */
1572    );
1573PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1574    PyObject *unicode,          /* Unicode object */
1575    PyObject *mapping,          /* character mapping
1576                                   (unicode ordinal -> char ordinal) */
1577    const char *errors          /* error handling */
1578    );
1579#endif
1580
1581/* Translate a Py_UNICODE buffer of the given length by applying a
1582   character mapping table to it and return the resulting Unicode
1583   object.
1584
1585   The mapping table must map Unicode ordinal integers to Unicode
1586   ordinal integers or None (causing deletion of the character).
1587
1588   Mapping tables may be dictionaries or sequences. Unmapped character
1589   ordinals (ones which cause a LookupError) are left untouched and
1590   are copied as-is.
1591
1592*/
1593
1594#ifndef Py_LIMITED_API
1595PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1596    const Py_UNICODE *data,     /* Unicode char buffer */
1597    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1598    PyObject *table,            /* Translate table */
1599    const char *errors          /* error handling */
1600    );
1601#endif
1602
1603#ifdef HAVE_MBCS
1604
1605/* --- MBCS codecs for Windows -------------------------------------------- */
1606
1607PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1608    const char *string,         /* MBCS encoded string */
1609    Py_ssize_t length,              /* size of string */
1610    const char *errors          /* error handling */
1611    );
1612
1613PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1614    const char *string,         /* MBCS encoded string */
1615    Py_ssize_t length,          /* size of string */
1616    const char *errors,         /* error handling */
1617    Py_ssize_t *consumed        /* bytes consumed */
1618    );
1619
1620PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1621    int code_page,              /* code page number */
1622    const char *string,         /* encoded string */
1623    Py_ssize_t length,          /* size of string */
1624    const char *errors,         /* error handling */
1625    Py_ssize_t *consumed        /* bytes consumed */
1626    );
1627
1628PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1629    PyObject *unicode           /* Unicode object */
1630    );
1631
1632#ifndef Py_LIMITED_API
1633PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1634    const Py_UNICODE *data,     /* Unicode char buffer */
1635    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1636    const char *errors          /* error handling */
1637    );
1638#endif
1639
1640PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1641    int code_page,              /* code page number */
1642    PyObject *unicode,          /* Unicode object */
1643    const char *errors          /* error handling */
1644    );
1645
1646#endif /* HAVE_MBCS */
1647
1648/* --- Decimal Encoder ---------------------------------------------------- */
1649
1650/* Takes a Unicode string holding a decimal value and writes it into
1651   an output buffer using standard ASCII digit codes.
1652
1653   The output buffer has to provide at least length+1 bytes of storage
1654   area. The output string is 0-terminated.
1655
1656   The encoder converts whitespace to ' ', decimal characters to their
1657   corresponding ASCII digit and all other Latin-1 characters except
1658   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1659   are treated as errors. This includes embedded NUL characters.
1660
1661   Error handling is defined by the errors argument:
1662
1663      NULL or "strict": raise a ValueError
1664      "ignore": ignore the wrong characters (these are not copied to the
1665                output buffer)
1666      "replace": replaces illegal characters with '?'
1667
1668   Returns 0 on success, -1 on failure.
1669
1670*/
1671
1672#ifndef Py_LIMITED_API
1673PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1674    Py_UNICODE *s,              /* Unicode buffer */
1675    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1676    char *output,               /* Output buffer; must have size >= length+1 */
1677    const char *errors          /* error handling */
1678    );
1679#endif
1680
1681/* Transforms code points that have decimal digit property to the
1682   corresponding ASCII digit code points.
1683
1684   Returns a new Unicode string on success, NULL on failure.
1685*/
1686
1687#ifndef Py_LIMITED_API
1688PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1689    Py_UNICODE *s,              /* Unicode buffer */
1690    Py_ssize_t length           /* Number of Py_UNICODE chars to transform */
1691    );
1692#endif
1693
1694/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
1695   as argument instead of a raw buffer and length.  This function additionally
1696   transforms spaces to ASCII because this is what the callers in longobject,
1697   floatobject, and complexobject did anyways. */
1698
1699#ifndef Py_LIMITED_API
1700PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1701    PyObject *unicode           /* Unicode object */
1702    );
1703#endif
1704
1705/* --- Locale encoding --------------------------------------------------- */
1706
1707/* Decode a string from the current locale encoding. The error handler is
1708   selected by *errors*: "strict" makes the decoder strict, "surrogateescape"
1709   (PEP 383) escapes undecodable bytes. If a byte sequence can be decoded as
1710   a surrogate character and the "surrogateescape" error handler is in use,
1711   the byte sequence is escaped using that error handler instead of being
1712   decoded. *str* must end with a null character but cannot contain embedded
1713   null characters. */
1714
1715PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1716    const char *str,            /* locale-encoded string; null-terminated */
1717    Py_ssize_t len,             /* length of str */
1718    const char *errors);        /* error handling */
1719
1720/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1721   length using strlen(). */
1722
1723PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1724    const char *str,            /* locale-encoded string; null-terminated */
1725    const char *errors);        /* error handling */
1726
1727/* Encode a Unicode object to the current locale encoding. The error handler
1728   is selected by *errors*: "strict" makes the encoder strict, while
1729   "surrogateescape" selects that error handler. Return a bytes object. The
1730   string cannot contain embedded null characters. */
1731
1732PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1733    PyObject *unicode,          /* Unicode object */
1734    const char *errors          /* error handling */
1735    );
1736
1737/* --- File system encoding ---------------------------------------------- */
1738
1739/* ParseTuple converter: encode str objects to bytes using
1740   PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
1741
1742PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1743
1744/* ParseTuple converter: decode bytes objects to unicode using
1745   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1746
1747PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1748
1749/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1750   and the "surrogateescape" error handler.
1751
1752   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1753   encoding.
1754
1755   Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
1756*/
1757
1758PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1759    const char *s               /* encoded string */
1760    );
1761
1762/* Decode a string using Py_FileSystemDefaultEncoding
1763   and the "surrogateescape" error handler.
1764
1765   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1766   encoding.
1767*/
1768
1769PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1770    const char *s,               /* encoded string */
1771    Py_ssize_t size              /* size */
1772    );
1773
1774/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
1775   "surrogateescape" error handler, and return bytes.
1776
1777   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1778   encoding.
1779*/
1780
1781PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1782    PyObject *unicode
1783    );
1784
1785/* --- Methods & Slots ----------------------------------------------------
1786
1787   These are capable of handling Unicode objects and strings on input
1788   (we refer to them as strings in the descriptions) and return
1789   Unicode objects or integers as appropriate. */
1790
1791/* Concat two strings giving a new Unicode string. */
1792
1793PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1794    PyObject *left,             /* Left string */
1795    PyObject *right             /* Right string */
1796    );
1797
1798/* Concat two strings and put the result in *pleft
1799   (sets *pleft to NULL on error) */
1800
1801PyAPI_FUNC(void) PyUnicode_Append(
1802    PyObject **pleft,           /* Pointer to left string */
1803    PyObject *right             /* Right string */
1804    );
1805
1806/* Concat two strings, put the result in *pleft and drop the right object
1807   (sets *pleft to NULL on error) */
1808
1809PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1810    PyObject **pleft,           /* Pointer to left string */
1811    PyObject *right             /* Right string */
1812    );
1813
1814/* Split a string giving a list of Unicode strings.
1815
1816   If sep is NULL, splitting will be done at all whitespace
1817   substrings. Otherwise, splits occur at the given separator.
1818
1819   At most maxsplit splits will be done. If negative, no limit is set.
1820
1821   Separators are not included in the resulting list.
1822
1823*/
1824
1825PyAPI_FUNC(PyObject*) PyUnicode_Split(
1826    PyObject *s,                /* String to split */
1827    PyObject *sep,              /* String separator */
1828    Py_ssize_t maxsplit         /* Maxsplit count */
1829    );
1830
1831/* Split a string at line breaks, giving a list of Unicode strings.
1832
1833   CRLF is considered to be one line break. Line breaks are not
1834   included in the resulting list unless keepends is true. */
1835
1836PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1837    PyObject *s,                /* String to split */
1838    int keepends                /* If true, line end markers are included */
1839    );
1840
1841/* Partition a string using a given separator. */
1842
1843PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1844    PyObject *s,                /* String to partition */
1845    PyObject *sep               /* String separator */
1846    );
1847
1848/* Partition a string using a given separator, searching from the end of the
1849   string. */
1850
1851PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1852    PyObject *s,                /* String to partition */
1853    PyObject *sep               /* String separator */
1854    );
1855
1856/* Split a string giving a list of Unicode strings.
1857
1858   If sep is NULL, splitting will be done at all whitespace
1859   substrings. Otherwise, splits occur at the given separator.
1860
1861   At most maxsplit splits will be done; if maxsplit is negative, no
1862   limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1863   the end of the string.
1864
1865   Separators are not included in the resulting list.
1866
1867*/
1868
1869PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1870    PyObject *s,                /* String to split */
1871    PyObject *sep,              /* String separator */
1872    Py_ssize_t maxsplit         /* Maxsplit count */
1873    );
1874
1875/* Translate a string by applying a character mapping table to it and
1876   return the resulting Unicode object.
1877
1878   The mapping table must map Unicode ordinal integers to Unicode
1879   ordinal integers or None (causing deletion of the character).
1880
1881   Mapping tables may be dictionaries or sequences. Unmapped character
1882   ordinals (ones which cause a LookupError) are left untouched and
1883   are copied as-is.
1884
1885*/
1886
1887PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1888    PyObject *str,              /* String */
1889    PyObject *table,            /* Translate table */
1890    const char *errors          /* error handling */
1891    );
1892
1893/* Join a sequence of strings using the given separator and return
1894   the resulting Unicode string. */
1895
1896PyAPI_FUNC(PyObject*) PyUnicode_Join(
1897    PyObject *separator,        /* Separator string */
1898    PyObject *seq               /* Sequence object */
1899    );
1900
1901/* Return 1 if substr matches str[start:end] at the given tail end, 0
1902   otherwise. */
1903
1904PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1905    PyObject *str,              /* String */
1906    PyObject *substr,           /* Prefix or Suffix string */
1907    Py_ssize_t start,           /* Start index */
1908    Py_ssize_t end,             /* Stop index */
1909    int direction               /* Tail end: -1 prefix, +1 suffix */
1910    );
1911
1912/* Return the first position of substr in str[start:end] using the
1913   given search direction or -1 if not found. -2 is returned in case
1914   an error occurred and an exception is set. */
1915
1916PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1917    PyObject *str,              /* String */
1918    PyObject *substr,           /* Substring to find */
1919    Py_ssize_t start,           /* Start index */
1920    Py_ssize_t end,             /* Stop index */
1921    int direction               /* Find direction: +1 forward, -1 backward */
1922    );
1923
1924/* Like PyUnicode_Find, but search for single character only. */
1925PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1926    PyObject *str,              /* String */
1927    Py_UCS4 ch,                 /* Character to find */
1928    Py_ssize_t start,           /* Start index */
1929    Py_ssize_t end,             /* Stop index */
1930    int direction               /* Find direction: +1 forward, -1 backward */
1931    );
1932
1933/* Count the number of occurrences of substr in str[start:end]. */
1934
1935PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1936    PyObject *str,              /* String */
1937    PyObject *substr,           /* Substring to count */
1938    Py_ssize_t start,           /* Start index */
1939    Py_ssize_t end              /* Stop index */
1940    );
1941
1942/* Replace at most maxcount occurrences of substr in str with replstr
1943   and return the resulting Unicode object. */
1944
1945PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1946    PyObject *str,              /* String */
1947    PyObject *substr,           /* Substring to find */
1948    PyObject *replstr,          /* Substring to replace */
1949    Py_ssize_t maxcount         /* Max. number of replacements to apply;
1950                                   -1 = all */
1951    );
1952
1953/* Compare two strings and return -1, 0, 1 for less than, equal,
1954   greater than resp.
1955   Raise an exception and return -1 on error. */
1956
1957PyAPI_FUNC(int) PyUnicode_Compare(
1958    PyObject *left,             /* Left string */
1959    PyObject *right             /* Right string */
1960    );
1961
1962PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1963    PyObject *left,             /* Left string */
1964    const char *right           /* ASCII-encoded string */
1965    );
1966
1967/* Rich compare two strings and return one of the following:
1968
1969   - NULL in case an exception was raised
1970   - Py_True or Py_False for successful comparisons
1971   - Py_NotImplemented in case the type combination is unknown
1972
1973   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1974   case the conversion of the arguments to Unicode fails with a
1975   UnicodeDecodeError.
1976
1977   Possible values for op:
1978
1979     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1980
1981*/
1982
1983PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1984    PyObject *left,             /* Left string */
1985    PyObject *right,            /* Right string */
1986    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1987    );
1988
1989/* Apply an argument tuple or dictionary to a format string and return
1990   the resulting Unicode string. */
1991
1992PyAPI_FUNC(PyObject *) PyUnicode_Format(
1993    PyObject *format,           /* Format string */
1994    PyObject *args              /* Argument tuple or dictionary */
1995    );
1996
1997/* Checks whether element is contained in container and returns 1/0
1998   accordingly.
1999
2000   element has to coerce to a one-element Unicode string. -1 is
2001   returned in case of an error. */
2002
2003PyAPI_FUNC(int) PyUnicode_Contains(
2004    PyObject *container,        /* Container string */
2005    PyObject *element           /* Element string */
2006    );
2007
2008/* Checks whether the string contains any NUL characters. */
2009
2010#ifndef Py_LIMITED_API
2011PyAPI_FUNC(int) _PyUnicode_HasNULChars(PyObject *);
2012#endif
2013
2014/* Checks whether argument is a valid identifier. */
2015
2016PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
2017
2018#ifndef Py_LIMITED_API
2019/* Externally visible for str.strip(unicode) */
2020PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
2021    PyObject *self,             /* presumably the string to strip -- confirm */
2022    int striptype,              /* NOTE(review): accepted values are not visible in this header */
2023    PyObject *sepobj            /* presumably the characters to strip -- confirm */
2024    );
2025#endif
2026
2027/* Using explicit passed-in values, insert the thousands grouping
2028   into the string pointed to by buffer.  For the argument descriptions,
2029   see Objects/stringlib/localeutil.h */
2030#ifndef Py_LIMITED_API
2031PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
2032    PyObject *unicode,
2033    Py_ssize_t index,
2034    Py_ssize_t n_buffer,
2035    void *digits,
2036    Py_ssize_t n_digits,
2037    Py_ssize_t min_width,
2038    const char *grouping,
2039    PyObject *thousands_sep,
2040    Py_UCS4 *maxchar);
2041#endif
2042/* === Characters Type APIs =============================================== */
2043
2044/* Helper array used by Py_UNICODE_ISSPACE(). */
2045
2046#ifndef Py_LIMITED_API
2047PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
2048
2049/* These should not be used directly. Use the Py_UNICODE_IS* and
2050   Py_UNICODE_TO* macros instead.
2051
2052   These APIs are implemented in Objects/unicodectype.c.
2053
2054*/
2055
2056PyAPI_FUNC(int) _PyUnicode_IsLowercase(
2057    Py_UCS4 ch       /* Unicode character */
2058    );
2059
2060PyAPI_FUNC(int) _PyUnicode_IsUppercase(
2061    Py_UCS4 ch       /* Unicode character */
2062    );
2063
2064PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
2065    Py_UCS4 ch       /* Unicode character */
2066    );
2067
2068PyAPI_FUNC(int) _PyUnicode_IsXidStart(
2069    Py_UCS4 ch       /* Unicode character */
2070    );
2071
2072PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
2073    Py_UCS4 ch       /* Unicode character */
2074    );
2075
2076PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
2077    Py_UCS4 ch       /* Unicode character */
2078    );
2079
2080PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
2081    Py_UCS4 ch       /* Unicode character */
2082    );
2083
2084PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
2085    Py_UCS4 ch       /* Unicode character */
2086    );
2087
2088PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
2089    Py_UCS4 ch       /* Unicode character */
2090    );
2091
2092PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
2093    Py_UCS4 ch       /* Unicode character */
2094    );
2095
2096PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
2097    Py_UCS4 ch,       /* Unicode character */
2098    Py_UCS4 *res      /* presumably the output buffer; capacity TODO confirm */
2099    );
2100
2101PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
2102    Py_UCS4 ch,       /* Unicode character */
2103    Py_UCS4 *res      /* presumably the output buffer; capacity TODO confirm */
2104    );
2105
2106PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
2107    Py_UCS4 ch,       /* Unicode character */
2108    Py_UCS4 *res      /* presumably the output buffer; capacity TODO confirm */
2109    );
2110
2111PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
2112    Py_UCS4 ch,       /* Unicode character */
2113    Py_UCS4 *res      /* presumably the output buffer; capacity TODO confirm */
2114    );
2115
2116PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
2117    Py_UCS4 ch         /* Unicode character */
2118    );
2119
2120PyAPI_FUNC(int) _PyUnicode_IsCased(
2121    Py_UCS4 ch         /* Unicode character */
2122    );
2123
2124PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
2125    Py_UCS4 ch       /* Unicode character */
2126    );
2127
2128PyAPI_FUNC(int) _PyUnicode_ToDigit(
2129    Py_UCS4 ch       /* Unicode character */
2130    );
2131
2132PyAPI_FUNC(double) _PyUnicode_ToNumeric(
2133    Py_UCS4 ch       /* Unicode character */
2134    );
2135
2136PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
2137    Py_UCS4 ch       /* Unicode character */
2138    );
2139
2140PyAPI_FUNC(int) _PyUnicode_IsDigit(
2141    Py_UCS4 ch       /* Unicode character */
2142    );
2143
2144PyAPI_FUNC(int) _PyUnicode_IsNumeric(
2145    Py_UCS4 ch       /* Unicode character */
2146    );
2147
2148PyAPI_FUNC(int) _PyUnicode_IsPrintable(
2149    Py_UCS4 ch       /* Unicode character */
2150    );
2151
2152PyAPI_FUNC(int) _PyUnicode_IsAlpha(
2153    Py_UCS4 ch       /* Unicode character */
2154    );
2155
2156PyAPI_FUNC(size_t) Py_UNICODE_strlen(
2157    const Py_UNICODE *u         /* read-only; presumably 0-terminated, as for strlen() -- confirm */
2158    );
2159
2160PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
2161    Py_UNICODE *s1,             /* presumably the destination, as for strcpy() -- confirm */
2162    const Py_UNICODE *s2);      /* read-only input */
2163
2164PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
2165    Py_UNICODE *s1, const Py_UNICODE *s2);  /* s2 is read-only; s1 presumably the destination, as for strcat() -- confirm */
2166
2167PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
2168    Py_UNICODE *s1,             /* presumably the destination, as for strncpy() -- confirm */
2169    const Py_UNICODE *s2,       /* read-only input */
2170    size_t n);                  /* NOTE(review): presumably a count of Py_UNICODE units, not bytes -- confirm */
2171
2172PyAPI_FUNC(int) Py_UNICODE_strcmp(
2173    const Py_UNICODE *s1,       /* read-only; presumably 0-terminated, as for strcmp() -- confirm */
2174    const Py_UNICODE *s2        /* read-only; presumably 0-terminated, as for strcmp() -- confirm */
2175    );
2176
2177PyAPI_FUNC(int) Py_UNICODE_strncmp(
2178    const Py_UNICODE *s1,       /* read-only input */
2179    const Py_UNICODE *s2,       /* read-only input */
2180    size_t n                    /* NOTE(review): presumably a count of Py_UNICODE units, not bytes -- confirm */
2181    );
2182
2183PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
2184    const Py_UNICODE *s,        /* read-only input; note the result pointer is not const */
2185    Py_UNICODE c                /* presumably the character to search for, as for strchr() -- confirm */
2186    );
2187
2188PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
2189    const Py_UNICODE *s,        /* read-only input; note the result pointer is not const */
2190    Py_UNICODE c                /* presumably the character to search for, as for strrchr() -- confirm */
2191    );
2192
2193/* Create a copy of a unicode string ending with a nul character. Return NULL
2194   and raise a MemoryError exception on memory allocation failure, otherwise
2195   return a new allocated buffer (use PyMem_Free() to free the buffer). */
2196
2197PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
2198    PyObject *unicode
2199    );
2200#endif /* Py_LIMITED_API */
2201
2202#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
2203PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
2204    PyObject *op,               /* object to check */
2205    int check_content);         /* NOTE(review): presumably also checks the character data when non-zero -- confirm */
2206#endif /* Py_DEBUG && !Py_LIMITED_API */
2207
2208/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
2209PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2210/* Clear all static strings. */
2211PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2212
2213#ifdef __cplusplus
2214}
2215#endif
2216#endif /* !Py_UNICODEOBJECT_H */
2217