unicodeobject.h revision a3b334da6dd0477e5bf144934d184bc0b3e3779b
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>
5
6/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
12
13Copyright (c) Corporation for National Research Initiatives.
14
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python.  This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
32 *
33 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
35 *
36 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
39 *
40 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
48 *
49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
58#include <ctype.h>
59
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
64/* Python 3.x requires unicode */
65#define Py_USING_UNICODE
66
67#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
69#endif
70
71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74   Otherwise, Unicode strings are stored as UCS-2 (with limited support
75   for UTF-16) */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
79#endif
80
81/* Set these flags if the platform has "wchar.h" and the
82   wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and replaced with a
   typedef to wchar_t. */

#ifndef Py_LIMITED_API
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
#endif
95
/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar(),
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */

/* HAVE_USABLE_WCHAR_T switches HAVE_WCHAR_H on when it is not set yet. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif

/* HAVE_MBCS is defined whenever MS_WINDOWS is. */
#if defined(MS_WINDOWS)
#  define HAVE_MBCS
#endif

#ifdef HAVE_WCHAR_H
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
# ifdef _HAVE_BSDI
#  include <time.h>
# endif
# include <wchar.h>
#endif
117
118/* Py_UCS4 and Py_UCS2 are typdefs for the respecitve
119   unicode representations. */
120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
122#elif SIZEOF_LONG >= 4
123typedef unsigned long Py_UCS4;
124#else
125#error "Could not find a proper typedef for Py_UCS4"
126#endif
127
128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
131/* --- Internal Unicode Operations ---------------------------------------- */
132
133/* Since splitting on whitespace is an important use case, and
134   whitespace in most situations is solely ASCII whitespace, we
135   optimize for the common case by using a quick look-up table
136   _Py_ascii_whitespace (see below) with an inlined check.
137
138 */
139#ifndef Py_LIMITED_API
/* Whitespace test.  Values below 128 are looked up in the
   _Py_ascii_whitespace table; all others go through
   _PyUnicode_IsWhitespace().  ch is evaluated more than once. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

/* The macros below are plain aliases for the _PyUnicode_* helper with the
   matching name; ch is passed through unchanged. */

#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if any of the alpha, decimal, digit or numeric tests succeeds.
   The tests short-circuit, so ch may be evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
168
/* Copy length Py_UNICODE code units from source to target via Py_MEMCPY.
   length counts code units, not bytes. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
171
/* Store value into the first length elements of the Py_UNICODE buffer
   target.  target, value and length are each evaluated exactly once;
   length is captured before the loop so an argument with side effects
   is not re-run on every iteration. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
176
/* macros to work with surrogates.  ch is parenthesized in every use so
   that expression arguments (e.g. a | b) keep their meaning; note that
   ch is still evaluated twice. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value:
   the low 10 bits of high and of low are combined and 0x10000 is added. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The comparison runs on the wstr (Py_UNICODE) buffers: first code unit,
   last code unit, then a memcmp over the whole substring.  The wstr
   pointer is read through PyASCIIObject and the length through
   PyUnicode_WSTR_LENGTH(), because PyUnicodeObject has no direct wstr or
   wstr_length member.  No check is made that wstr is non-NULL
   (NOTE(review): callers presumably have to ensure the wstr
   representation exists first — confirm).  Arguments are evaluated
   several times. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*(((PyASCIIObject *)(string))->wstr + (offset)) == \
      *(((PyASCIIObject *)(substring))->wstr)) && \
     (*(((PyASCIIObject *)(string))->wstr + (offset) + \
        PyUnicode_WSTR_LENGTH(substring) - 1) == \
      *(((PyASCIIObject *)(substring))->wstr + \
        PyUnicode_WSTR_LENGTH(substring) - 1)) && \
     !memcmp(((PyASCIIObject *)(string))->wstr + (offset), \
             ((PyASCIIObject *)(substring))->wstr, \
             PyUnicode_WSTR_LENGTH(substring) * sizeof(Py_UNICODE)))
193
194#endif /* Py_LIMITED_API */
195
196#ifdef __cplusplus
197extern "C" {
198#endif
199
200/* --- Unicode Type ------------------------------------------------------- */
201
202#ifndef Py_LIMITED_API
203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205   structure. state.ascii and state.compact are set, and the data
206   immediately follow the structure. utf8_length and wstr_length can be found
207   in the length field; the utf8 pointer is equal to the data pointer. */
208typedef struct {
209    /* Unicode strings can be in 4 states:
210
211       - compact ascii:
212
213         * structure = PyASCIIObject
214         * kind = PyUnicode_1BYTE_KIND
215         * compact = 1
216         * ascii = 1
217         * ready = 1
218         * utf8 = data
219
220       - compact:
221
222         * structure = PyCompactUnicodeObject
223         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
224           PyUnicode_4BYTE_KIND
225         * compact = 1
226         * ready = 1
227         * ascii = 0
228
229       - string created by the legacy API (not ready):
230
231         * structure = PyUnicodeObject
232         * kind = PyUnicode_WCHAR_KIND
233         * compact = 0
234         * ready = 0
235         * wstr is not NULL
236         * data.any is NULL
237         * utf8 is NULL
238         * interned = SSTATE_NOT_INTERNED
239         * ascii = 0
240
241       - string created by the legacy API, ready:
242
243         * structure = PyUnicodeObject structure
244         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
245           PyUnicode_4BYTE_KIND
246         * compact = 0
247         * ready = 1
248         * data.any is not NULL
249
250       String created by the legacy API becomes ready when calling
251       PyUnicode_READY().
252
253       See also _PyUnicode_CheckConsistency(). */
254    PyObject_HEAD
255    Py_ssize_t length;          /* Number of code points in the string */
256    Py_hash_t hash;             /* Hash value; -1 if not set */
257    struct {
258        /*
259           SSTATE_NOT_INTERNED (0)
260           SSTATE_INTERNED_MORTAL (1)
261           SSTATE_INTERNED_IMMORTAL (2)
262
263           If interned != SSTATE_NOT_INTERNED, the two references from the
264           dictionary to this object are *not* counted in ob_refcnt.
265         */
266        unsigned int interned:2;
267        /* Character size:
268
269           PyUnicode_WCHAR_KIND (0): wchar_t*
270           PyUnicode_1BYTE_KIND (1): Py_UCS1*
271           PyUnicode_2BYTE_KIND (2): Py_UCS2*
272           PyUnicode_4BYTE_KIND (3): Py_UCS4*
273         */
274        unsigned int kind:2;
275        /* Compact is with respect to the allocation scheme. Compact unicode
276           objects only require one memory block while non-compact objects use
277           one block for the PyUnicodeObject struct and another for its data
278           buffer. */
279        unsigned int compact:1;
280        /* kind is PyUnicode_1BYTE_KIND but data contains only ASCII
281           characters. If ascii is 1 and compact is 1, use the PyASCIIObject
282           structure. */
283        unsigned int ascii:1;
284        /* The ready flag indicates whether the object layout is initialized
285           completely. This means that this is either a compact object, or
286           the data pointer is filled out. The bit is redundant, and helps
287           to minimize the test in PyUnicode_IS_READY(). */
288        unsigned int ready:1;
289    } state;
290    wchar_t *wstr;              /* wchar_t representation (null-terminated) */
291} PyASCIIObject;
292
293/* Non-ASCII strings allocated through PyUnicode_New use the
294   PyCompactUnicodeOject structure. state.compact is set, and the data
295   immediately follow the structure. */
296typedef struct {
297    PyASCIIObject _base;
298    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
299                                 * terminating \0. */
300    char *utf8;                 /* UTF-8 representation (null-terminated) */
301    Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
302                                 * surrogates count as two code points. */
303} PyCompactUnicodeObject;
304
305/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
306   PyUnicodeObject structure. The actual string data is initially in the wstr
307   block, and copied into the data block using _PyUnicode_Ready. */
308typedef struct {
309    PyCompactUnicodeObject _base;
310    union {
311        void *any;
312        Py_UCS1 *latin1;
313        Py_UCS2 *ucs2;
314        Py_UCS4 *ucs4;
315    } data;                     /* Canonical, smallest-form Unicode buffer */
316} PyUnicodeObject;
317#endif
318
319PyAPI_DATA(PyTypeObject) PyUnicode_Type;
320PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
321
322#define PyUnicode_Check(op) \
323                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
324#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
325
326/* Fast access macros */
327#ifndef Py_LIMITED_API
328
/* Length of the wstr representation.  Compact ASCII objects have no
   wstr_length field, so their length field is used instead; all other
   objects report PyCompactUnicodeObject.wstr_length.  No type checks are
   performed.  op is parenthesized inside the casts so that expression
   arguments are cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
333
/* Returns the deprecated Py_UNICODE representation's size in code units
   (this includes surrogate pairs as 2 units).
   If the Py_UNICODE representation is not available, it will be computed
   on request.  Use PyUnicode_GET_LENGTH() for the length in code points.

   When wstr is still NULL, PyUnicode_AsUnicode() is called first and its
   result is discarded; the length is read afterwards.  op is evaluated
   several times. */

#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? \
        PyUnicode_WSTR_LENGTH(op) :                   \
        ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
         PyUnicode_WSTR_LENGTH(op)))

/* Size in bytes of the Py_UNICODE representation: the code-unit count
   from PyUnicode_GET_SIZE() multiplied by Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
348
/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now,
   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
   use PyUnicode_WRITE() and PyUnicode_READ().

   An already existing wstr pointer is returned directly; only when it is
   NULL is PyUnicode_AsUnicode() called.  op is evaluated several times. */

#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)), \
     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
      PyUnicode_AsUnicode((PyObject *)(op)))

/* The same buffer as PyUnicode_AS_UNICODE(), viewed as const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
361
362
/* --- Flexible String Representation Helper Macros (PEP 393) ------------- */

/* Values for PyUnicodeObject.state: */

/* Interning state (stored in the 2-bit state.interned field). */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
371
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.  op is parenthesized inside the cast so
   that expression arguments are cast as a whole. */
#define PyUnicode_IS_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii)

/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   op is evaluated up to twice. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
387
/* String contains only wstr byte characters.  This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet.  */
#define PyUnicode_WCHAR_KIND 0

/* Return values of the PyUnicode_KIND() macro.  Note that the 4-byte
   kind has the value 3, not 4. */

#define PyUnicode_1BYTE_KIND 1
#define PyUnicode_2BYTE_KIND 2
#define PyUnicode_4BYTE_KIND 3
398
399
/* Return the number of bytes the string uses to represent single characters,
   this can be 1, 2 or 4.  Computed as 2 ** (kind - 1) from
   PyUnicode_KIND(op).

   See also PyUnicode_KIND_SIZE(). */
#define PyUnicode_CHARACTER_SIZE(op) \
    (1 << (PyUnicode_KIND(op) - 1))
406
/* Return pointers to the canonical representation cast to Py_UCS1,
   Py_UCS2, or Py_UCS4 for direct character access.
   No checks are performed, use PyUnicode_CHARACTER_SIZE or
   PyUnicode_KIND() before to ensure these will work correctly. */

#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
415
/* Return one of the PyUnicode_*_KIND values defined above.  In builds
   with assertions enabled, op must pass PyUnicode_Check() and
   PyUnicode_IS_READY(). */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)
421
/* Return a void pointer to the raw unicode buffer. */

/* Compact objects: the buffer starts right behind the object header,
   which is a PyASCIIObject for compact ASCII strings and a
   PyCompactUnicodeObject for the other compact strings. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((void*)((PyASCIIObject*)(op) + 1)) :              \
     ((void*)((PyCompactUnicodeObject*)(op) + 1)))

/* Non-compact objects: the buffer is the separately stored data.any
   pointer, which is asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject*)(op))->data.any),        \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Pick one of the two helpers above depending on state.compact. */
#define PyUnicode_DATA(op) \
    (assert(PyUnicode_Check(op)), \
     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
     _PyUnicode_NONCOMPACT_DATA(op))
436
/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
   The index is a character index, the result is a size in bytes.
   Implemented as a left shift of index by (kind - 1).

   See also PyUnicode_CHARACTER_SIZE(). */
#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1))
442
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Write into the canonical representation, this macro does not do any sanity
   checks and is intended for usage in loops.  The caller should cache the
   kind and data pointers obtained from other macro calls.
   index is the index in the string (starts at 0) and value is the new
   code point value which should be written to that location.

   value is converted to the element type selected by kind (Py_UCS1,
   Py_UCS2 or Py_UCS4).  Any kind other than the 1-byte and 2-byte kinds
   is handled as the 4-byte kind; this is only verified by an assert. */
#define PyUnicode_WRITE(kind, data, index, value) \
    do { \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            assert((kind) == PyUnicode_4BYTE_KIND); \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
469
/* Read a code point from the string's canonical representation.  No checks
   or ready calls are performed.

   data is interpreted as a Py_UCS1, Py_UCS2 or Py_UCS4 array depending on
   kind (anything that is not the 1-byte or 2-byte kind is read as
   Py_UCS4) and the element is returned as Py_UCS4.  kind may be evaluated
   twice; index is evaluated once. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
    ((kind) == PyUnicode_1BYTE_KIND ? \
        ((const Py_UCS1 *)(data))[(index)] : \
        ((kind) == PyUnicode_2BYTE_KIND ? \
            ((const Py_UCS2 *)(data))[(index)] : \
            ((const Py_UCS4 *)(data))[(index)] \
        ) \
    ))
481
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
   cache kind and use PyUnicode_READ instead.

   unicode is evaluated several times; at run time only one of the three
   index expressions is evaluated. */
#define PyUnicode_READ_CHAR(unicode, index) \
    (assert(PyUnicode_Check(unicode)),          \
     assert(PyUnicode_IS_READY(unicode)),       \
     (Py_UCS4)                                  \
        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
            ) \
        ))
497
/* Returns the length of the unicode string (the length field, i.e. the
   number of code points). The caller has to make sure that the string has
   its canonical representation set before calling this macro.  Call
   PyUnicode_(FAST_)Ready to ensure that. */
#define PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->length)
505
506
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   Only the state.ready bit is read; no type checks are performed.  op is
   parenthesized inside the cast so that expression arguments are cast as
   a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
511
/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
   case: an object whose ready bit is already set yields 0 without a
   function call.  If the canonical representation is not yet set, it will
   still call _PyUnicode_Ready().
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))
520
/* Return a maximum character value which is suitable for creating another
   string based on op.  This is always an approximation but more efficient
   than iterating over the string.

   Result: 0x7f for compact ASCII strings, and for 1-byte strings whose
   data pointer equals the utf8 pointer; 0xff for other 1-byte strings;
   0xffff for 2-byte strings; 0x10ffff otherwise. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
    (assert(PyUnicode_IS_READY(op)),                                    \
     (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7fU :                          \
      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
       (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
        (0x7fU) : (0xffU)                                               \
           ) :                                                          \
       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
        (0xffffU) : (0x10ffffU)                                         \
           ))))
534
535#endif
536
537/* --- Constants ---------------------------------------------------------- */
538
/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
545
546/* === Public API ========================================================= */
547
548/* --- Plain Py_UNICODE --------------------------------------------------- */
549
550/* With PEP 393, this is the recommended way to allocate a new unicode object.
551   This function will allocate the object and its buffer in a single memory
552   block.  Objects created using this function are not resizable. */
553#ifndef Py_LIMITED_API
554PyAPI_FUNC(PyObject*) PyUnicode_New(
555    Py_ssize_t size,            /* Number of code points in the new string */
556    Py_UCS4 maxchar             /* maximum code point value in the string */
557    );
558#endif
559
560/* Initializes the canonical string representation from a the deprecated
561   wstr/Py_UNICODE representation. This function is used to convert Unicode
562   objects which were created using the old API to the new flexible format
563   introduced with PEP 393.
564
565   Don't call this function directly, use the public PyUnicode_READY() macro
566   instead. */
567#ifndef Py_LIMITED_API
568PyAPI_FUNC(int) _PyUnicode_Ready(
569    PyObject *unicode           /* Unicode object */
570    );
571#endif
572
573/* Get a copy of a Unicode string. */
574PyAPI_FUNC(PyObject*) PyUnicode_Copy(
575    PyObject *unicode
576    );
577
578/* Copy character from one unicode object into another, this function performs
579   character conversion when necessary and falls back to memcpy if possible.
580
581   Fail if to is too small (smaller than how_many or smaller than
582   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
583   kind(to), or if to has more than 1 reference.
584
585   Return the number of written character, or return -1 and raise an exception
586   on error.
587
588   Pseudo-code:
589
590       how_many = min(how_many, len(from) - from_start)
591       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
592       return how_many
593
594   Note: The function doesn't write a terminating null character.
595   */
596#ifndef Py_LIMITED_API
597PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
598    PyObject *to,
599    Py_ssize_t to_start,
600    PyObject *from,
601    Py_ssize_t from_start,
602    Py_ssize_t how_many
603    );
604#endif
605
606/* Create a Unicode Object from the Py_UNICODE buffer u of the given
607   size.
608
609   u may be NULL which causes the contents to be undefined. It is the
610   user's responsibility to fill in the needed data afterwards. Note
611   that modifying the Unicode object contents after construction is
612   only allowed if u was set to NULL.
613
614   The buffer is copied into the new object. */
615
616#ifndef Py_LIMITED_API
617PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
618    const Py_UNICODE *u,        /* Unicode buffer */
619    Py_ssize_t size             /* size of buffer */
620    );
621#endif
622
623/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
624PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
625    const char *u,             /* UTF-8 encoded string */
626    Py_ssize_t size            /* size of buffer */
627    );
628
629/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
630   UTF-8 encoded bytes.  The size is determined with strlen(). */
631PyAPI_FUNC(PyObject*) PyUnicode_FromString(
632    const char *u              /* UTF-8 encoded string */
633    );
634
635#ifndef Py_LIMITED_API
636PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
637    int kind,
638    const void *buffer,
639    Py_ssize_t size);
640#endif
641
642PyAPI_FUNC(PyObject*) PyUnicode_Substring(
643    PyObject *str,
644    Py_ssize_t start,
645    Py_ssize_t end);
646
647/* Copy the string into a UCS4 buffer including the null character is copy_null
648   is set. Return NULL and raise an exception on error. Raise a ValueError if
649   the buffer is smaller than the string. Return buffer on success.
650
651   buflen is the length of the buffer in (Py_UCS4) characters. */
652PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
653    PyObject *unicode,
654    Py_UCS4* buffer,
655    Py_ssize_t buflen,
656    int copy_null);
657
658/* Copy the string into a UCS4 buffer. A new buffer is allocated using
659 * PyMem_Malloc; if this fails, NULL is returned with a memory error
660   exception set. */
661PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
662
663/* Return a read-only pointer to the Unicode object's internal
664   Py_UNICODE buffer.
665   If the wchar_t/Py_UNICODE representation is not yet available, this
666   function will calculate it. */
667
668#ifndef Py_LIMITED_API
669PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
670    PyObject *unicode           /* Unicode object */
671    );
672#endif
673
674/* Return a read-only pointer to the Unicode object's internal
675   Py_UNICODE buffer and save the length at size.
676   If the wchar_t/Py_UNICODE representation is not yet available, this
677   function will calculate it. */
678
679#ifndef Py_LIMITED_API
680PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
681    PyObject *unicode,          /* Unicode object */
682    Py_ssize_t *size            /* location where to save the length */
683    );
684#endif
685
686/* Get the length of the Unicode object. */
687
688PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
689    PyObject *unicode
690);
691
692/* Get the number of Py_UNICODE units in the
693   string representation. */
694
695PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
696    PyObject *unicode           /* Unicode object */
697    );
698
699/* Read a character from the string. */
700
701PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
702    PyObject *unicode,
703    Py_ssize_t index
704    );
705
706/* Write a character to the string. The string must have been created through
707   PyUnicode_New, must not be shared, and must not have been hashed yet.
708
709   Return 0 on success, -1 on error. */
710
711PyAPI_FUNC(int) PyUnicode_WriteChar(
712    PyObject *unicode,
713    Py_ssize_t index,
714    Py_UCS4 character
715    );
716
717#ifndef Py_LIMITED_API
718/* Get the maximum ordinal for a Unicode character. */
719PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
720#endif
721
722/* Resize an already allocated Unicode object to the new size length.
723
724   *unicode is modified to point to the new (resized) object and 0
725   returned on success.
726
727   This API may only be called by the function which also called the
728   Unicode constructor. The refcount on the object must be 1. Otherwise,
729   an error is returned.
730
731   Error handling is implemented as follows: an exception is set, -1
732   is returned and *unicode left untouched.
733
734*/
735
736PyAPI_FUNC(int) PyUnicode_Resize(
737    PyObject **unicode,         /* Pointer to the Unicode object */
738    Py_ssize_t length           /* New length */
739    );
740
741/* Coerce obj to an Unicode object and return a reference with
742   *incremented* refcount.
743
744   Coercion is done in the following way:
745
746   1. bytes, bytearray and other char buffer compatible objects are decoded
747      under the assumptions that they contain data using the UTF-8
748      encoding. Decoding is done in "strict" mode.
749
750   2. All other objects (including Unicode objects) raise an
751      exception.
752
753   The API returns NULL in case of an error. The caller is responsible
754   for decref'ing the returned objects.
755
756*/
757
758PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
759    register PyObject *obj,     /* Object */
760    const char *encoding,       /* encoding */
761    const char *errors          /* error handling */
762    );
763
764/* Coerce obj to an Unicode object and return a reference with
765   *incremented* refcount.
766
767   Unicode objects are passed back as-is (subclasses are converted to
768   true Unicode objects), all other objects are delegated to
769   PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
770   using UTF-8 encoding as basis for decoding the object.
771
772   The API returns NULL in case of an error. The caller is responsible
773   for decref'ing the returned objects.
774
775*/
776
777PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
778    register PyObject *obj      /* Object */
779    );
780
781PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
782    const char *format,   /* ASCII-encoded string  */
783    va_list vargs
784    );
785PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
786    const char *format,   /* ASCII-encoded string  */
787    ...
788    );
789
790#ifndef Py_LIMITED_API
791/* Format the object based on the format_spec, as defined in PEP 3101
792   (Advanced String Formatting). */
793PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
794                                                 PyObject *format_spec,
795                                                 Py_ssize_t start,
796                                                 Py_ssize_t end);
797#endif
798
799PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
800PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
801PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
802    const char *u              /* UTF-8 encoded string */
803    );
804#ifndef Py_LIMITED_API
805PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
806#endif
807
808/* Use only if you know it's a string */
809#define PyUnicode_CHECK_INTERNED(op) \
810    (((PyASCIIObject *)(op))->state.interned)
811
812/* --- wchar_t support for platforms which support it --------------------- */
813
814#ifdef HAVE_WCHAR_H
815
816/* Create a Unicode Object from the wchar_t buffer w of the given
817   size.
818
819   The buffer is copied into the new object. */
820
821PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
822    register const wchar_t *w,  /* wchar_t buffer */
823    Py_ssize_t size             /* size of buffer */
824    );
825
826/* Copies the Unicode Object contents into the wchar_t buffer w.  At
827   most size wchar_t characters are copied.
828
829   Note that the resulting wchar_t string may or may not be
830   0-terminated.  It is the responsibility of the caller to make sure
831   that the wchar_t string is 0-terminated in case this is required by
832   the application.
833
834   Returns the number of wchar_t characters copied (excluding a
835   possibly trailing 0-termination character) or -1 in case of an
836   error. */
837
838PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
839    PyObject *unicode,          /* Unicode object */
840    register wchar_t *w,        /* wchar_t buffer */
841    Py_ssize_t size             /* size of buffer */
842    );
843
844/* Convert the Unicode object to a wide character string. The output string
845   always ends with a nul character. If size is not NULL, write the number of
846   wide characters (excluding the null character) into *size.
847
848   Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it)
849   on success. On error, returns NULL, *size is undefined and raises a
850   MemoryError. */
851
852PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
853    PyObject *unicode,          /* Unicode object */
854    Py_ssize_t *size            /* number of characters of the result */
855    );
856
857#ifndef Py_LIMITED_API
858PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
859#endif
860
861#endif
862
863/* --- Unicode ordinals --------------------------------------------------- */
864
865/* Create a Unicode Object from the given Unicode code point ordinal.
866
867   The ordinal must be in range(0x10000) on narrow Python builds
868   (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
869   raised in case it is not.
870
871*/
872
873PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
874
875/* --- Free-list management ----------------------------------------------- */
876
877/* Clear the free list used by the Unicode implementation.
878
879   This can be used to release memory used for objects on the free
880   list back to the Python memory allocator.
881
882*/
883
884PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
885
886/* === Builtin Codecs =====================================================
887
888   Many of these APIs take two arguments encoding and errors. These
889   parameters encoding and errors have the same semantics as the ones
890   of the builtin str() API.
891
892   Setting encoding to NULL causes the default encoding (UTF-8) to be used.
893
894   Error handling is set by errors which may also be set to NULL
895   meaning to use the default handling defined for the codec. Default
896   error handling for all builtin codecs is "strict" (ValueErrors are
897   raised).
898
   The codecs all use a similar interface. Only deviations from the
   generic one are documented.
901
902*/
903
904/* --- Manage the default encoding ---------------------------------------- */
905
906/* Returns a pointer to the default encoding (UTF-8) of the
907   Unicode object unicode and the size of the encoded representation
908   in bytes stored in *size.
909
910   In case of an error, no *size is set.
911
   This function caches the UTF-8 encoded string in the unicodeobject
   and subsequent calls will return the same string.  The memory is released
914   when the unicodeobject is deallocated.
915
916   _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
917   support the previous internal function with the same behaviour.
918
919   *** This API is for interpreter INTERNAL USE ONLY and will likely
920   *** be removed or changed in the future.
921
922   *** If you need to access the Unicode object as UTF-8 bytes string,
923   *** please use PyUnicode_AsUTF8String() instead.
924*/
925
926#ifndef Py_LIMITED_API
927PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
928    PyObject *unicode,
929    Py_ssize_t *size);
930#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
931#endif
932
933/* Returns a pointer to the default encoding (UTF-8) of the
934   Unicode object unicode.
935
936   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
937   in the unicodeobject.
938
939   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
940   support the previous internal function with the same behaviour.
941
942   Use of this API is DEPRECATED since no size information can be
943   extracted from the returned data.
944
   *** This API is for interpreter INTERNAL USE ONLY and will likely
   *** be removed or changed in the future.
947
948   *** If you need to access the Unicode object as UTF-8 bytes string,
949   *** please use PyUnicode_AsUTF8String() instead.
950
951*/
952
953#ifndef Py_LIMITED_API
954PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
955#define _PyUnicode_AsString PyUnicode_AsUTF8
956#endif
957
958/* Returns "utf-8".  */
959
960PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
961
962/* --- Generic Codecs ----------------------------------------------------- */
963
964/* Create a Unicode object by decoding the encoded string s of the
965   given size. */
966
967PyAPI_FUNC(PyObject*) PyUnicode_Decode(
968    const char *s,              /* encoded string */
969    Py_ssize_t size,            /* size of buffer */
970    const char *encoding,       /* encoding */
971    const char *errors          /* error handling */
972    );
973
974/* Decode a Unicode object unicode and return the result as Python
975   object. */
976
977PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
978    PyObject *unicode,          /* Unicode object */
979    const char *encoding,       /* encoding */
980    const char *errors          /* error handling */
981    );
982
983/* Decode a Unicode object unicode and return the result as Unicode
984   object. */
985
986PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
987    PyObject *unicode,          /* Unicode object */
988    const char *encoding,       /* encoding */
989    const char *errors          /* error handling */
990    );
991
992/* Encodes a Py_UNICODE buffer of the given size and returns a
993   Python string object. */
994
995#ifndef Py_LIMITED_API
996PyAPI_FUNC(PyObject*) PyUnicode_Encode(
997    const Py_UNICODE *s,        /* Unicode char buffer */
998    Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
999    const char *encoding,       /* encoding */
1000    const char *errors          /* error handling */
1001    );
1002#endif
1003
1004/* Encodes a Unicode object and returns the result as Python
1005   object. */
1006
1007PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
1008    PyObject *unicode,          /* Unicode object */
1009    const char *encoding,       /* encoding */
1010    const char *errors          /* error handling */
1011    );
1012
1013/* Encodes a Unicode object and returns the result as Python string
1014   object. */
1015
1016PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
1017    PyObject *unicode,          /* Unicode object */
1018    const char *encoding,       /* encoding */
1019    const char *errors          /* error handling */
1020    );
1021
1022/* Encodes a Unicode object and returns the result as Unicode
1023   object. */
1024
1025PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
1026    PyObject *unicode,          /* Unicode object */
1027    const char *encoding,       /* encoding */
1028    const char *errors          /* error handling */
1029    );
1030
1031/* Build an encoding map. */
1032
1033PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1034    PyObject* string            /* 256 character map */
1035   );
1036
1037/* --- UTF-7 Codecs ------------------------------------------------------- */
1038
1039PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
1040    const char *string,         /* UTF-7 encoded string */
1041    Py_ssize_t length,          /* size of string */
1042    const char *errors          /* error handling */
1043    );
1044
1045PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
1046    const char *string,         /* UTF-7 encoded string */
1047    Py_ssize_t length,          /* size of string */
1048    const char *errors,         /* error handling */
1049    Py_ssize_t *consumed        /* bytes consumed */
1050    );
1051
1052#ifndef Py_LIMITED_API
1053PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
1054    const Py_UNICODE *data,     /* Unicode char buffer */
1055    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1056    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1057    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1058    const char *errors          /* error handling */
1059    );
1060#endif
1061
1062/* --- UTF-8 Codecs ------------------------------------------------------- */
1063
1064PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
1065    const char *string,         /* UTF-8 encoded string */
1066    Py_ssize_t length,          /* size of string */
1067    const char *errors          /* error handling */
1068    );
1069
1070PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
1071    const char *string,         /* UTF-8 encoded string */
1072    Py_ssize_t length,          /* size of string */
1073    const char *errors,         /* error handling */
1074    Py_ssize_t *consumed        /* bytes consumed */
1075    );
1076
1077PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
1078    PyObject *unicode           /* Unicode object */
1079    );
1080
1081#ifndef Py_LIMITED_API
1082PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1083    PyObject *unicode,
1084    const char *errors);
1085
1086PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
1087    const Py_UNICODE *data,     /* Unicode char buffer */
1088    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1089    const char *errors          /* error handling */
1090    );
1091#endif
1092
1093/* --- UTF-32 Codecs ------------------------------------------------------ */
1094
1095/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1096   the corresponding Unicode object.
1097
1098   errors (if non-NULL) defines the error handling. It defaults
1099   to "strict".
1100
1101   If byteorder is non-NULL, the decoder starts decoding using the
1102   given byte order:
1103
1104    *byteorder == -1: little endian
1105    *byteorder == 0:  native order
1106    *byteorder == 1:  big endian
1107
1108   In native mode, the first four bytes of the stream are checked for a
1109   BOM mark. If found, the BOM mark is analysed, the byte order
1110   adjusted and the BOM skipped.  In the other modes, no BOM mark
1111   interpretation is done. After completion, *byteorder is set to the
1112   current byte order at the end of input data.
1113
1114   If byteorder is NULL, the codec starts in native order mode.
1115
1116*/
1117
1118PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
1119    const char *string,         /* UTF-32 encoded string */
1120    Py_ssize_t length,          /* size of string */
1121    const char *errors,         /* error handling */
1122    int *byteorder              /* pointer to byteorder to use
1123                                   0=native;-1=LE,1=BE; updated on
1124                                   exit */
1125    );
1126
1127PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
1128    const char *string,         /* UTF-32 encoded string */
1129    Py_ssize_t length,          /* size of string */
1130    const char *errors,         /* error handling */
1131    int *byteorder,             /* pointer to byteorder to use
1132                                   0=native;-1=LE,1=BE; updated on
1133                                   exit */
1134    Py_ssize_t *consumed        /* bytes consumed */
1135    );
1136
1137/* Returns a Python string using the UTF-32 encoding in native byte
1138   order. The string always starts with a BOM mark.  */
1139
1140PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
1141    PyObject *unicode           /* Unicode object */
1142    );
1143
1144/* Returns a Python string object holding the UTF-32 encoded value of
1145   the Unicode data.
1146
1147   If byteorder is not 0, output is written according to the following
1148   byte order:
1149
1150   byteorder == -1: little endian
1151   byteorder == 0:  native byte order (writes a BOM mark)
1152   byteorder == 1:  big endian
1153
1154   If byteorder is 0, the output string will always start with the
1155   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1156   prepended.
1157
1158*/
1159
1160#ifndef Py_LIMITED_API
1161PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
1162    const Py_UNICODE *data,     /* Unicode char buffer */
1163    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1164    const char *errors,         /* error handling */
1165    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1166    );
1167#endif
1168
1169/* --- UTF-16 Codecs ------------------------------------------------------ */
1170
1171/* Decodes length bytes from a UTF-16 encoded buffer string and returns
1172   the corresponding Unicode object.
1173
1174   errors (if non-NULL) defines the error handling. It defaults
1175   to "strict".
1176
1177   If byteorder is non-NULL, the decoder starts decoding using the
1178   given byte order:
1179
1180    *byteorder == -1: little endian
1181    *byteorder == 0:  native order
1182    *byteorder == 1:  big endian
1183
1184   In native mode, the first two bytes of the stream are checked for a
1185   BOM mark. If found, the BOM mark is analysed, the byte order
1186   adjusted and the BOM skipped.  In the other modes, no BOM mark
1187   interpretation is done. After completion, *byteorder is set to the
1188   current byte order at the end of input data.
1189
1190   If byteorder is NULL, the codec starts in native order mode.
1191
1192*/
1193
1194PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
1195    const char *string,         /* UTF-16 encoded string */
1196    Py_ssize_t length,          /* size of string */
1197    const char *errors,         /* error handling */
1198    int *byteorder              /* pointer to byteorder to use
1199                                   0=native;-1=LE,1=BE; updated on
1200                                   exit */
1201    );
1202
1203PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
1204    const char *string,         /* UTF-16 encoded string */
1205    Py_ssize_t length,          /* size of string */
1206    const char *errors,         /* error handling */
1207    int *byteorder,             /* pointer to byteorder to use
1208                                   0=native;-1=LE,1=BE; updated on
1209                                   exit */
1210    Py_ssize_t *consumed        /* bytes consumed */
1211    );
1212
1213/* Returns a Python string using the UTF-16 encoding in native byte
1214   order. The string always starts with a BOM mark.  */
1215
1216PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
1217    PyObject *unicode           /* Unicode object */
1218    );
1219
1220/* Returns a Python string object holding the UTF-16 encoded value of
1221   the Unicode data.
1222
1223   If byteorder is not 0, output is written according to the following
1224   byte order:
1225
1226   byteorder == -1: little endian
1227   byteorder == 0:  native byte order (writes a BOM mark)
1228   byteorder == 1:  big endian
1229
1230   If byteorder is 0, the output string will always start with the
1231   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1232   prepended.
1233
1234   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1235   UCS-2. This trick makes it possible to add full UTF-16 capabilities
1236   at a later point without compromising the APIs.
1237
1238*/
1239
1240#ifndef Py_LIMITED_API
1241PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
1242    const Py_UNICODE *data,     /* Unicode char buffer */
1243    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1244    const char *errors,         /* error handling */
1245    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1246    );
1247#endif
1248
1249/* --- Unicode-Escape Codecs ---------------------------------------------- */
1250
1251PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
1252    const char *string,         /* Unicode-Escape encoded string */
1253    Py_ssize_t length,          /* size of string */
1254    const char *errors          /* error handling */
1255    );
1256
1257PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
1258    PyObject *unicode           /* Unicode object */
1259    );
1260
1261#ifndef Py_LIMITED_API
1262PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
1263    const Py_UNICODE *data,     /* Unicode char buffer */
1264    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1265    );
1266#endif
1267
1268/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1269
1270PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
1271    const char *string,         /* Raw-Unicode-Escape encoded string */
1272    Py_ssize_t length,          /* size of string */
1273    const char *errors          /* error handling */
1274    );
1275
1276PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
1277    PyObject *unicode           /* Unicode object */
1278    );
1279
1280#ifndef Py_LIMITED_API
1281PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
1282    const Py_UNICODE *data,     /* Unicode char buffer */
1283    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1284    );
1285#endif
1286
1287/* --- Unicode Internal Codec ---------------------------------------------
1288
1289    Only for internal use in _codecsmodule.c */
1290
1291#ifndef Py_LIMITED_API
1292PyObject *_PyUnicode_DecodeUnicodeInternal(
1293    const char *string,
1294    Py_ssize_t length,
1295    const char *errors
1296    );
1297#endif
1298
1299/* --- Latin-1 Codecs -----------------------------------------------------
1300
1301   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1302
1303*/
1304
1305PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
1306    const char *string,         /* Latin-1 encoded string */
1307    Py_ssize_t length,          /* size of string */
1308    const char *errors          /* error handling */
1309    );
1310
1311PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
1312    PyObject *unicode           /* Unicode object */
1313    );
1314
1315#ifndef Py_LIMITED_API
1316PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1317    PyObject* unicode,
1318    const char* errors);
1319
1320PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
1321    const Py_UNICODE *data,     /* Unicode char buffer */
1322    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1323    const char *errors          /* error handling */
1324    );
1325#endif
1326
1327/* --- ASCII Codecs -------------------------------------------------------
1328
   Only 7-bit ASCII data is accepted. All other codes generate errors.
1330
1331*/
1332
1333PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1334    const char *string,         /* ASCII encoded string */
1335    Py_ssize_t length,          /* size of string */
1336    const char *errors          /* error handling */
1337    );
1338
1339PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1340    PyObject *unicode           /* Unicode object */
1341    );
1342
1343#ifndef Py_LIMITED_API
1344PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1345    PyObject* unicode,
1346    const char* errors);
1347
1348PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1349    const Py_UNICODE *data,     /* Unicode char buffer */
1350    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1351    const char *errors          /* error handling */
1352    );
1353#endif
1354
1355/* --- Character Map Codecs -----------------------------------------------
1356
1357   This codec uses mappings to encode and decode characters.
1358
1359   Decoding mappings must map single string characters to single
1360   Unicode characters, integers (which are then interpreted as Unicode
1361   ordinals) or None (meaning "undefined mapping" and causing an
1362   error).
1363
1364   Encoding mappings must map single Unicode characters to single
1365   string characters, integers (which are then interpreted as Latin-1
1366   ordinals) or None (meaning "undefined mapping" and causing an
1367   error).
1368
1369   If a character lookup fails with a LookupError, the character is
1370   copied as-is meaning that its ordinal value will be interpreted as
1371   Unicode or Latin-1 ordinal resp. Because of this mappings only need
1372   to contain those mappings which map characters to different code
1373   points.
1374
1375*/
1376
1377PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1378    const char *string,         /* Encoded string */
1379    Py_ssize_t length,          /* size of string */
1380    PyObject *mapping,          /* character mapping
1381                                   (char ordinal -> unicode ordinal) */
1382    const char *errors          /* error handling */
1383    );
1384
1385PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1386    PyObject *unicode,          /* Unicode object */
1387    PyObject *mapping           /* character mapping
1388                                   (unicode ordinal -> char ordinal) */
1389    );
1390
1391#ifndef Py_LIMITED_API
1392PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1393    const Py_UNICODE *data,     /* Unicode char buffer */
1394    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1395    PyObject *mapping,          /* character mapping
1396                                   (unicode ordinal -> char ordinal) */
1397    const char *errors          /* error handling */
1398    );
1399#endif
1400
1401/* Translate a Py_UNICODE buffer of the given length by applying a
1402   character mapping table to it and return the resulting Unicode
1403   object.
1404
1405   The mapping table must map Unicode ordinal integers to Unicode
1406   ordinal integers or None (causing deletion of the character).
1407
1408   Mapping tables may be dictionaries or sequences. Unmapped character
1409   ordinals (ones which cause a LookupError) are left untouched and
1410   are copied as-is.
1411
1412*/
1413
1414#ifndef Py_LIMITED_API
1415PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1416    const Py_UNICODE *data,     /* Unicode char buffer */
1417    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1418    PyObject *table,            /* Translate table */
1419    const char *errors          /* error handling */
1420    );
1421#endif
1422
1423#ifdef HAVE_MBCS
1424
1425/* --- MBCS codecs for Windows -------------------------------------------- */
1426
1427PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1428    const char *string,         /* MBCS encoded string */
1429    Py_ssize_t length,              /* size of string */
1430    const char *errors          /* error handling */
1431    );
1432
1433PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1434    const char *string,         /* MBCS encoded string */
1435    Py_ssize_t length,          /* size of string */
1436    const char *errors,         /* error handling */
1437    Py_ssize_t *consumed        /* bytes consumed */
1438    );
1439
1440PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1441    PyObject *unicode           /* Unicode object */
1442    );
1443
1444#ifndef Py_LIMITED_API
1445PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1446    const Py_UNICODE *data,     /* Unicode char buffer */
1447    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1448    const char *errors          /* error handling */
1449    );
1450#endif
1451
1452#endif /* HAVE_MBCS */
1453
1454/* --- Decimal Encoder ---------------------------------------------------- */
1455
1456/* Takes a Unicode string holding a decimal value and writes it into
1457   an output buffer using standard ASCII digit codes.
1458
1459   The output buffer has to provide at least length+1 bytes of storage
1460   area. The output string is 0-terminated.
1461
1462   The encoder converts whitespace to ' ', decimal characters to their
1463   corresponding ASCII digit and all other Latin-1 characters except
   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
   are treated as errors. This includes embedded NUL characters.
1466
1467   Error handling is defined by the errors argument:
1468
1469      NULL or "strict": raise a ValueError
1470      "ignore": ignore the wrong characters (these are not copied to the
1471                output buffer)
1472      "replace": replaces illegal characters with '?'
1473
1474   Returns 0 on success, -1 on failure.
1475
1476*/
1477
1478#ifndef Py_LIMITED_API
1479PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1480    Py_UNICODE *s,              /* Unicode buffer */
1481    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1482    char *output,               /* Output buffer; must have size >= length */
1483    const char *errors          /* error handling */
1484    );
1485#endif
1486
1487/* Transforms code points that have decimal digit property to the
1488   corresponding ASCII digit code points.
1489
1490   Returns a new Unicode string on success, NULL on failure.
1491*/
1492
1493#ifndef Py_LIMITED_API
1494PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1495    Py_UNICODE *s,              /* Unicode buffer */
1496    Py_ssize_t length           /* Number of Py_UNICODE chars to transform */
1497    );
1498#endif
1499
1500/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1501   as argument instead of a raw buffer and length.  This function additionally
1502   transforms spaces to ASCII because this is what the callers in longobject,
1503   floatobject, and complexobject did anyways. */
1504
1505#ifndef Py_LIMITED_API
1506PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1507    PyObject *unicode           /* Unicode object */
1508    );
1509#endif
1510
1511/* --- File system encoding ---------------------------------------------- */
1512
1513/* ParseTuple converter: encode str objects to bytes using
1514   PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
1515
1516PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1517
1518/* ParseTuple converter: decode bytes objects to unicode using
1519   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1520
1521PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1522
1523/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1524   and the "surrogateescape" error handler.
1525
1526   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1527   encoding.
1528
1529   Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
1530*/
1531
1532PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1533    const char *s               /* encoded string */
1534    );
1535
1536/* Decode a string using Py_FileSystemDefaultEncoding
1537   and the "surrogateescape" error handler.
1538
1539   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1540   encoding.
1541*/
1542
1543PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1544    const char *s,               /* encoded string */
1545    Py_ssize_t size              /* size */
1546    );
1547
1548/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
1549   "surrogateescape" error handler, and return bytes.
1550
1551   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1552   encoding.
1553*/
1554
1555PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1556    PyObject *unicode
1557    );
1558
1559/* --- Methods & Slots ----------------------------------------------------
1560
1561   These are capable of handling Unicode objects and strings on input
1562   (we refer to them as strings in the descriptions) and return
   Unicode objects or integers as appropriate. */
1564
1565/* Concat two strings giving a new Unicode string. */
1566
1567PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1568    PyObject *left,             /* Left string */
1569    PyObject *right             /* Right string */
1570    );
1571
1572/* Concat two strings and put the result in *pleft
1573   (sets *pleft to NULL on error) */
1574
1575PyAPI_FUNC(void) PyUnicode_Append(
1576    PyObject **pleft,           /* Pointer to left string */
1577    PyObject *right             /* Right string */
1578    );
1579
1580/* Concat two strings, put the result in *pleft and drop the right object
1581   (sets *pleft to NULL on error) */
1582
1583PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1584    PyObject **pleft,           /* Pointer to left string */
1585    PyObject *right             /* Right string */
1586    );
1587
1588/* Split a string giving a list of Unicode strings.
1589
1590   If sep is NULL, splitting will be done at all whitespace
1591   substrings. Otherwise, splits occur at the given separator.
1592
1593   At most maxsplit splits will be done. If negative, no limit is set.
1594
1595   Separators are not included in the resulting list.
1596
1597*/
1598
1599PyAPI_FUNC(PyObject*) PyUnicode_Split(
1600    PyObject *s,                /* String to split */
1601    PyObject *sep,              /* String separator */
1602    Py_ssize_t maxsplit         /* Maxsplit count */
1603    );
1604
/* Ditto, but split at line breaks.

   CRLF is considered to be one line break. Line breaks are not
   included in the resulting list unless keepends is true. */
1609
1610PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1611    PyObject *s,                /* String to split */
1612    int keepends                /* If true, line end markers are included */
1613    );
1614
1615/* Partition a string using a given separator. */
1616
1617PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1618    PyObject *s,                /* String to partition */
1619    PyObject *sep               /* String separator */
1620    );
1621
1622/* Partition a string using a given separator, searching from the end of the
1623   string. */
1624
1625PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1626    PyObject *s,                /* String to partition */
1627    PyObject *sep               /* String separator */
1628    );
1629
1630/* Split a string giving a list of Unicode strings.
1631
1632   If sep is NULL, splitting will be done at all whitespace
1633   substrings. Otherwise, splits occur at the given separator.
1634
   At most maxsplit splits will be done; if maxsplit is negative, no
   limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
   the end of the string.
1638
1639   Separators are not included in the resulting list.
1640
1641*/
1642
1643PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1644    PyObject *s,                /* String to split */
1645    PyObject *sep,              /* String separator */
1646    Py_ssize_t maxsplit         /* Maxsplit count */
1647    );
1648
1649/* Translate a string by applying a character mapping table to it and
1650   return the resulting Unicode object.
1651
1652   The mapping table must map Unicode ordinal integers to Unicode
1653   ordinal integers or None (causing deletion of the character).
1654
1655   Mapping tables may be dictionaries or sequences. Unmapped character
1656   ordinals (ones which cause a LookupError) are left untouched and
1657   are copied as-is.
1658
1659*/
1660
1661PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1662    PyObject *str,              /* String */
1663    PyObject *table,            /* Translate table */
1664    const char *errors          /* error handling */
1665    );
1666
1667/* Join a sequence of strings using the given separator and return
1668   the resulting Unicode string. */
1669
1670PyAPI_FUNC(PyObject*) PyUnicode_Join(
1671    PyObject *separator,        /* Separator string */
1672    PyObject *seq               /* Sequence of strings to join */
1673    );
1674
1675/* Return 1 if substr matches str[start:end] at the tail end selected by
1676   direction (-1: prefix match, +1: suffix match), 0 otherwise. */
1677
1678PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1679    PyObject *str,              /* String */
1680    PyObject *substr,           /* Prefix or Suffix string */
1681    Py_ssize_t start,           /* Start index */
1682    Py_ssize_t end,             /* Stop index */
1683    int direction               /* Tail end: -1 prefix, +1 suffix */
1684    );
1685
1686/* Search str[start:end] for substr in the given direction and return
1687   the position of the first match found, or -1 if there is none. -2 is
1688   returned in case an error occurred and an exception is set. */
1689
1690PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1691    PyObject *str,              /* String */
1692    PyObject *substr,           /* Substring to find */
1693    Py_ssize_t start,           /* Start index */
1694    Py_ssize_t end,             /* Stop index */
1695    int direction               /* Find direction: +1 forward, -1 backward */
1696    );
1697
1698/* Like PyUnicode_Find, but search for a single character only. */
1699PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1700    PyObject *str,              /* String */
1701    Py_UCS4 ch,                 /* Character to find */
1702    Py_ssize_t start,           /* Start index */
1703    Py_ssize_t end,             /* Stop index */
1704    int direction               /* Find direction, as for PyUnicode_Find */
1705    );
1706
1707/* Return the number of occurrences of substr in str[start:end]. */
1708
1709PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1710    PyObject *str,              /* String */
1711    PyObject *substr,           /* Substring to count */
1712    Py_ssize_t start,           /* Start index */
1713    Py_ssize_t end              /* Stop index */
1714    );
1715
1716/* Replace at most maxcount occurrences of substr in str with replstr
1717   and return the resulting Unicode object. */
1718
1719PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1720    PyObject *str,              /* String */
1721    PyObject *substr,           /* Substring to find */
1722    PyObject *replstr,          /* Replacement string */
1723    Py_ssize_t maxcount         /* Max. number of replacements to apply;
1724                                   -1 = all */
1725    );
1726
1727/* Compare two strings and return -1, 0, 1 for less than, equal, and
1728   greater than, respectively. */
1729
1730PyAPI_FUNC(int) PyUnicode_Compare(
1731    PyObject *left,             /* Left string */
1732    PyObject *right             /* Right string */
1733    );
1734
1735PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1736    PyObject *left,             /* Left string */
1737    const char *right           /* Right string, ASCII-encoded */
1738    );
1739
1740/* Rich compare two strings and return one of the following:
1741
1742   - NULL in case an exception was raised
1743   - Py_True or Py_False for successful comparisons
1744   - Py_NotImplemented in case the type combination is unknown
1745
1746   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1747   case the conversion of the arguments to Unicode fails with a
1748   UnicodeDecodeError.
1749
1750   Possible values for op:
1751
1752     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1753
1754*/
1755
1756PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1757    PyObject *left,             /* Left string */
1758    PyObject *right,            /* Right string */
1759    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1760    );
1761
1762/* Apply an argument tuple or dictionary to a format string and return
1763   the resulting Unicode string. */
1764
1765PyAPI_FUNC(PyObject *) PyUnicode_Format(
1766    PyObject *format,           /* Format string */
1767    PyObject *args              /* Argument tuple or dictionary */
1768    );
1769
1770/* Check whether element is contained in container and return 1/0
1771   accordingly.
1772
1773   element has to coerce to a one-element Unicode string. -1 is
1774   returned in case of an error. */
1775
1776PyAPI_FUNC(int) PyUnicode_Contains(
1777    PyObject *container,        /* Container string */
1778    PyObject *element           /* Element string */
1779    );
1780
1781/* Check whether the string s is a valid identifier. */
1782
1783PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1784
1785#ifndef Py_LIMITED_API
1786/* Externally visible for str.strip(unicode); excluded from the limited API. */
1787PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1788    PyUnicodeObject *self,      /* NOTE(review): presumably the string to strip -- confirm */
1789    int striptype,              /* NOTE(review): presumably selects which end(s) to strip -- confirm */
1790    PyObject *sepobj            /* NOTE(review): presumably the characters to strip -- confirm */
1791    );
1792#endif /* Py_LIMITED_API */
1793
1794/* Insert the thousands grouping, as defined by the current locale,
1795   into the string pointed to by buffer.  For the argument descriptions,
1796   see Objects/stringlib/localeutil.h.  Excluded from the limited API. */
1797
1798#ifndef Py_LIMITED_API
1799PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1800                                                   Py_ssize_t n_buffer,
1801                                                   Py_UNICODE *digits,
1802                                                   Py_ssize_t n_digits,
1803                                                   Py_ssize_t min_width);
1804#endif /* Py_LIMITED_API */
1805
1806/* Using explicitly passed-in values (grouping and thousands_sep), insert
1807   the thousands grouping into the string pointed to by buffer.  For the
1808   argument descriptions, see Objects/stringlib/localeutil.h. */
1809#ifndef Py_LIMITED_API
1810PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1811    int kind,
1812    void *buffer,
1813    Py_ssize_t n_buffer,
1814    void *digits,
1815    Py_ssize_t n_digits,
1816    Py_ssize_t min_width,
1817    const char *grouping,
1818    const char *thousands_sep);
1819#endif /* Py_LIMITED_API */
1820/* === Characters Type APIs =============================================== */
1821
1822/* Helper array used by Py_UNICODE_ISSPACE(). */
1823
1824#ifndef Py_LIMITED_API  /* stays open until the #endif near the end of this header */
1825PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1826
1827/* Character type tests (_PyUnicode_Is*) and conversions
1828   (_PyUnicode_To*). These should not be used directly; use the
1829   Py_UNICODE_IS* and Py_UNICODE_TO* macros instead.
1830   These APIs are implemented in Objects/unicodectype.c.
1831
1832*/
1833
1834PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1835    Py_UCS4 ch       /* Unicode character */
1836    );
1837
1838PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1839    Py_UCS4 ch       /* Unicode character */
1840    );
1841
1842PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1843    Py_UCS4 ch       /* Unicode character */
1844    );
1845
1846PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1847    Py_UCS4 ch       /* Unicode character */
1848    );
1849
1850PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1851    Py_UCS4 ch       /* Unicode character */
1852    );
1853
1854PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1855    const Py_UCS4 ch         /* Unicode character */
1856    );
1857
1858PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1859    const Py_UCS4 ch         /* Unicode character */
1860    );
1861
1862PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1863    Py_UCS4 ch       /* Unicode character */
1864    );
1865
1866PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1867    Py_UCS4 ch       /* Unicode character */
1868    );
1869
1870PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1871    Py_UCS4 ch       /* Unicode character */
1872    );
1873
1874PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1875    Py_UCS4 ch       /* Unicode character */
1876    );
1877
1878PyAPI_FUNC(int) _PyUnicode_ToDigit(
1879    Py_UCS4 ch       /* Unicode character */
1880    );
1881
1882PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1883    Py_UCS4 ch       /* Unicode character */
1884    );
1885
1886PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1887    Py_UCS4 ch       /* Unicode character */
1888    );
1889
1890PyAPI_FUNC(int) _PyUnicode_IsDigit(
1891    Py_UCS4 ch       /* Unicode character */
1892    );
1893
1894PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1895    Py_UCS4 ch       /* Unicode character */
1896    );
1897
1898PyAPI_FUNC(int) _PyUnicode_IsPrintable(
1899    Py_UCS4 ch       /* Unicode character */
1900    );
1901
1902PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1903    Py_UCS4 ch       /* Unicode character */
1904    );
1905
1906PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1907    const Py_UNICODE *u         /* Read-only string; presumably NUL-terminated -- TODO confirm */
1908    );
1909
1910PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
1911    Py_UNICODE *s1,             /* First string (writable) */
1912    const Py_UNICODE *s2);      /* Second string (read-only) */
1913
1914PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1915    Py_UNICODE *s1, const Py_UNICODE *s2);  /* s1 writable, s2 read-only */
1916
1917PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
1918    Py_UNICODE *s1,             /* First string (writable) */
1919    const Py_UNICODE *s2,       /* Second string (read-only) */
1920    size_t n);                  /* Count; presumably a character limit -- TODO confirm */
1921
1922PyAPI_FUNC(int) Py_UNICODE_strcmp(
1923    const Py_UNICODE *s1,       /* First string (read-only) */
1924    const Py_UNICODE *s2        /* Second string (read-only) */
1925    );
1926
1927PyAPI_FUNC(int) Py_UNICODE_strncmp(
1928    const Py_UNICODE *s1,       /* First string (read-only) */
1929    const Py_UNICODE *s2,       /* Second string (read-only) */
1930    size_t n                    /* Count; presumably a character limit -- TODO confirm */
1931    );
1932
1933PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
1934    const Py_UNICODE *s,        /* String (read-only) */
1935    Py_UNICODE c                /* Character value */
1936    );
1937
1938PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
1939    const Py_UNICODE *s,        /* String (read-only) */
1940    Py_UNICODE c                /* Character value */
1941    );
1942
1943PyAPI_FUNC(size_t) Py_UCS4_strlen(
1944    const Py_UCS4 *u            /* Read-only string; presumably NUL-terminated -- TODO confirm */
1945    );
1946
1947PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1948    Py_UCS4 *s1,                /* First string (writable) */
1949    const Py_UCS4 *s2);         /* Second string (read-only) */
1950
1951PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1952    Py_UCS4 *s1, const Py_UCS4 *s2);  /* s1 writable, s2 read-only */
1953
1954PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
1955    Py_UCS4 *s1,                /* First string (writable) */
1956    const Py_UCS4 *s2,          /* Second string (read-only) */
1957    size_t n);                  /* Count; presumably a character limit -- TODO confirm */
1958
1959PyAPI_FUNC(int) Py_UCS4_strcmp(
1960    const Py_UCS4 *s1,          /* First string (read-only) */
1961    const Py_UCS4 *s2           /* Second string (read-only) */
1962    );
1963
1964PyAPI_FUNC(int) Py_UCS4_strncmp(
1965    const Py_UCS4 *s1,          /* First string (read-only) */
1966    const Py_UCS4 *s2,          /* Second string (read-only) */
1967    size_t n                    /* Count; presumably a character limit -- TODO confirm */
1968    );
1969
1970PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
1971    const Py_UCS4 *s,           /* String (read-only) */
1972    Py_UCS4 c                   /* Character value */
1973    );
1974
1975PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
1976    const Py_UCS4 *s,           /* String (read-only) */
1977    Py_UCS4 c                   /* Character value */
1978    );
1979
1980/* Create a copy of a Unicode string ending with a NUL character. Return NULL
1981   and raise a MemoryError exception on memory allocation failure; otherwise
1982   return a newly allocated buffer (use PyMem_Free() to free the buffer). */
1983
1984PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
1985    PyObject *unicode           /* Unicode string to copy */
1986    );
1987#endif /* Py_LIMITED_API */
1988
1989#ifdef __cplusplus
1990}
1991#endif
1992#endif /* !Py_UNICODEOBJECT_H */
1993