unicodeobject.h revision 02b75abf731831e32bbb8007a3278c14f6ad700a
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>
5
6/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
12
13Copyright (c) Corporation for National Research Initiatives.
14
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python.  This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
32 *
33 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
35 *
36 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
39 *
40 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
48 *
49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
58#include <ctype.h>
59
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
64/* Python 3.x requires unicode */
65#define Py_USING_UNICODE
66
67#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
69#endif
70
71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74   Otherwise, Unicode strings are stored as UCS-2 (with limited support
75   for UTF-16) */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
79#endif
80
81/* Set these flags if the platform has "wchar.h" and the
82   wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
/* Py_UNICODE used to be the native Unicode storage format (code unit) of
   Python; one Py_UNICODE represents a single Unicode element in the
   Unicode type.  With PEP 393 it is deprecated and is now merely a typedef
   to wchar_t.  Not available in the limited API. */

#if !defined(Py_LIMITED_API)
#  define PY_UNICODE_TYPE wchar_t
typedef PY_UNICODE_TYPE Py_UNICODE;
#endif
95
/* When the compiler provides a wchar_t type, it is supported through the
   interface functions PyUnicode_FromWideChar(), PyUnicode_AsWideChar()
   and PyUnicode_AsWideCharString(). */

/* A usable wchar_t implies that <wchar.h> is available. */
#if defined(HAVE_USABLE_WCHAR_T) && !defined(HAVE_WCHAR_H)
#  define HAVE_WCHAR_H
#endif

/* HAVE_MBCS is set on Windows builds only. */
#ifdef MS_WINDOWS
#  define HAVE_MBCS
#endif

#if defined(HAVE_WCHAR_H)
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
#  if defined(_HAVE_BSDI)
#    include <time.h>
#  endif
#  include <wchar.h>
#endif
117
118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
119   unicode representations. */
120#if SIZEOF_INT == 4
121typedef unsigned int Py_UCS4;
122#elif SIZEOF_LONG == 4
123typedef unsigned long Py_UCS4;
124#else
125#error "Could not find a proper typedef for Py_UCS4"
126#endif
127
128#if SIZEOF_SHORT == 2
129typedef unsigned short Py_UCS2;
130#else
131#error "Could not find a proper typedef for Py_UCS2"
132#endif
133
134typedef unsigned char Py_UCS1;
135
136/* --- Internal Unicode Operations ---------------------------------------- */
137
138/* Since splitting on whitespace is an important use case, and
139   whitespace in most situations is solely ASCII whitespace, we
140   optimize for the common case by using a quick look-up table
141   _Py_ascii_whitespace (see below) with an inlined check.
142
143 */
144#ifndef Py_LIMITED_API
/* Whitespace test.  Values below 128 are answered from the
   _Py_ascii_whitespace look-up table; everything else is passed to
   _PyUnicode_IsWhitespace().  The argument is expanded more than once. */
#define Py_UNICODE_ISSPACE(c) \
    (((c) >= 128U) ? _PyUnicode_IsWhitespace(c) : _Py_ascii_whitespace[(c)])
147
/* The macros below are thin aliases: each one forwards its argument
   unchanged to the _PyUnicode_* helper named on the same line. */

/* Case and line-break classification. */
#define Py_UNICODE_ISLOWER(c)     _PyUnicode_IsLowercase(c)
#define Py_UNICODE_ISUPPER(c)     _PyUnicode_IsUppercase(c)
#define Py_UNICODE_ISTITLE(c)     _PyUnicode_IsTitlecase(c)
#define Py_UNICODE_ISLINEBREAK(c) _PyUnicode_IsLinebreak(c)

/* Case conversion. */
#define Py_UNICODE_TOLOWER(c) _PyUnicode_ToLowercase(c)
#define Py_UNICODE_TOUPPER(c) _PyUnicode_ToUppercase(c)
#define Py_UNICODE_TOTITLE(c) _PyUnicode_ToTitlecase(c)

/* Numeric and printable classification. */
#define Py_UNICODE_ISDECIMAL(c)   _PyUnicode_IsDecimalDigit(c)
#define Py_UNICODE_ISDIGIT(c)     _PyUnicode_IsDigit(c)
#define Py_UNICODE_ISNUMERIC(c)   _PyUnicode_IsNumeric(c)
#define Py_UNICODE_ISPRINTABLE(c) _PyUnicode_IsPrintable(c)

/* Numeric value conversion. */
#define Py_UNICODE_TODECIMAL(c) _PyUnicode_ToDecimalDigit(c)
#define Py_UNICODE_TODIGIT(c)   _PyUnicode_ToDigit(c)
#define Py_UNICODE_TONUMERIC(c) _PyUnicode_ToNumeric(c)

/* Alphabetic classification. */
#define Py_UNICODE_ISALPHA(c) _PyUnicode_IsAlpha(c)
167
/* True when any of the alpha, decimal, digit or numeric tests succeeds.
   The tests run in that order and stop at the first hit; the argument is
   expanded once per test. */
#define Py_UNICODE_ISALNUM(c)       \
    (Py_UNICODE_ISALPHA(c)   ||     \
     Py_UNICODE_ISDECIMAL(c) ||     \
     Py_UNICODE_ISDIGIT(c)   ||     \
     Py_UNICODE_ISNUMERIC(c))
173
/* Copy `length` Py_UNICODE code units from `source` to `target` using
   Py_MEMCPY; the byte count is length * sizeof(Py_UNICODE). */
#define Py_UNICODE_COPY(target, source, length)                     \
    Py_MEMCPY((target), (source), sizeof(Py_UNICODE) * (length))
176
/* Store `value` into the first `length` Py_UNICODE slots of `target`.

   Every argument is evaluated exactly once, in the order target, value,
   length, so expressions with side effects are safe.  (Previously the
   length expression was re-evaluated on every loop iteration.)
   A length of zero or less writes nothing. */
#define Py_UNICODE_FILL(target, value, length) \
    do { \
        Py_UNICODE *t_ = (target); \
        Py_UNICODE v_ = (value); \
        Py_ssize_t n_ = (length); \
        Py_ssize_t i_; \
        for (i_ = 0; i_ < n_; i_++) { \
            t_[i_] = v_; \
        } \
    } while (0)
181
/* Surrogate range tests.  The argument is expanded twice in each macro.
     any surrogate:  U+D800 .. U+DFFF
     high surrogate: U+D800 .. U+DBFF
     low surrogate:  U+DC00 .. U+DFFF */
#define Py_UNICODE_IS_SURROGATE(c)      ((c) >= 0xD800 && (c) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(c)  ((c) >= 0xDC00 && (c) <= 0xDFFF)
/* Combine a high and a low surrogate into one Py_UCS4 value: the low
   10 bits of `hi` become bits 10..19, the low 10 bits of `lo` become
   bits 0..9, and 0x10000 is added to the result. */
#define Py_UNICODE_JOIN_SURROGATES(hi, lo)              \
    (0x10000 + ((((Py_UCS4)(hi) & 0x03FF) << 10) |      \
                ((Py_UCS4)(lo) & 0x03FF)))
/* Split a code point into its surrogate halves.
   High surrogate: the bits above bit 9, rebased so that 0x10000 maps to
   0xD800.  Low surrogate: the bottom 10 bits added to 0xDC00. */
#define Py_UNICODE_HIGH_SURROGATE(c) \
    ((0xD800 - (0x10000 >> 10)) + ((c) >> 10))
#define Py_UNICODE_LOW_SURROGATE(c) \
    (0xDC00 + ((c) & 0x3FF))
194
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The comparison is done on the Py_UNICODE (wchar_t) representation:
   first and last code unit are compared as a quick reject, then the
   whole range is compared with memcmp().

   The buffers and the length are obtained through PyUnicode_AS_UNICODE()
   and PyUnicode_GET_SIZE() instead of dereferencing ->wstr and
   ->wstr_length on the argument: those members live in nested _base
   structures, so direct member access does not compile for a
   PyUnicodeObject* or PyObject*.

   Both arguments are expanded several times; do not pass expressions
   with side effects.
   NOTE(review): PyUnicode_AS_UNICODE() falls back to PyUnicode_AsUnicode()
   when wstr is not set yet; a failure of that call is not handled here. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((PyUnicode_AS_UNICODE(string)[(offset)] == \
      PyUnicode_AS_UNICODE(substring)[0]) && \
     (PyUnicode_AS_UNICODE(string)[(offset) + PyUnicode_GET_SIZE(substring) - 1] == \
      PyUnicode_AS_UNICODE(substring)[PyUnicode_GET_SIZE(substring) - 1]) && \
     !memcmp(PyUnicode_AS_UNICODE(string) + (offset), \
             PyUnicode_AS_UNICODE(substring), \
             PyUnicode_GET_SIZE(substring) * sizeof(Py_UNICODE)))
202
203#endif /* Py_LIMITED_API */
204
205#ifdef __cplusplus
206extern "C" {
207#endif
208
209/* --- Unicode Type ------------------------------------------------------- */
210
211#ifndef Py_LIMITED_API
212
213/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
214   structure. state.ascii and state.compact are set, and the data
215   immediately follow the structure. utf8_length and wstr_length can be found
216   in the length field; the utf8 pointer is equal to the data pointer. */
217typedef struct {
218    /* There are 4 forms of Unicode strings:
219
220       - compact ascii:
221
222         * structure = PyASCIIObject
223         * test: PyUnicode_IS_COMPACT_ASCII(op)
224         * kind = PyUnicode_1BYTE_KIND
225         * compact = 1
226         * ascii = 1
227         * ready = 1
228         * (length is the length of the utf8 and wstr strings)
229         * (data starts just after the structure)
230         * (since ASCII is decoded from UTF-8, the utf8 string are the data)
231
232       - compact:
233
234         * structure = PyCompactUnicodeObject
235         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
236         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
237           PyUnicode_4BYTE_KIND
238         * compact = 1
239         * ready = 1
240         * ascii = 0
241         * utf8 is not shared with data
242         * utf8_length = 0 if utf8 is NULL
243         * wstr is shared with data and wstr_length=length
244           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
245           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
246         * wstr_length = 0 if wstr is NULL
247         * (data starts just after the structure)
248
249       - legacy string, not ready:
250
251         * structure = PyUnicodeObject
252         * test: kind == PyUnicode_WCHAR_KIND
253         * length = 0 (use wstr_length)
254         * hash = -1
255         * kind = PyUnicode_WCHAR_KIND
256         * compact = 0
257         * ascii = 0
258         * ready = 0
259         * interned = SSTATE_NOT_INTERNED
260         * wstr is not NULL
261         * data.any is NULL
262         * utf8 is NULL
263         * utf8_length = 0
264
265       - legacy string, ready:
266
267         * structure = PyUnicodeObject structure
268         * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
269         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
270           PyUnicode_4BYTE_KIND
271         * compact = 0
272         * ready = 1
273         * data.any is not NULL
274         * utf8 is shared and utf8_length = length with data.any if ascii = 1
275         * utf8_length = 0 if utf8 is NULL
276         * wstr is shared with data.any and wstr_length = length
277           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
278           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
279         * wstr_length = 0 if wstr is NULL
280
281       Compact strings use only one memory block (structure + characters),
282       whereas legacy strings use one block for the structure and one block
283       for characters.
284
285       Legacy strings are created by PyUnicode_FromUnicode() and
286       PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
287       when PyUnicode_READY() is called.
288
289       See also _PyUnicode_CheckConsistency().
290    */
291    PyObject_HEAD
292    Py_ssize_t length;          /* Number of code points in the string */
293    Py_hash_t hash;             /* Hash value; -1 if not set */
294    struct {
295        /*
296           SSTATE_NOT_INTERNED (0)
297           SSTATE_INTERNED_MORTAL (1)
298           SSTATE_INTERNED_IMMORTAL (2)
299
300           If interned != SSTATE_NOT_INTERNED, the two references from the
301           dictionary to this object are *not* counted in ob_refcnt.
302         */
303        unsigned int interned:2;
304        /* Character size:
305
306           - PyUnicode_WCHAR_KIND (0):
307
308             * character type = wchar_t (16 or 32 bits, depending on the
309               platform)
310
311           - PyUnicode_1BYTE_KIND (1):
312
313             * character type = Py_UCS1 (8 bits, unsigned)
314             * all characters are in the range U+0000-U+00FF (latin1)
315             * if ascii is set, all characters are in the range U+0000-U+007F
316               (ASCII), otherwise at least one character is in the range
317               U+0080-U+00FF
318
319           - PyUnicode_2BYTE_KIND (2):
320
321             * character type = Py_UCS2 (16 bits, unsigned)
322             * all characters are in the range U+0000-U+FFFF (BMP)
323             * at least one character is in the range U+0100-U+FFFF
324
325           - PyUnicode_4BYTE_KIND (4):
326
327             * character type = Py_UCS4 (32 bits, unsigned)
328             * all characters are in the range U+0000-U+10FFFF
329             * at least one character is in the range U+10000-U+10FFFF
330         */
331        unsigned int kind:3;
332        /* Compact is with respect to the allocation scheme. Compact unicode
333           objects only require one memory block while non-compact objects use
334           one block for the PyUnicodeObject struct and another for its data
335           buffer. */
336        unsigned int compact:1;
337        /* The string only contains characters in the range U+0000-U+007F (ASCII)
338           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
339           set, use the PyASCIIObject structure. */
340        unsigned int ascii:1;
341        /* The ready flag indicates whether the object layout is initialized
342           completely. This means that this is either a compact object, or
343           the data pointer is filled out. The bit is redundant, and helps
344           to minimize the test in PyUnicode_IS_READY(). */
345        unsigned int ready:1;
346        /* Padding to ensure that PyUnicode_DATA() is always aligned to
347           4 bytes (see issue #19537 on m68k). */
348        unsigned int :24;
349    } state;
350    wchar_t *wstr;              /* wchar_t representation (null-terminated) */
351} PyASCIIObject;
352
353/* Non-ASCII strings allocated through PyUnicode_New use the
354   PyCompactUnicodeObject structure. state.compact is set, and the data
355   immediately follow the structure. */
356typedef struct {
357    PyASCIIObject _base;
358    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
359                                 * terminating \0. */
360    char *utf8;                 /* UTF-8 representation (null-terminated) */
361    Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
362                                 * surrogates count as two code points. */
363} PyCompactUnicodeObject;
364
365/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
366   PyUnicodeObject structure. The actual string data is initially in the wstr
367   block, and copied into the data block using _PyUnicode_Ready. */
368typedef struct {
369    PyCompactUnicodeObject _base;
370    union {
371        void *any;
372        Py_UCS1 *latin1;
373        Py_UCS2 *ucs2;
374        Py_UCS4 *ucs4;
375    } data;                     /* Canonical, smallest-form Unicode buffer */
376} PyUnicodeObject;
377#endif
378
379PyAPI_DATA(PyTypeObject) PyUnicode_Type;
380PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
381
382#define PyUnicode_Check(op) \
383                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
384#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
385
386/* Fast access macros */
387#ifndef Py_LIMITED_API
388
/* Length of the wstr representation.  For compact ASCII objects it is
   read from the `length` field of PyASCIIObject; for every other object
   from the `wstr_length` field of PyCompactUnicodeObject.

   `op` is parenthesized inside the casts so that an expression argument
   (e.g. `p + 1` or `cond ? a : b`) is cast as a whole.  It is expanded
   more than once. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
393
/* PyUnicode_GET_SIZE: size of the deprecated Py_UNICODE representation in
   code units (surrogate pairs count as 2 units).  When wstr is not set
   yet, PyUnicode_AsUnicode() is called first to compute it on request.
   Use PyUnicode_GET_LENGTH() for the length in code points.
   The argument is expanded several times. */

#define PyUnicode_GET_SIZE(o)                               \
    (assert(PyUnicode_Check(o)),                            \
     !(((PyASCIIObject *)(o))->wstr) ?                      \
      ((void)PyUnicode_AsUnicode((PyObject *)(o)),          \
       assert(((PyASCIIObject *)(o))->wstr),                \
       PyUnicode_WSTR_LENGTH(o)) :                          \
      PyUnicode_WSTR_LENGTH(o))

/* PyUnicode_GET_DATA_SIZE: the same size expressed in bytes. */
#define PyUnicode_GET_DATA_SIZE(o) \
    (PyUnicode_GET_SIZE(o) * Py_UNICODE_SIZE)
409
/* PyUnicode_AS_UNICODE: alias for PyUnicode_AsUnicode().  Yields the
   cached wstr pointer when it is set and otherwise calls
   PyUnicode_AsUnicode(), which creates the wchar_t/Py_UNICODE
   representation on demand.  Using this macro is very inefficient now;
   try to port your code to the new PyUnicode_*BYTE_DATA() macros or use
   PyUnicode_WRITE() and PyUnicode_READ().
   The argument is expanded several times. */

#define PyUnicode_AS_UNICODE(o)                     \
    (assert(PyUnicode_Check(o)),                    \
     !(((PyASCIIObject *)(o))->wstr) ?              \
      PyUnicode_AsUnicode((PyObject *)(o)) :        \
      (((PyASCIIObject *)(o))->wstr))

/* PyUnicode_AS_DATA: the same pointer viewed as const char *. */
#define PyUnicode_AS_DATA(o) \
    ((const char *)(PyUnicode_AS_UNICODE(o)))
422
423
424/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
425
/* Values stored in PyASCIIObject.state.interned (interning state). */
#define SSTATE_NOT_INTERNED      0
#define SSTATE_INTERNED_MORTAL   1
#define SSTATE_INTERNED_IMMORTAL 2
432
/* Return true if the string contains only ASCII characters, or 0 if not.
   The string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must
   be ready.  Reads state.ascii after asserting PyUnicode_Check(op) and
   PyUnicode_IS_READY(op).

   `op` is parenthesized inside the cast so that an expression argument is
   cast as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject*)(op))->state.ascii)
440
/* Return true if the string is compact or 0 if not.
   No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
    (((PyASCIIObject*)(op))->state.compact)

/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.

   `op` is parenthesized inside the cast so that an expression argument is
   cast as a whole.  It is expanded twice. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
450
/* Storage kinds of a Unicode object (see the state.kind bit field). */
enum PyUnicode_Kind {
    /* The string only has a wstr representation.  This is only possible
       when the string was created with a legacy API and
       _PyUnicode_Ready() has not been called yet. */
    PyUnicode_WCHAR_KIND = 0,

    /* Return values of the PyUnicode_KIND() macro. */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
461
/* Typed views of the canonical representation: the pointer returned by
   PyUnicode_DATA() cast to Py_UCS1 (unsigned char), Py_UCS2 or Py_UCS4
   for direct character access.
   No checks are performed; use PyUnicode_KIND() beforehand to make sure
   the chosen view is the right one. */

#define PyUnicode_1BYTE_DATA(o) ((Py_UCS1 *)PyUnicode_DATA(o))
#define PyUnicode_2BYTE_DATA(o) ((Py_UCS2 *)PyUnicode_DATA(o))
#define PyUnicode_4BYTE_DATA(o) ((Py_UCS4 *)PyUnicode_DATA(o))
470
/* Return one of the PyUnicode_*_KIND values defined above by reading
   state.kind.  Asserts that the object is a Unicode object and is ready. */
#define PyUnicode_KIND(o)                       \
    (assert(PyUnicode_Check(o)),                \
     assert(PyUnicode_IS_READY(o)),             \
     ((PyASCIIObject *)(o))->state.kind)
476
/* Compact objects: the character buffer starts immediately after the
   structure, which is PyASCIIObject for ASCII strings and
   PyCompactUnicodeObject otherwise. */
#define _PyUnicode_COMPACT_DATA(o)                          \
    (PyUnicode_IS_ASCII(o)                                  \
         ? ((void *)((PyASCIIObject *)(o) + 1))             \
         : ((void *)((PyCompactUnicodeObject *)(o) + 1)))

/* Non-compact objects: the buffer is the separately stored data.any
   pointer, which is asserted to be non-NULL. */
#define _PyUnicode_NONCOMPACT_DATA(o)                       \
    (assert(((PyUnicodeObject *)(o))->data.any),            \
     ((((PyUnicodeObject *)(o))->data.any)))

/* Return a void pointer to the raw unicode buffer, picking one of the
   two helpers above depending on PyUnicode_IS_COMPACT(). */
#define PyUnicode_DATA(o)                                   \
    (assert(PyUnicode_Check(o)),                            \
     PyUnicode_IS_COMPACT(o) ? _PyUnicode_COMPACT_DATA(o)   \
                             : _PyUnicode_NONCOMPACT_DATA(o))
491
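/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index).  The one
   exception is the "unicode" argument of PyUnicode_READ_CHAR(), which is
   expanded several times and must therefore be free of side effects. */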
495
/* Store a code point into the canonical representation.

     kind   one of PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
            PyUnicode_4BYTE_KIND; any other value trips the assert
     data   the character buffer matching that kind
     index  position in the string, starting at 0
     value  new code point; it is cast to the element type of the buffer

   No sanity checks are done; the macro is intended for usage in loops,
   with the caller caching the kind and data pointers obtained from other
   macro calls. */
#define PyUnicode_WRITE(kind, data, index, value)               \
    do {                                                        \
        switch ((kind)) {                                       \
        case PyUnicode_1BYTE_KIND:                              \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value);    \
            break;                                              \
        case PyUnicode_2BYTE_KIND:                              \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value);    \
            break;                                              \
        default:                                                \
            assert((kind) == PyUnicode_4BYTE_KIND);             \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value);    \
            break;                                              \
        }                                                       \
    } while (0)
518
/* Fetch the code point at `index` from a canonical buffer and return it
   as Py_UCS4.  The buffer is read as Py_UCS1 for PyUnicode_1BYTE_KIND, as
   Py_UCS2 for PyUnicode_2BYTE_KIND and as Py_UCS4 for any other kind.
   No checks or ready calls are performed. */
#define PyUnicode_READ(kind, data, index)                               \
    ((Py_UCS4)(                                                         \
        (kind) == PyUnicode_1BYTE_KIND                                  \
            ? ((const Py_UCS1 *)(data))[(index)]                        \
            : ((kind) == PyUnicode_2BYTE_KIND                           \
                   ? ((const Py_UCS2 *)(data))[(index)]                 \
                   : ((const Py_UCS4 *)(data))[(index)])))
530
/* Read the code point at `index` straight from a Unicode object.

   This is less efficient than PyUnicode_READ() because it calls
   PyUnicode_KIND() and might call it twice.  Use it for single reads; for
   multiple consecutive reads, cache kind and data and use PyUnicode_READ()
   instead.

   The object must be a ready Unicode object (both are asserted).  The
   `unicode` argument is expanded several times, `index` is evaluated
   once. */
#define PyUnicode_READ_CHAR(unicode, index)             \
    (assert(PyUnicode_Check(unicode)),                  \
     assert(PyUnicode_IS_READY(unicode)),               \
     PyUnicode_READ(PyUnicode_KIND((unicode)),          \
                    PyUnicode_DATA((unicode)),          \
                    (index)))
546
/* Return the `length` field of the Unicode object.  The caller has to
   make sure that the string has its canonical representation set before
   using this macro (this is asserted); call PyUnicode_(FAST_)Ready to
   ensure that. */
#define PyUnicode_GET_LENGTH(o)                 \
    (assert(PyUnicode_Check(o)),                \
     assert(PyUnicode_IS_READY(o)),             \
     ((PyASCIIObject *)(o))->length)
554
555
/* Fast check to determine whether an object is ready.  Reads the
   state.ready bit; equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).

   `op` is parenthesized inside the cast so that an expression argument
   (e.g. `p + 1` or `cond ? a : b`) is cast as a whole.  No type check is
   performed. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
560
/* Make sure the canonical representation is set.  When the ready bit is
   already set the result is 0 and nothing is called, so this does less
   work than _PyUnicode_Ready() in the best case; otherwise
   _PyUnicode_Ready() is called and its result is returned.
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(o)                          \
    (assert(PyUnicode_Check(o)),                    \
     (!PyUnicode_IS_READY(o) ?                      \
      _PyUnicode_Ready((PyObject *)(o)) : 0))
569
/* Return a maximum character value which is suitable for creating another
   string based on the object.  This is always an approximation, but more
   efficient than iterating over the string:

       ASCII string  -> 0x7f
       1-byte kind   -> 0xff
       2-byte kind   -> 0xffff
       anything else -> 0x10ffff

   The object must be ready (asserted). */
#define PyUnicode_MAX_CHAR_VALUE(o)                                 \
    (assert(PyUnicode_IS_READY(o)),                                 \
     (PyUnicode_IS_ASCII(o)                                         \
          ? (0x7f)                                                  \
          : (PyUnicode_KIND(o) == PyUnicode_1BYTE_KIND              \
                 ? (0xffU)                                          \
                 : (PyUnicode_KIND(o) == PyUnicode_2BYTE_KIND       \
                        ? (0xffffU)                                 \
                        : (0x10ffffU)))))
582
583#endif
584
585/* --- Constants ---------------------------------------------------------- */
586
/* Replacement character used during decoding if the errors argument is
   set to "replace".  Note: U+FFFD is the official REPLACEMENT CHARACTER
   in Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4)0xFFFD)
593
594/* === Public API ========================================================= */
595
596/* --- Plain Py_UNICODE --------------------------------------------------- */
597
598/* With PEP 393, this is the recommended way to allocate a new unicode object.
599   This function will allocate the object and its buffer in a single memory
600   block.  Objects created using this function are not resizable. */
601#ifndef Py_LIMITED_API
602PyAPI_FUNC(PyObject*) PyUnicode_New(
603    Py_ssize_t size,            /* Number of code points in the new string */
604    Py_UCS4 maxchar             /* maximum code point value in the string */
605    );
606#endif
607
608/* Initializes the canonical string representation from the deprecated
609   wstr/Py_UNICODE representation. This function is used to convert Unicode
610   objects which were created using the old API to the new flexible format
611   introduced with PEP 393.
612
613   Don't call this function directly, use the public PyUnicode_READY() macro
614   instead. */
615#ifndef Py_LIMITED_API
616PyAPI_FUNC(int) _PyUnicode_Ready(
617    PyObject *unicode           /* Unicode object */
618    );
619#endif
620
621/* Get a copy of a Unicode string. */
622#ifndef Py_LIMITED_API
623PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
624    PyObject *unicode
625    );
626#endif
627
628/* Copy character from one unicode object into another, this function performs
629   character conversion when necessary and falls back to memcpy() if possible.
630
631   Fail if to is too small (smaller than *how_many* or smaller than
632   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
633   kind(to), or if *to* has more than 1 reference.
634
635   Return the number of written character, or return -1 and raise an exception
636   on error.
637
638   Pseudo-code:
639
640       how_many = min(how_many, len(from) - from_start)
641       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
642       return how_many
643
644   Note: The function doesn't write a terminating null character.
645   */
646#ifndef Py_LIMITED_API
647PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
648    PyObject *to,
649    Py_ssize_t to_start,
650    PyObject *from,
651    Py_ssize_t from_start,
652    Py_ssize_t how_many
653    );
654
655/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
656   may crash if parameters are invalid (e.g. if the output string
657   is too short). */
658PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
659    PyObject *to,
660    Py_ssize_t to_start,
661    PyObject *from,
662    Py_ssize_t from_start,
663    Py_ssize_t how_many
664    );
665#endif
666
667#ifndef Py_LIMITED_API
668/* Fill a string with a character: write fill_char into
669   unicode[start:start+length].
670
671   Fail if fill_char is bigger than the string maximum character, or if the
672   string has more than 1 reference.
673
674   Return the number of written character, or return -1 and raise an exception
675   on error. */
676PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
677    PyObject *unicode,
678    Py_ssize_t start,
679    Py_ssize_t length,
680    Py_UCS4 fill_char
681    );
682
683/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
684   if parameters are invalid (e.g. if length is longer than the string). */
685PyAPI_FUNC(void) _PyUnicode_FastFill(
686    PyObject *unicode,
687    Py_ssize_t start,
688    Py_ssize_t length,
689    Py_UCS4 fill_char
690    );
691#endif
692
693/* Create a Unicode Object from the Py_UNICODE buffer u of the given
694   size.
695
696   u may be NULL which causes the contents to be undefined. It is the
697   user's responsibility to fill in the needed data afterwards. Note
698   that modifying the Unicode object contents after construction is
699   only allowed if u was set to NULL.
700
701   The buffer is copied into the new object. */
702
703#ifndef Py_LIMITED_API
704PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
705    const Py_UNICODE *u,        /* Unicode buffer */
706    Py_ssize_t size             /* size of buffer */
707    );
708#endif
709
710/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
711PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
712    const char *u,             /* UTF-8 encoded string */
713    Py_ssize_t size            /* size of buffer */
714    );
715
716/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
717   UTF-8 encoded bytes.  The size is determined with strlen(). */
718PyAPI_FUNC(PyObject*) PyUnicode_FromString(
719    const char *u              /* UTF-8 encoded string */
720    );
721
722#ifndef Py_LIMITED_API
723/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
724   Scan the string to find the maximum character. */
725PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
726    int kind,
727    const void *buffer,
728    Py_ssize_t size);
729
730/* Create a new string from a buffer of ASCII characters.
731   WARNING: Don't check if the string contains any non-ASCII character. */
732PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
733    const char *buffer,
734    Py_ssize_t size);
735#endif
736
737PyAPI_FUNC(PyObject*) PyUnicode_Substring(
738    PyObject *str,
739    Py_ssize_t start,
740    Py_ssize_t end);
741
742#ifndef Py_LIMITED_API
743/* Compute the maximum character of the substring unicode[start:end].
744   Return 127 for an empty string. */
745PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
746    PyObject *unicode,
747    Py_ssize_t start,
748    Py_ssize_t end);
749#endif
750
751/* Copy the string into a UCS4 buffer including the null character if copy_null
752   is set. Return NULL and raise an exception on error. Raise a ValueError if
753   the buffer is smaller than the string. Return buffer on success.
754
755   buflen is the length of the buffer in (Py_UCS4) characters. */
756PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
757    PyObject *unicode,
758    Py_UCS4* buffer,
759    Py_ssize_t buflen,
760    int copy_null);
761
762/* Copy the string into a UCS4 buffer. A new buffer is allocated using
763 * PyMem_Malloc; if this fails, NULL is returned with a memory error
764   exception set. */
765PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
766
767/* Return a read-only pointer to the Unicode object's internal
768   Py_UNICODE buffer.
769   If the wchar_t/Py_UNICODE representation is not yet available, this
770   function will calculate it. */
771
772#ifndef Py_LIMITED_API
773PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
774    PyObject *unicode           /* Unicode object */
775    );
776#endif
777
778/* Return a read-only pointer to the Unicode object's internal
779   Py_UNICODE buffer and save the length at size.
780   If the wchar_t/Py_UNICODE representation is not yet available, this
781   function will calculate it. */
782
783#ifndef Py_LIMITED_API
784PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
785    PyObject *unicode,          /* Unicode object */
786    Py_ssize_t *size            /* location where to save the length */
787    );
788#endif
789
790/* Get the length of the Unicode object. */
791
792PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
793    PyObject *unicode
794);
795
796/* Get the number of Py_UNICODE units in the
797   string representation. */
798
799PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
800    PyObject *unicode           /* Unicode object */
801    );
802
803/* Read a character from the string. */
804
805PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
806    PyObject *unicode,
807    Py_ssize_t index
808    );
809
810/* Write a character to the string. The string must have been created through
811   PyUnicode_New, must not be shared, and must not have been hashed yet.
812
813   Return 0 on success, -1 on error. */
814
815PyAPI_FUNC(int) PyUnicode_WriteChar(
816    PyObject *unicode,
817    Py_ssize_t index,
818    Py_UCS4 character
819    );
820
821#ifndef Py_LIMITED_API
822/* Get the maximum ordinal for a Unicode character. */
823PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
824#endif
825
826/* Resize a Unicode object. The length is the number of characters, except
827   if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
828   is the number of Py_UNICODE characters.
829
830   *unicode is modified to point to the new (resized) object and 0
831   returned on success.
832
833   Try to resize the string in place (which is usually faster than allocating
834   a new string and copy characters), or create a new string.
835
836   Error handling is implemented as follows: an exception is set, -1
837   is returned and *unicode left untouched.
838
839   WARNING: The function doesn't check string content, the result may not be a
840            string in canonical representation. */
841
842PyAPI_FUNC(int) PyUnicode_Resize(
843    PyObject **unicode,         /* Pointer to the Unicode object */
844    Py_ssize_t length           /* New length */
845    );
846
847/* Decode obj to a Unicode object.
848
849   bytes, bytearray and other bytes-like objects are decoded according to the
850   given encoding and error handler. The encoding and error handler can be
851   NULL to have the interface use UTF-8 and "strict".
852
853   All other objects (including Unicode objects) raise an exception.
854
855   The API returns NULL in case of an error. The caller is responsible
856   for decref'ing the returned objects.
857
858*/
859
860PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
861    PyObject *obj,              /* Object */
862    const char *encoding,       /* encoding */
863    const char *errors          /* error handling */
864    );
865
866/* Copy an instance of a Unicode subtype to a new true Unicode object if
867   necessary. If obj is already a true Unicode object (not a subtype), return
868   the reference with *incremented* refcount.
869
870   The API returns NULL in case of an error. The caller is responsible
871   for decref'ing the returned objects.
872
873*/
874
875PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
876    PyObject *obj      /* Object */
877    );
878
879PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
880    const char *format,   /* ASCII-encoded string  */
881    va_list vargs
882    );
883PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
884    const char *format,   /* ASCII-encoded string  */
885    ...
886    );
887
888#ifndef Py_LIMITED_API
889typedef struct {
890    PyObject *buffer;
891    void *data;
892    enum PyUnicode_Kind kind;
893    Py_UCS4 maxchar;
894    Py_ssize_t size;
895    Py_ssize_t pos;
896
897    /* minimum number of allocated characters (default: 0) */
898    Py_ssize_t min_length;
899
900    /* minimum character (default: 127, ASCII) */
901    Py_UCS4 min_char;
902
903    /* If non-zero, overallocate the buffer (default: 0). */
904    unsigned char overallocate;
905
906    /* If readonly is 1, buffer is a shared string (cannot be modified)
907       and size is set to 0. */
908    unsigned char readonly;
909} _PyUnicodeWriter ;
910
911/* Initialize a Unicode writer.
912 *
913 * By default, the minimum buffer size is 0 character and overallocation is
914 * disabled. Set min_length, min_char and overallocate attributes to control
915 * the allocation of the buffer. */
916PyAPI_FUNC(void)
917_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
918
/* Make sure the writer buffer can take 'length' more characters whose
   maximum character is 'maxchar'.

   Nothing is done (and 0 is the result) when the buffer already fits the
   request or when LENGTH is 0; otherwise the work is delegated to
   _PyUnicodeWriter_PrepareInternal().  Note that the macro arguments may
   be evaluated more than once.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)                  \
    ((((MAXCHAR) <= (WRITER)->maxchar                                      \
       && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                      \
      || (LENGTH) == 0)                                                    \
     ? 0                                                                   \
     : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR)))
930
931/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
932   instead. */
933PyAPI_FUNC(int)
934_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
935                                 Py_ssize_t length, Py_UCS4 maxchar);
936
/* Make sure the writer buffer has at least the kind KIND.
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
   support characters in range U+0000-U+FFFF.

   KIND must not be PyUnicode_WCHAR_KIND (checked with assert()).  When
   the writer kind is already large enough the result is 0; otherwise the
   work is delegated to _PyUnicodeWriter_PrepareKindInternal().

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    (assert((KIND) != PyUnicode_WCHAR_KIND),                          \
     ((KIND) > (WRITER)->kind                                         \
      ? _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND))        \
      : 0))
947
948/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
949   macro instead. */
950PyAPI_FUNC(int)
951_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
952                                     enum PyUnicode_Kind kind);
953
954/* Append a Unicode character.
955   Return 0 on success, raise an exception and return -1 on error. */
956PyAPI_FUNC(int)
957_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
958    Py_UCS4 ch
959    );
960
961/* Append a Unicode string.
962   Return 0 on success, raise an exception and return -1 on error. */
963PyAPI_FUNC(int)
964_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
965    PyObject *str               /* Unicode string */
966    );
967
968/* Append a substring of a Unicode string.
969   Return 0 on success, raise an exception and return -1 on error. */
970PyAPI_FUNC(int)
971_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
972    PyObject *str,              /* Unicode string */
973    Py_ssize_t start,
974    Py_ssize_t end
975    );
976
977/* Append an ASCII-encoded byte string.
978   Return 0 on success, raise an exception and return -1 on error. */
979PyAPI_FUNC(int)
980_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
981    const char *str,           /* ASCII-encoded byte string */
982    Py_ssize_t len             /* number of bytes, or -1 if unknown */
983    );
984
985/* Append a latin1-encoded byte string.
986   Return 0 on success, raise an exception and return -1 on error. */
987PyAPI_FUNC(int)
988_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
989    const char *str,           /* latin1-encoded byte string */
990    Py_ssize_t len             /* length in bytes */
991    );
992
993/* Get the value of the writer as a Unicode string. Clear the
994   buffer of the writer. Raise an exception and return NULL
995   on error. */
996PyAPI_FUNC(PyObject *)
997_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
998
999/* Deallocate memory of a writer (clear its internal buffer). */
1000PyAPI_FUNC(void)
1001_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
1002#endif
1003
1004#ifndef Py_LIMITED_API
1005/* Format the object based on the format_spec, as defined in PEP 3101
1006   (Advanced String Formatting). */
1007PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
1008    _PyUnicodeWriter *writer,
1009    PyObject *obj,
1010    PyObject *format_spec,
1011    Py_ssize_t start,
1012    Py_ssize_t end);
1013#endif
1014
1015PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
1016PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
1017PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
1018    const char *u              /* UTF-8 encoded string */
1019    );
1020#ifndef Py_LIMITED_API
1021PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
1022#endif
1023
/* Read the 'interned' state field of op.  No type check is done: only
   use this when op is known to be a string. */
#define PyUnicode_CHECK_INTERNED(op) (((PyASCIIObject *)(op))->state.interned)
1027
1028/* --- wchar_t support for platforms which support it --------------------- */
1029
1030#ifdef HAVE_WCHAR_H
1031
1032/* Create a Unicode Object from the wchar_t buffer w of the given
1033   size.
1034
1035   The buffer is copied into the new object. */
1036
1037PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
1038    const wchar_t *w,           /* wchar_t buffer */
1039    Py_ssize_t size             /* size of buffer */
1040    );
1041
1042/* Copies the Unicode Object contents into the wchar_t buffer w.  At
1043   most size wchar_t characters are copied.
1044
1045   Note that the resulting wchar_t string may or may not be
1046   0-terminated.  It is the responsibility of the caller to make sure
1047   that the wchar_t string is 0-terminated in case this is required by
1048   the application.
1049
1050   Returns the number of wchar_t characters copied (excluding a
1051   possibly trailing 0-termination character) or -1 in case of an
1052   error. */
1053
1054PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
1055    PyObject *unicode,          /* Unicode object */
1056    wchar_t *w,                 /* wchar_t buffer */
1057    Py_ssize_t size             /* size of buffer */
1058    );
1059
1060/* Convert the Unicode object to a wide character string. The output string
1061   always ends with a nul character. If size is not NULL, write the number of
1062   wide characters (excluding the null character) into *size.
1063
1064   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
1065   on success. On error, returns NULL, *size is undefined and raises a
1066   MemoryError. */
1067
1068PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
1069    PyObject *unicode,          /* Unicode object */
1070    Py_ssize_t *size            /* number of characters of the result */
1071    );
1072
1073#ifndef Py_LIMITED_API
1074PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
1075#endif
1076
1077#endif
1078
1079/* --- Unicode ordinals --------------------------------------------------- */
1080
1081/* Create a Unicode Object from the given Unicode code point ordinal.
1082
1083   The ordinal must be in range(0x110000). A ValueError is
1084   raised in case it is not.
1085
1086*/
1087
1088PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
1089
1090/* --- Free-list management ----------------------------------------------- */
1091
1092/* Clear the free list used by the Unicode implementation.
1093
1094   This can be used to release memory used for objects on the free
1095   list back to the Python memory allocator.
1096
1097*/
1098
1099PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
1100
/* === Builtin Codecs =====================================================

   Many of these APIs take two arguments, encoding and errors. These
   parameters have the same semantics as the ones of the builtin str()
   API.

   Setting encoding to NULL causes the default encoding (UTF-8) to be used.

   Error handling is set by errors, which may also be set to NULL,
   meaning to use the default handling defined for the codec. Default
   error handling for all builtin codecs is "strict" (ValueErrors are
   raised).

   The codecs all use a similar interface. Only deviations from the
   generic interface are documented.

*/
1118
1119/* --- Manage the default encoding ---------------------------------------- */
1120
1121/* Returns a pointer to the default encoding (UTF-8) of the
1122   Unicode object unicode and the size of the encoded representation
1123   in bytes stored in *size.
1124
1125   In case of an error, no *size is set.
1126
1127   This function caches the UTF-8 encoded string in the unicodeobject
1128   and subsequent calls will return the same string.  The memory is released
1129   when the unicodeobject is deallocated.
1130
1131   _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
1132   support the previous internal function with the same behaviour.
1133
1134   *** This API is for interpreter INTERNAL USE ONLY and will likely
1135   *** be removed or changed in the future.
1136
1137   *** If you need to access the Unicode object as UTF-8 bytes string,
1138   *** please use PyUnicode_AsUTF8String() instead.
1139*/
1140
1141#ifndef Py_LIMITED_API
1142PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
1143    PyObject *unicode,
1144    Py_ssize_t *size);
1145#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
1146#endif
1147
1148/* Returns a pointer to the default encoding (UTF-8) of the
1149   Unicode object unicode.
1150
1151   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
1152   in the unicodeobject.
1153
1154   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
1155   support the previous internal function with the same behaviour.
1156
1157   Use of this API is DEPRECATED since no size information can be
1158   extracted from the returned data.
1159
1160   *** This API is for interpreter INTERNAL USE ONLY and will likely
1161   *** be removed or changed for Python 3.1.
1162
1163   *** If you need to access the Unicode object as UTF-8 bytes string,
1164   *** please use PyUnicode_AsUTF8String() instead.
1165
1166*/
1167
1168#ifndef Py_LIMITED_API
1169PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
1170#define _PyUnicode_AsString PyUnicode_AsUTF8
1171#endif
1172
1173/* Returns "utf-8".  */
1174
1175PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
1176
1177/* --- Generic Codecs ----------------------------------------------------- */
1178
1179/* Create a Unicode object by decoding the encoded string s of the
1180   given size. */
1181
1182PyAPI_FUNC(PyObject*) PyUnicode_Decode(
1183    const char *s,              /* encoded string */
1184    Py_ssize_t size,            /* size of buffer */
1185    const char *encoding,       /* encoding */
1186    const char *errors          /* error handling */
1187    );
1188
1189/* Decode a Unicode object unicode and return the result as Python
1190   object. */
1191
1192PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
1193    PyObject *unicode,          /* Unicode object */
1194    const char *encoding,       /* encoding */
1195    const char *errors          /* error handling */
1196    );
1197
1198/* Decode a Unicode object unicode and return the result as Unicode
1199   object. */
1200
1201PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
1202    PyObject *unicode,          /* Unicode object */
1203    const char *encoding,       /* encoding */
1204    const char *errors          /* error handling */
1205    );
1206
1207/* Encodes a Py_UNICODE buffer of the given size and returns a
1208   Python string object. */
1209
1210#ifndef Py_LIMITED_API
1211PyAPI_FUNC(PyObject*) PyUnicode_Encode(
1212    const Py_UNICODE *s,        /* Unicode char buffer */
1213    Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
1214    const char *encoding,       /* encoding */
1215    const char *errors          /* error handling */
1216    );
1217#endif
1218
1219/* Encodes a Unicode object and returns the result as Python
1220   object. */
1221
1222PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
1223    PyObject *unicode,          /* Unicode object */
1224    const char *encoding,       /* encoding */
1225    const char *errors          /* error handling */
1226    );
1227
1228/* Encodes a Unicode object and returns the result as Python string
1229   object. */
1230
1231PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
1232    PyObject *unicode,          /* Unicode object */
1233    const char *encoding,       /* encoding */
1234    const char *errors          /* error handling */
1235    );
1236
1237/* Encodes a Unicode object and returns the result as Unicode
1238   object. */
1239
1240PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
1241    PyObject *unicode,          /* Unicode object */
1242    const char *encoding,       /* encoding */
1243    const char *errors          /* error handling */
1244    );
1245
1246/* Build an encoding map. */
1247
1248PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1249    PyObject* string            /* 256 character map */
1250   );
1251
1252/* --- UTF-7 Codecs ------------------------------------------------------- */
1253
1254PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
1255    const char *string,         /* UTF-7 encoded string */
1256    Py_ssize_t length,          /* size of string */
1257    const char *errors          /* error handling */
1258    );
1259
1260PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
1261    const char *string,         /* UTF-7 encoded string */
1262    Py_ssize_t length,          /* size of string */
1263    const char *errors,         /* error handling */
1264    Py_ssize_t *consumed        /* bytes consumed */
1265    );
1266
1267#ifndef Py_LIMITED_API
1268PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
1269    const Py_UNICODE *data,     /* Unicode char buffer */
1270    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1271    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1272    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1273    const char *errors          /* error handling */
1274    );
1275PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
1276    PyObject *unicode,          /* Unicode object */
1277    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1278    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1279    const char *errors          /* error handling */
1280    );
1281#endif
1282
1283/* --- UTF-8 Codecs ------------------------------------------------------- */
1284
1285PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
1286    const char *string,         /* UTF-8 encoded string */
1287    Py_ssize_t length,          /* size of string */
1288    const char *errors          /* error handling */
1289    );
1290
1291PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
1292    const char *string,         /* UTF-8 encoded string */
1293    Py_ssize_t length,          /* size of string */
1294    const char *errors,         /* error handling */
1295    Py_ssize_t *consumed        /* bytes consumed */
1296    );
1297
1298PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
1299    PyObject *unicode           /* Unicode object */
1300    );
1301
1302#ifndef Py_LIMITED_API
1303PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1304    PyObject *unicode,
1305    const char *errors);
1306
1307PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
1308    const Py_UNICODE *data,     /* Unicode char buffer */
1309    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1310    const char *errors          /* error handling */
1311    );
1312#endif
1313
1314/* --- UTF-32 Codecs ------------------------------------------------------ */
1315
1316/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1317   the corresponding Unicode object.
1318
1319   errors (if non-NULL) defines the error handling. It defaults
1320   to "strict".
1321
1322   If byteorder is non-NULL, the decoder starts decoding using the
1323   given byte order:
1324
1325    *byteorder == -1: little endian
1326    *byteorder == 0:  native order
1327    *byteorder == 1:  big endian
1328
1329   In native mode, the first four bytes of the stream are checked for a
1330   BOM mark. If found, the BOM mark is analysed, the byte order
1331   adjusted and the BOM skipped.  In the other modes, no BOM mark
1332   interpretation is done. After completion, *byteorder is set to the
1333   current byte order at the end of input data.
1334
1335   If byteorder is NULL, the codec starts in native order mode.
1336
1337*/
1338
1339PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
1340    const char *string,         /* UTF-32 encoded string */
1341    Py_ssize_t length,          /* size of string */
1342    const char *errors,         /* error handling */
1343    int *byteorder              /* pointer to byteorder to use
1344                                   0=native;-1=LE,1=BE; updated on
1345                                   exit */
1346    );
1347
1348PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
1349    const char *string,         /* UTF-32 encoded string */
1350    Py_ssize_t length,          /* size of string */
1351    const char *errors,         /* error handling */
1352    int *byteorder,             /* pointer to byteorder to use
1353                                   0=native;-1=LE,1=BE; updated on
1354                                   exit */
1355    Py_ssize_t *consumed        /* bytes consumed */
1356    );
1357
1358/* Returns a Python string using the UTF-32 encoding in native byte
1359   order. The string always starts with a BOM mark.  */
1360
1361PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
1362    PyObject *unicode           /* Unicode object */
1363    );
1364
1365/* Returns a Python string object holding the UTF-32 encoded value of
1366   the Unicode data.
1367
1368   If byteorder is not 0, output is written according to the following
1369   byte order:
1370
1371   byteorder == -1: little endian
1372   byteorder == 0:  native byte order (writes a BOM mark)
1373   byteorder == 1:  big endian
1374
1375   If byteorder is 0, the output string will always start with the
1376   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1377   prepended.
1378
1379*/
1380
1381#ifndef Py_LIMITED_API
1382PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
1383    const Py_UNICODE *data,     /* Unicode char buffer */
1384    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1385    const char *errors,         /* error handling */
1386    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1387    );
1388PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
1389    PyObject *object,           /* Unicode object */
1390    const char *errors,         /* error handling */
1391    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1392    );
1393#endif
1394
1395/* --- UTF-16 Codecs ------------------------------------------------------ */
1396
1397/* Decodes length bytes from a UTF-16 encoded buffer string and returns
1398   the corresponding Unicode object.
1399
1400   errors (if non-NULL) defines the error handling. It defaults
1401   to "strict".
1402
1403   If byteorder is non-NULL, the decoder starts decoding using the
1404   given byte order:
1405
1406    *byteorder == -1: little endian
1407    *byteorder == 0:  native order
1408    *byteorder == 1:  big endian
1409
1410   In native mode, the first two bytes of the stream are checked for a
1411   BOM mark. If found, the BOM mark is analysed, the byte order
1412   adjusted and the BOM skipped.  In the other modes, no BOM mark
1413   interpretation is done. After completion, *byteorder is set to the
1414   current byte order at the end of input data.
1415
1416   If byteorder is NULL, the codec starts in native order mode.
1417
1418*/
1419
1420PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
1421    const char *string,         /* UTF-16 encoded string */
1422    Py_ssize_t length,          /* size of string */
1423    const char *errors,         /* error handling */
1424    int *byteorder              /* pointer to byteorder to use
1425                                   0=native;-1=LE,1=BE; updated on
1426                                   exit */
1427    );
1428
1429PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
1430    const char *string,         /* UTF-16 encoded string */
1431    Py_ssize_t length,          /* size of string */
1432    const char *errors,         /* error handling */
1433    int *byteorder,             /* pointer to byteorder to use
1434                                   0=native;-1=LE,1=BE; updated on
1435                                   exit */
1436    Py_ssize_t *consumed        /* bytes consumed */
1437    );
1438
1439/* Returns a Python string using the UTF-16 encoding in native byte
1440   order. The string always starts with a BOM mark.  */
1441
1442PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
1443    PyObject *unicode           /* Unicode object */
1444    );
1445
1446/* Returns a Python string object holding the UTF-16 encoded value of
1447   the Unicode data.
1448
1449   If byteorder is not 0, output is written according to the following
1450   byte order:
1451
1452   byteorder == -1: little endian
1453   byteorder == 0:  native byte order (writes a BOM mark)
1454   byteorder == 1:  big endian
1455
1456   If byteorder is 0, the output string will always start with the
1457   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1458   prepended.
1459
1460   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1461   UCS-2. This trick makes it possible to add full UTF-16 capabilities
1462   at a later point without compromising the APIs.
1463
1464*/
1465
1466#ifndef Py_LIMITED_API
1467PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
1468    const Py_UNICODE *data,     /* Unicode char buffer */
1469    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1470    const char *errors,         /* error handling */
1471    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1472    );
1473PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
1474    PyObject* unicode,          /* Unicode object */
1475    const char *errors,         /* error handling */
1476    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1477    );
1478#endif
1479
1480/* --- Unicode-Escape Codecs ---------------------------------------------- */
1481
1482PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
1483    const char *string,         /* Unicode-Escape encoded string */
1484    Py_ssize_t length,          /* size of string */
1485    const char *errors          /* error handling */
1486    );
1487
1488PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
1489    PyObject *unicode           /* Unicode object */
1490    );
1491
1492#ifndef Py_LIMITED_API
1493PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
1494    const Py_UNICODE *data,     /* Unicode char buffer */
1495    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1496    );
1497#endif
1498
1499/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1500
1501PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
1502    const char *string,         /* Raw-Unicode-Escape encoded string */
1503    Py_ssize_t length,          /* size of string */
1504    const char *errors          /* error handling */
1505    );
1506
1507PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
1508    PyObject *unicode           /* Unicode object */
1509    );
1510
1511#ifndef Py_LIMITED_API
1512PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
1513    const Py_UNICODE *data,     /* Unicode char buffer */
1514    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1515    );
1516#endif
1517
1518/* --- Unicode Internal Codec ---------------------------------------------
1519
1520    Only for internal use in _codecsmodule.c */
1521
1522#ifndef Py_LIMITED_API
1523PyObject *_PyUnicode_DecodeUnicodeInternal(
1524    const char *string,
1525    Py_ssize_t length,
1526    const char *errors
1527    );
1528#endif
1529
1530/* --- Latin-1 Codecs -----------------------------------------------------
1531
1532   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1533
1534*/
1535
1536PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
1537    const char *string,         /* Latin-1 encoded string */
1538    Py_ssize_t length,          /* size of string */
1539    const char *errors          /* error handling */
1540    );
1541
1542PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
1543    PyObject *unicode           /* Unicode object */
1544    );
1545
1546#ifndef Py_LIMITED_API
1547PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1548    PyObject* unicode,
1549    const char* errors);
1550
1551PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
1552    const Py_UNICODE *data,     /* Unicode char buffer */
1553    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1554    const char *errors          /* error handling */
1555    );
1556#endif
1557
1558/* --- ASCII Codecs -------------------------------------------------------
1559
1560   Only 7-bit ASCII data is excepted. All other codes generate errors.
1561
1562*/
1563
1564PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1565    const char *string,         /* ASCII encoded string */
1566    Py_ssize_t length,          /* size of string */
1567    const char *errors          /* error handling */
1568    );
1569
1570PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1571    PyObject *unicode           /* Unicode object */
1572    );
1573
1574#ifndef Py_LIMITED_API
1575PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1576    PyObject* unicode,
1577    const char* errors);
1578
1579PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1580    const Py_UNICODE *data,     /* Unicode char buffer */
1581    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1582    const char *errors          /* error handling */
1583    );
1584#endif
1585
1586/* --- Character Map Codecs -----------------------------------------------
1587
1588   This codec uses mappings to encode and decode characters.
1589
1590   Decoding mappings must map single string characters to single
1591   Unicode characters, integers (which are then interpreted as Unicode
1592   ordinals) or None (meaning "undefined mapping" and causing an
1593   error).
1594
1595   Encoding mappings must map single Unicode characters to single
1596   string characters, integers (which are then interpreted as Latin-1
1597   ordinals) or None (meaning "undefined mapping" and causing an
1598   error).
1599
1600   If a character lookup fails with a LookupError, the character is
1601   copied as-is meaning that its ordinal value will be interpreted as
1602   Unicode or Latin-1 ordinal resp. Because of this mappings only need
1603   to contain those mappings which map characters to different code
1604   points.
1605
1606*/
1607
1608PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1609    const char *string,         /* Encoded string */
1610    Py_ssize_t length,          /* size of string */
1611    PyObject *mapping,          /* character mapping
1612                                   (char ordinal -> unicode ordinal) */
1613    const char *errors          /* error handling */
1614    );
1615
1616PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1617    PyObject *unicode,          /* Unicode object */
1618    PyObject *mapping           /* character mapping
1619                                   (unicode ordinal -> char ordinal) */
1620    );
1621
1622#ifndef Py_LIMITED_API
1623PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1624    const Py_UNICODE *data,     /* Unicode char buffer */
1625    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1626    PyObject *mapping,          /* character mapping
1627                                   (unicode ordinal -> char ordinal) */
1628    const char *errors          /* error handling */
1629    );
1630PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1631    PyObject *unicode,          /* Unicode object */
1632    PyObject *mapping,          /* character mapping
1633                                   (unicode ordinal -> char ordinal) */
1634    const char *errors          /* error handling */
1635    );
1636#endif
1637
1638/* Translate a Py_UNICODE buffer of the given length by applying a
1639   character mapping table to it and return the resulting Unicode
1640   object.
1641
1642   The mapping table must map Unicode ordinal integers to Unicode
1643   ordinal integers or None (causing deletion of the character).
1644
1645   Mapping tables may be dictionaries or sequences. Unmapped character
1646   ordinals (ones which cause a LookupError) are left untouched and
1647   are copied as-is.
1648
1649*/
1650
1651#ifndef Py_LIMITED_API
1652PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1653    const Py_UNICODE *data,     /* Unicode char buffer */
1654    Py_ssize_t length,          /* Number of Py_UNICODE chars to translate */
1655    PyObject *table,            /* Translate table */
1656    const char *errors          /* error handling */
1657    );
1658#endif
1659
1660#ifdef HAVE_MBCS
1661
1662/* --- MBCS codecs for Windows -------------------------------------------- */
1663
1664PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1665    const char *string,         /* MBCS encoded string */
1666    Py_ssize_t length,              /* size of string */
1667    const char *errors          /* error handling */
1668    );
1669
1670PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1671    const char *string,         /* MBCS encoded string */
1672    Py_ssize_t length,          /* size of string */
1673    const char *errors,         /* error handling */
1674    Py_ssize_t *consumed        /* bytes consumed */
1675    );
1676
1677PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1678    int code_page,              /* code page number */
1679    const char *string,         /* encoded string */
1680    Py_ssize_t length,          /* size of string */
1681    const char *errors,         /* error handling */
1682    Py_ssize_t *consumed        /* bytes consumed */
1683    );
1684
1685PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1686    PyObject *unicode           /* Unicode object */
1687    );
1688
1689#ifndef Py_LIMITED_API
1690PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1691    const Py_UNICODE *data,     /* Unicode char buffer */
1692    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1693    const char *errors          /* error handling */
1694    );
1695#endif
1696
1697PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1698    int code_page,              /* code page number */
1699    PyObject *unicode,          /* Unicode object */
1700    const char *errors          /* error handling */
1701    );
1702
1703#endif /* HAVE_MBCS */
1704
1705/* --- Decimal Encoder ---------------------------------------------------- */
1706
1707/* Takes a Unicode string holding a decimal value and writes it into
1708   an output buffer using standard ASCII digit codes.
1709
1710   The output buffer has to provide at least length+1 bytes of storage
1711   area. The output string is 0-terminated.
1712
1713   The encoder converts whitespace to ' ', decimal characters to their
1714   corresponding ASCII digit and all other Latin-1 characters except
1715   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1716   are treated as errors. This includes embedded NUL characters.
1717
1718   Error handling is defined by the errors argument:
1719
1720      NULL or "strict": raise a ValueError
1721      "ignore": ignore the wrong characters (these are not copied to the
1722                output buffer)
1723      "replace": replaces illegal characters with '?'
1724
1725   Returns 0 on success, -1 on failure.
1726
1727*/
1728
1729#ifndef Py_LIMITED_API
1730PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1731    Py_UNICODE *s,              /* Unicode buffer */
1732    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1733    char *output,               /* Output buffer; must have size >= length+1 */
1734    const char *errors          /* error handling */
1735    );
1736#endif
1737
1738/* Transforms code points that have decimal digit property to the
1739   corresponding ASCII digit code points.
1740
1741   Returns a new Unicode string on success, NULL on failure.
1742*/
1743
1744#ifndef Py_LIMITED_API
1745PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1746    Py_UNICODE *s,              /* Unicode buffer */
1747    Py_ssize_t length           /* Number of Py_UNICODE chars to transform */
1748    );
1749#endif
1750
1751/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
1752   as argument instead of a raw buffer and length.  This function additionally
1753   transforms spaces to ASCII because this is what the callers in longobject,
1754   floatobject, and complexobject did anyways. */
1755
1756#ifndef Py_LIMITED_API
1757PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1758    PyObject *unicode           /* Unicode object */
1759    );
1760#endif
1761
1762/* --- Locale encoding --------------------------------------------------- */
1763
1764/* Decode a string from the current locale encoding. *errors* selects the
1765   error handler: with "strict" the decoder is strict; with "surrogateescape"
1766   (PEP 383) undecodable bytes are escaped. If a byte sequence can be decoded
1767   as a surrogate character and "surrogateescape" is selected, the byte
1768   sequence is escaped using the 'surrogateescape' error handler instead of
1769   being decoded. *str* must end with a null character but cannot contain
1770   embedded null characters. */
1771
1772PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1773    const char *str,
1774    Py_ssize_t len,
1775    const char *errors);
1776
1777/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1778   length using strlen(). */
1779
1780PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1781    const char *str,
1782    const char *errors);
1783
1784/* Encode a Unicode object to the current locale encoding. *errors* selects
1785   the error handler: the encoder is strict with "strict", and uses the
1786   "surrogateescape" error handler with "surrogateescape". Return a bytes
1787   object. The string cannot contain embedded null characters. */
1788
1789PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1790    PyObject *unicode,
1791    const char *errors
1792    );
1793
1794/* --- File system encoding ---------------------------------------------- */
1795
1796/* ParseTuple converter: encode str objects to bytes using
1797   PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
1798
1799PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1800
1801/* ParseTuple converter: decode bytes objects to unicode using
1802   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1803
1804PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1805
1806/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1807   and the "surrogateescape" error handler.
1808
1809   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1810   encoding.
1811
1812   Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
1813*/
1814
1815PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1816    const char *s               /* encoded string */
1817    );
1818
1819/* Decode a string using Py_FileSystemDefaultEncoding
1820   and the "surrogateescape" error handler.
1821
1822   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1823   encoding.
1824*/
1825
1826PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1827    const char *s,               /* encoded string */
1828    Py_ssize_t size              /* size */
1829    );
1830
1831/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
1832   "surrogateescape" error handler, and return bytes.
1833
1834   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1835   encoding.
1836*/
1837
1838PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1839    PyObject *unicode
1840    );
1841
1842/* --- Methods & Slots ----------------------------------------------------
1843
1844   These are capable of handling Unicode objects and strings on input
1845   (we refer to them as strings in the descriptions) and return
1846   Unicode objects or integers as appropriate. */
1847
1848/* Concat two strings giving a new Unicode string. */
1849
1850PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1851    PyObject *left,             /* Left string */
1852    PyObject *right             /* Right string */
1853    );
1854
1855/* Concat two strings and put the result in *pleft
1856   (sets *pleft to NULL on error) */
1857
1858PyAPI_FUNC(void) PyUnicode_Append(
1859    PyObject **pleft,           /* Pointer to left string */
1860    PyObject *right             /* Right string */
1861    );
1862
1863/* Concat two strings, put the result in *pleft and drop the right object
1864   (sets *pleft to NULL on error) */
1865
1866PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1867    PyObject **pleft,           /* Pointer to left string */
1868    PyObject *right             /* Right string */
1869    );
1870
1871/* Split a string giving a list of Unicode strings.
1872
1873   If sep is NULL, splitting will be done at all whitespace
1874   substrings. Otherwise, splits occur at the given separator.
1875
1876   At most maxsplit splits will be done. If negative, no limit is set.
1877
1878   Separators are not included in the resulting list.
1879
1880*/
1881
1882PyAPI_FUNC(PyObject*) PyUnicode_Split(
1883    PyObject *s,                /* String to split */
1884    PyObject *sep,              /* String separator */
1885    Py_ssize_t maxsplit         /* Maxsplit count */
1886    );
1887
1888/* Ditto, but split at line breaks.
1889
1890   CRLF is considered to be one line break. Line breaks are not
1891   included in the resulting list unless keepends is true. */
1892
1893PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1894    PyObject *s,                /* String to split */
1895    int keepends                /* If true, line end markers are included */
1896    );
1897
1898/* Partition a string using a given separator. */
1899
1900PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1901    PyObject *s,                /* String to partition */
1902    PyObject *sep               /* String separator */
1903    );
1904
1905/* Partition a string using a given separator, searching from the end of the
1906   string. */
1907
1908PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1909    PyObject *s,                /* String to partition */
1910    PyObject *sep               /* String separator */
1911    );
1912
1913/* Split a string giving a list of Unicode strings.
1914
1915   If sep is NULL, splitting will be done at all whitespace
1916   substrings. Otherwise, splits occur at the given separator.
1917
1918   At most maxsplit splits will be done; if maxsplit is negative, no
1919   limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1920   the end of the string.
1921
1922   Separators are not included in the resulting list.
1923
1924*/
1925
1926PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1927    PyObject *s,                /* String to split */
1928    PyObject *sep,              /* String separator */
1929    Py_ssize_t maxsplit         /* Maxsplit count */
1930    );
1931
1932/* Translate a string by applying a character mapping table to it and
1933   return the resulting Unicode object.
1934
1935   The mapping table must map Unicode ordinal integers to Unicode
1936   ordinal integers or None (causing deletion of the character).
1937
1938   Mapping tables may be dictionaries or sequences. Unmapped character
1939   ordinals (ones which cause a LookupError) are left untouched and
1940   are copied as-is.
1941
1942*/
1943
1944PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1945    PyObject *str,              /* String */
1946    PyObject *table,            /* Translate table */
1947    const char *errors          /* error handling */
1948    );
1949
1950/* Join a sequence of strings using the given separator and return
1951   the resulting Unicode string. */
1952
1953PyAPI_FUNC(PyObject*) PyUnicode_Join(
1954    PyObject *separator,        /* Separator string */
1955    PyObject *seq               /* Sequence object */
1956    );
1957
1958/* Return 1 if substr matches str[start:end] at the given tail end, 0
1959   otherwise. */
1960
1961PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1962    PyObject *str,              /* String */
1963    PyObject *substr,           /* Prefix or Suffix string */
1964    Py_ssize_t start,           /* Start index */
1965    Py_ssize_t end,             /* Stop index */
1966    int direction               /* Tail end: -1 prefix, +1 suffix */
1967    );
1968
1969/* Return the first position of substr in str[start:end] using the
1970   given search direction or -1 if not found. -2 is returned in case
1971   an error occurred and an exception is set. */
1972
1973PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1974    PyObject *str,              /* String */
1975    PyObject *substr,           /* Substring to find */
1976    Py_ssize_t start,           /* Start index */
1977    Py_ssize_t end,             /* Stop index */
1978    int direction               /* Find direction: +1 forward, -1 backward */
1979    );
1980
1981/* Like PyUnicode_Find, but search for single character only. */
1982PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1983    PyObject *str,              /* String */
1984    Py_UCS4 ch,                 /* Character to find */
1985    Py_ssize_t start,           /* Start index */
1986    Py_ssize_t end,             /* Stop index */
1987    int direction               /* Find direction: +1 forward, -1 backward */
1988    );
1989
1990/* Count the number of occurrences of substr in str[start:end]. */
1991
1992PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1993    PyObject *str,              /* String */
1994    PyObject *substr,           /* Substring to count */
1995    Py_ssize_t start,           /* Start index */
1996    Py_ssize_t end              /* Stop index */
1997    );
1998
1999/* Replace at most maxcount occurrences of substr in str with replstr
2000   and return the resulting Unicode object. */
2001
2002PyAPI_FUNC(PyObject *) PyUnicode_Replace(
2003    PyObject *str,              /* String */
2004    PyObject *substr,           /* Substring to find */
2005    PyObject *replstr,          /* Replacement string */
2006    Py_ssize_t maxcount         /* Max. number of replacements to apply;
2007                                   -1 = all */
2008    );
2009
2010/* Compare two strings and return -1, 0, 1 for less than, equal,
2011   greater than resp.
2012   Raise an exception and return -1 on error. */
2013
2014PyAPI_FUNC(int) PyUnicode_Compare(
2015    PyObject *left,             /* Left string */
2016    PyObject *right             /* Right string */
2017    );
2018
2019#ifndef Py_LIMITED_API
2020PyAPI_FUNC(int) _PyUnicode_CompareWithId(
2021    PyObject *left,             /* Left string */
2022    _Py_Identifier *right       /* Right identifier */
2023    );
2024#endif
2025
2026PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
2027    PyObject *left,
2028    const char *right           /* ASCII-encoded string */
2029    );
2030
2031/* Rich compare two strings and return one of the following:
2032
2033   - NULL in case an exception was raised
2034   - Py_True or Py_False for successful comparisons
2035   - Py_NotImplemented in case the type combination is unknown
2036
2037   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
2038   case the conversion of the arguments to Unicode fails with a
2039   UnicodeDecodeError.
2040
2041   Possible values for op:
2042
2043     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
2044
2045*/
2046
2047PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
2048    PyObject *left,             /* Left string */
2049    PyObject *right,            /* Right string */
2050    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
2051    );
2052
2053/* Apply an argument tuple or dictionary to a format string and return
2054   the resulting Unicode string. */
2055
2056PyAPI_FUNC(PyObject *) PyUnicode_Format(
2057    PyObject *format,           /* Format string */
2058    PyObject *args              /* Argument tuple or dictionary */
2059    );
2060
2061/* Checks whether element is contained in container and returns 1/0
2062   accordingly.
2063
2064   element has to coerce to a one element Unicode string. -1 is
2065   returned in case of an error. */
2066
2067PyAPI_FUNC(int) PyUnicode_Contains(
2068    PyObject *container,        /* Container string */
2069    PyObject *element           /* Element string */
2070    );
2071
2072/* Checks whether argument is a valid identifier. */
2073
2074PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
2075
2076#ifndef Py_LIMITED_API
2077/* Externally visible for str.strip(unicode) */
2078PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
2079    PyObject *self,
2080    int striptype,
2081    PyObject *sepobj
2082    );
2083#endif
2084
2085/* Using explicit passed-in values, insert the thousands grouping
2086   into the string pointed to by buffer.  For the argument descriptions,
2087   see Objects/stringlib/localeutil.h */
2088#ifndef Py_LIMITED_API
2089PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
2090    PyObject *unicode,
2091    Py_ssize_t index,
2092    Py_ssize_t n_buffer,
2093    void *digits,
2094    Py_ssize_t n_digits,
2095    Py_ssize_t min_width,
2096    const char *grouping,
2097    PyObject *thousands_sep,
2098    Py_UCS4 *maxchar);
2099#endif
2100/* === Characters Type APIs =============================================== */
2101
2102/* Helper array used by Py_UNICODE_ISSPACE(). */
2103
2104#ifndef Py_LIMITED_API
2105PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
2106
2107/* These should not be used directly. Use the Py_UNICODE_IS* and
2108   Py_UNICODE_TO* macros instead.
2109
2110   These APIs are implemented in Objects/unicodectype.c.
2111
2112*/
2113
2114PyAPI_FUNC(int) _PyUnicode_IsLowercase(
2115    Py_UCS4 ch       /* Unicode character */
2116    );
2117
2118PyAPI_FUNC(int) _PyUnicode_IsUppercase(
2119    Py_UCS4 ch       /* Unicode character */
2120    );
2121
2122PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
2123    Py_UCS4 ch       /* Unicode character */
2124    );
2125
2126PyAPI_FUNC(int) _PyUnicode_IsXidStart(
2127    Py_UCS4 ch       /* Unicode character */
2128    );
2129
2130PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
2131    Py_UCS4 ch       /* Unicode character */
2132    );
2133
2134PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
2135    const Py_UCS4 ch         /* Unicode character */
2136    );
2137
2138PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
2139    const Py_UCS4 ch         /* Unicode character */
2140    );
2141
2142PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
2143    Py_UCS4 ch       /* Unicode character */
2144    );
2145
2146PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
2147    Py_UCS4 ch       /* Unicode character */
2148    );
2149
2150PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
2151    Py_UCS4 ch       /* Unicode character */
2152    );
2153
2154PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
2155    Py_UCS4 ch,       /* Unicode character */
2156    Py_UCS4 *res
2157    );
2158
2159PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
2160    Py_UCS4 ch,       /* Unicode character */
2161    Py_UCS4 *res
2162    );
2163
2164PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
2165    Py_UCS4 ch,       /* Unicode character */
2166    Py_UCS4 *res
2167    );
2168
2169PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
2170    Py_UCS4 ch,       /* Unicode character */
2171    Py_UCS4 *res
2172    );
2173
2174PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
2175    Py_UCS4 ch         /* Unicode character */
2176    );
2177
2178PyAPI_FUNC(int) _PyUnicode_IsCased(
2179    Py_UCS4 ch         /* Unicode character */
2180    );
2181
2182PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
2183    Py_UCS4 ch       /* Unicode character */
2184    );
2185
2186PyAPI_FUNC(int) _PyUnicode_ToDigit(
2187    Py_UCS4 ch       /* Unicode character */
2188    );
2189
2190PyAPI_FUNC(double) _PyUnicode_ToNumeric(
2191    Py_UCS4 ch       /* Unicode character */
2192    );
2193
2194PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
2195    Py_UCS4 ch       /* Unicode character */
2196    );
2197
2198PyAPI_FUNC(int) _PyUnicode_IsDigit(
2199    Py_UCS4 ch       /* Unicode character */
2200    );
2201
2202PyAPI_FUNC(int) _PyUnicode_IsNumeric(
2203    Py_UCS4 ch       /* Unicode character */
2204    );
2205
2206PyAPI_FUNC(int) _PyUnicode_IsPrintable(
2207    Py_UCS4 ch       /* Unicode character */
2208    );
2209
2210PyAPI_FUNC(int) _PyUnicode_IsAlpha(
2211    Py_UCS4 ch       /* Unicode character */
2212    );
2213
2214PyAPI_FUNC(size_t) Py_UNICODE_strlen(
2215    const Py_UNICODE *u
2216    );
2217
2218PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
2219    Py_UNICODE *s1,
2220    const Py_UNICODE *s2);
2221
2222PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
2223    Py_UNICODE *s1, const Py_UNICODE *s2);
2224
2225PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
2226    Py_UNICODE *s1,
2227    const Py_UNICODE *s2,
2228    size_t n);
2229
2230PyAPI_FUNC(int) Py_UNICODE_strcmp(
2231    const Py_UNICODE *s1,
2232    const Py_UNICODE *s2
2233    );
2234
2235PyAPI_FUNC(int) Py_UNICODE_strncmp(
2236    const Py_UNICODE *s1,
2237    const Py_UNICODE *s2,
2238    size_t n
2239    );
2240
2241PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
2242    const Py_UNICODE *s,
2243    Py_UNICODE c
2244    );
2245
2246PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
2247    const Py_UNICODE *s,
2248    Py_UNICODE c
2249    );
2250
2251PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
2252
2253/* Create a copy of a unicode string ending with a nul character. Return NULL
2254   and raise a MemoryError exception on memory allocation failure, otherwise
2255   return a newly allocated buffer (use PyMem_Free() to free the buffer). */
2256
2257PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
2258    PyObject *unicode
2259    );
2260#endif /* Py_LIMITED_API */
2261
2262#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
2263PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
2264    PyObject *op,
2265    int check_content);
2266#endif
2267
2268/* Return an interned Unicode object for an Identifier; may fail if there is no memory. */
2269PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2270/* Clear all static strings. */
2271PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2272
2273/* Fast equality check when the inputs are known to be exact unicode types
2274   and where the hash values are equal (i.e. a very probable match) */
2275PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
2276
2277#ifdef __cplusplus
2278}
2279#endif
2280#endif /* !Py_UNICODEOBJECT_H */
2281