unicodeobject.h revision b066cc6aba07a118c89f2a127560858051af4814
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4#include <stdarg.h>
5
6/*
7
8Unicode implementation based on original code by Fredrik Lundh,
9modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10Unicode Integration Proposal. (See
11http://www.egenix.com/files/python/unicode-proposal.txt).
12
13Copyright (c) Corporation for National Research Initiatives.
14
15
16 Original header:
17 --------------------------------------------------------------------
18
19 * Yet another Unicode string type for Python.  This type supports the
20 * 16-bit Basic Multilingual Plane (BMP) only.
21 *
22 * Written by Fredrik Lundh, January 1999.
23 *
24 * Copyright (c) 1999 by Secret Labs AB.
25 * Copyright (c) 1999 by Fredrik Lundh.
26 *
27 * fredrik@pythonware.com
28 * http://www.pythonware.com
29 *
30 * --------------------------------------------------------------------
31 * This Unicode String Type is
32 *
33 * Copyright (c) 1999 by Secret Labs AB
34 * Copyright (c) 1999 by Fredrik Lundh
35 *
36 * By obtaining, using, and/or copying this software and/or its
37 * associated documentation, you agree that you have read, understood,
38 * and will comply with the following terms and conditions:
39 *
40 * Permission to use, copy, modify, and distribute this software and its
41 * associated documentation for any purpose and without fee is hereby
42 * granted, provided that the above copyright notice appears in all
43 * copies, and that both that copyright notice and this permission notice
44 * appear in supporting documentation, and that the name of Secret Labs
45 * AB or the author not be used in advertising or publicity pertaining to
46 * distribution of the software without specific, written prior
47 * permission.
48 *
49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56 * -------------------------------------------------------------------- */
57
58#include <ctype.h>
59
60/* === Internal API ======================================================= */
61
62/* --- Internal Unicode Format -------------------------------------------- */
63
64/* Python 3.x requires unicode */
65#define Py_USING_UNICODE
66
67#ifndef SIZEOF_WCHAR_T
68#error Must define SIZEOF_WCHAR_T
69#endif
70
71#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72
73/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74   Otherwise, Unicode strings are stored as UCS-2 (with limited support
75   for UTF-16) */
76
77#if Py_UNICODE_SIZE >= 4
78#define Py_UNICODE_WIDE
79#endif
80
81/* Set these flags if the platform has "wchar.h" and the
82   wchar_t type is a 16-bit unsigned type */
83/* #define HAVE_WCHAR_H */
84/* #define HAVE_USABLE_WCHAR_T */
85
86/* Py_UNICODE was the native Unicode storage format (code unit) used by
87   Python and represents a single Unicode element in the Unicode type.
88   With PEP 393, Py_UNICODE is deprecated and replaced with a
89   typedef to wchar_t. */
90
91#ifndef Py_LIMITED_API
92#define PY_UNICODE_TYPE wchar_t
93typedef wchar_t Py_UNICODE;
94#endif
95
96/* If the compiler provides a wchar_t type we try to support it
97   through the interface functions PyUnicode_FromWideChar(),
98   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
99
100#ifdef HAVE_USABLE_WCHAR_T
101# ifndef HAVE_WCHAR_H
102#  define HAVE_WCHAR_H
103# endif
104#endif
105
106#if defined(MS_WINDOWS)
107#  define HAVE_MBCS
108#endif
109
110#ifdef HAVE_WCHAR_H
111/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
112# ifdef _HAVE_BSDI
113#  include <time.h>
114# endif
115#  include <wchar.h>
116#endif
117
118/* Py_UCS4 and Py_UCS2 are typedefs for the respective
119   unicode representations. */
120#if SIZEOF_INT >= 4
121typedef unsigned int Py_UCS4;
122#elif SIZEOF_LONG >= 4
123typedef unsigned long Py_UCS4;
124#else
125#error "Could not find a proper typedef for Py_UCS4"
126#endif
127
128typedef unsigned short Py_UCS2;
129typedef unsigned char Py_UCS1;
130
131/* --- Internal Unicode Operations ---------------------------------------- */
132
133/* Since splitting on whitespace is an important use case, and
134   whitespace in most situations is solely ASCII whitespace, we
135   optimize for the common case by using a quick look-up table
136   _Py_ascii_whitespace (see below) with an inlined check.
137
138 */
139#ifndef Py_LIMITED_API
140#define Py_UNICODE_ISSPACE(ch) \
141    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
142
143#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
144#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
145#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
146#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
147
148#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
149#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
150#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
151
152#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
153#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
154#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
155#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
156
157#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
158#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
159#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
160
161#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
162
163#define Py_UNICODE_ISALNUM(ch) \
164       (Py_UNICODE_ISALPHA(ch) || \
165    Py_UNICODE_ISDECIMAL(ch) || \
166    Py_UNICODE_ISDIGIT(ch) || \
167    Py_UNICODE_ISNUMERIC(ch))
168
169#define Py_UNICODE_COPY(target, source, length) \
170    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
171
/* Store `value` into the first `length` Py_UNICODE slots of `target`.
   Every argument is evaluated exactly once, in the order target, value,
   length: the length is cached before the loop starts, so an argument with
   side effects is not re-run on each iteration.  Nothing is written when
   length is zero or negative. */
#define Py_UNICODE_FILL(target, value, length) \
    do { \
        Py_UNICODE *t_ = (target); \
        Py_UNICODE v_ = (value); \
        Py_ssize_t i_, n_ = (length); \
        for (i_ = 0; i_ < n_; i_++) \
            t_[i_] = v_; \
    } while (0)
176
/* Macros to work with surrogates.

   Each use of the argument is parenthesized, so a compound expression such
   as `c & 0xFFFF` or `a ? b : c` is classified as a whole.  Note that `ch`
   is still evaluated twice and therefore must be free of side effects. */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value.
   Only the low 10 bits of each argument are used; no check is made that
   `high` and `low` really are a high/low surrogate pair. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
185
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   Both objects must already have a wchar_t (wstr) representation: no type
   checks, Ready calls or on-demand conversions are performed.  The wstr
   pointer is read through the PyASCIIObject layout and the length through
   PyUnicode_WSTR_LENGTH(), because none of the PEP 393 structures exposes
   both `wstr` and `wstr_length` as direct members.

   The first and last code units are compared before falling back to
   memcmp().  Every argument is evaluated several times, so arguments must
   be free of side effects. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*(((PyASCIIObject *)(string))->wstr + (offset)) == \
      *(((PyASCIIObject *)(substring))->wstr)) && \
     (*(((PyASCIIObject *)(string))->wstr + (offset) + \
        PyUnicode_WSTR_LENGTH((substring)) - 1) == \
      *(((PyASCIIObject *)(substring))->wstr + \
        PyUnicode_WSTR_LENGTH((substring)) - 1)) && \
     !memcmp(((PyASCIIObject *)(string))->wstr + (offset), \
             ((PyASCIIObject *)(substring))->wstr, \
             PyUnicode_WSTR_LENGTH((substring)) * sizeof(Py_UNICODE)))
193
194#endif /* Py_LIMITED_API */
195
196#ifdef __cplusplus
197extern "C" {
198#endif
199
200/* --- Unicode Type ------------------------------------------------------- */
201
202#ifndef Py_LIMITED_API
203
204/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
205   structure. state.ascii and state.compact are set, and the data
206   immediately follow the structure. utf8_length and wstr_length can be found
207   in the length field; the utf8 pointer is equal to the data pointer. */
208typedef struct {
209    /* There are 4 forms of Unicode strings:
210
211       - compact ascii:
212
213         * structure = PyASCIIObject
214         * kind = PyUnicode_1BYTE_KIND
215         * compact = 1
216         * ascii = 1
217         * ready = 1
218         * (length is the length of the utf8 and wstr strings)
219         * (data starts just after the structure)
         * (since ASCII is decoded from UTF-8, the utf8 string is the data)
221
222       - compact:
223
224         * structure = PyCompactUnicodeObject
225         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
226           PyUnicode_4BYTE_KIND
227         * compact = 1
228         * ready = 1
229         * ascii = 0
230         * utf8 is not shared with data
231         * utf8_length = 0 if utf8 is NULL
232         * wstr is shared with data and wstr_length=length
           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
235         * wstr_length = 0 if wstr is NULL
236         * (data starts just after the structure)
237
238       - legacy string, not ready:
239
240         * structure = PyUnicodeObject
241         * kind = PyUnicode_WCHAR_KIND
242         * compact = 0
243         * ascii = 0
244         * ready = 0
245         * wstr is not NULL
246         * data.any is NULL
247         * utf8 is NULL
248         * utf8_length = 0
249         * interned = SSTATE_NOT_INTERNED
250
251       - legacy string, ready:
252
253         * structure = PyUnicodeObject structure
254         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
255           PyUnicode_4BYTE_KIND
256         * compact = 0
257         * ready = 1
258         * data.any is not NULL
259         * utf8 is shared and utf8_length = length with data.any if ascii = 1
260         * utf8_length = 0 if utf8 is NULL
261         * wstr is shared and wstr_length = length with data.any
           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
264         * wstr_length = 0 if wstr is NULL
265
266       Compact strings use only one memory block (structure + characters),
267       whereas legacy strings use one block for the structure and one block
268       for characters.
269
270       Legacy strings are created by PyUnicode_FromUnicode() and
271       PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
272       when PyUnicode_READY() is called.
273
274       See also _PyUnicode_CheckConsistency().
275    */
276    PyObject_HEAD
277    Py_ssize_t length;          /* Number of code points in the string */
278    Py_hash_t hash;             /* Hash value; -1 if not set */
279    struct {
280        /*
281           SSTATE_NOT_INTERNED (0)
282           SSTATE_INTERNED_MORTAL (1)
283           SSTATE_INTERNED_IMMORTAL (2)
284
285           If interned != SSTATE_NOT_INTERNED, the two references from the
286           dictionary to this object are *not* counted in ob_refcnt.
287         */
288        unsigned int interned:2;
289        /* Character size:
290
291           - PyUnicode_WCHAR_KIND (0):
292
293             * character type = wchar_t (16 or 32 bits, depending on the
294               platform)
295
296           - PyUnicode_1BYTE_KIND (1):
297
298             * character type = Py_UCS1 (8 bits, unsigned)
299             * if ascii is set, all characters must be in range
300               U+0000-U+007F, otherwise at least one character must be in range
301               U+0080-U+00FF
302
303           - PyUnicode_2BYTE_KIND (2):
304
305             * character type = Py_UCS2 (16 bits, unsigned)
306             * at least one character must be in range U+0100-U+FFFF
307
308           - PyUnicode_4BYTE_KIND (3):
309
310             * character type = Py_UCS4 (32 bits, unsigned)
311             * at least one character must be in range U+10000-U+10FFFF
312         */
313        unsigned int kind:2;
314        /* Compact is with respect to the allocation scheme. Compact unicode
315           objects only require one memory block while non-compact objects use
316           one block for the PyUnicodeObject struct and another for its data
317           buffer. */
318        unsigned int compact:1;
319        /* The string only contains characters in range U+0000-U+007F (ASCII)
320           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
321           set, use the PyASCIIObject structure. */
322        unsigned int ascii:1;
323        /* The ready flag indicates whether the object layout is initialized
324           completely. This means that this is either a compact object, or
325           the data pointer is filled out. The bit is redundant, and helps
326           to minimize the test in PyUnicode_IS_READY(). */
327        unsigned int ready:1;
328    } state;
329    wchar_t *wstr;              /* wchar_t representation (null-terminated) */
330} PyASCIIObject;
331
332/* Non-ASCII strings allocated through PyUnicode_New use the
333   PyCompactUnicodeObject structure. state.compact is set, and the data
334   immediately follow the structure. */
335typedef struct {
336    PyASCIIObject _base;
337    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
338                                 * terminating \0. */
339    char *utf8;                 /* UTF-8 representation (null-terminated) */
340    Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
341                                 * surrogates count as two code points. */
342} PyCompactUnicodeObject;
343
344/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
345   PyUnicodeObject structure. The actual string data is initially in the wstr
346   block, and copied into the data block using _PyUnicode_Ready. */
347typedef struct {
348    PyCompactUnicodeObject _base;
349    union {
350        void *any;
351        Py_UCS1 *latin1;
352        Py_UCS2 *ucs2;
353        Py_UCS4 *ucs4;
354    } data;                     /* Canonical, smallest-form Unicode buffer */
355} PyUnicodeObject;
356#endif
357
358PyAPI_DATA(PyTypeObject) PyUnicode_Type;
359PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
360
361#define PyUnicode_Check(op) \
362                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
363#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
364
365/* Fast access macros */
366#ifndef Py_LIMITED_API
367
/* Return the length of the wchar_t (wstr) representation of op: the
   `length` field for compact ASCII strings, the `wstr_length` field of
   PyCompactUnicodeObject otherwise.  No type checks or Ready calls are
   performed.  op is parenthesized before each cast so that an expression
   argument is cast as a whole; it may be evaluated more than once. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject*)(op))->length :                  \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
372
373/* Returns the deprecated Py_UNICODE representation's size in code units
374   (this includes surrogate pairs as 2 units).
375   If the Py_UNICODE representation is not available, it will be computed
376   on request.  Use PyUnicode_GET_LENGTH() for the length in code points. */
377
378#define PyUnicode_GET_SIZE(op) \
379    (assert(PyUnicode_Check(op)), \
380     (((PyASCIIObject *)(op))->wstr) ? \
381        PyUnicode_WSTR_LENGTH(op) :                   \
382        ((void)PyUnicode_AsUnicode((PyObject *)(op)), \
383         PyUnicode_WSTR_LENGTH(op)))
384
385#define PyUnicode_GET_DATA_SIZE(op) \
386    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
387
388/* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
389   representation on demand.  Using this macro is very inefficient now,
390   try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
391   use PyUnicode_WRITE() and PyUnicode_READ(). */
392
393#define PyUnicode_AS_UNICODE(op) \
394    (assert(PyUnicode_Check(op)), \
395     (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
396      PyUnicode_AsUnicode((PyObject *)(op)))
397
398#define PyUnicode_AS_DATA(op) \
399    ((const char *)(PyUnicode_AS_UNICODE(op)))
400
401
402/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
403
404/* Values for PyUnicodeObject.state: */
405
406/* Interning state. */
407#define SSTATE_NOT_INTERNED 0
408#define SSTATE_INTERNED_MORTAL 1
409#define SSTATE_INTERNED_IMMORTAL 2
410
/* Return true if the string contains only ASCII characters, or 0 if not. The
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. No type checks
   or Ready calls are performed.  op is parenthesized before the cast so that
   an expression argument is cast as a whole. */
#define PyUnicode_IS_ASCII(op)                 \
    (((PyASCIIObject*)(op))->state.ascii)
416
417/* Return true if the string is compact or 0 if not.
418   No type checks or Ready calls are performed. */
419#define PyUnicode_IS_COMPACT(op) \
420    (((PyASCIIObject*)(op))->state.compact)
421
422/* Return true if the string is a compact ASCII string (use PyASCIIObject
423   structure), or 0 if not.  No type checks or Ready calls are performed. */
424#define PyUnicode_IS_COMPACT_ASCII(op)                 \
425    (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op))
426
427/* String contains only wstr byte characters.  This is only possible
428   when the string was created with a legacy API and _PyUnicode_Ready()
429   has not been called yet.  */
430#define PyUnicode_WCHAR_KIND 0
431
432/* Return values of the PyUnicode_KIND() macro: */
433
434#define PyUnicode_1BYTE_KIND 1
435#define PyUnicode_2BYTE_KIND 2
436#define PyUnicode_4BYTE_KIND 3
437
438
439/* Return the number of bytes the string uses to represent single characters,
440   this can be 1, 2 or 4.
441
442   See also PyUnicode_KIND_SIZE(). */
443#define PyUnicode_CHARACTER_SIZE(op) \
444    (((Py_ssize_t)1 << (PyUnicode_KIND(op) - 1)))
445
446/* Return pointers to the canonical representation cast to unsigned char,
447   Py_UCS2, or Py_UCS4 for direct character access.
448   No checks are performed, use PyUnicode_CHARACTER_SIZE or
449   PyUnicode_KIND() before to ensure these will work correctly. */
450
451#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
452#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
453#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
454
455/* Return one of the PyUnicode_*_KIND values defined above. */
456#define PyUnicode_KIND(op) \
457    (assert(PyUnicode_Check(op)), \
458     assert(PyUnicode_IS_READY(op)),            \
459     ((PyASCIIObject *)(op))->state.kind)
460
461/* Return a void pointer to the raw unicode buffer. */
462#define _PyUnicode_COMPACT_DATA(op)                     \
463    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
464     ((void*)((PyASCIIObject*)(op) + 1)) :              \
465     ((void*)((PyCompactUnicodeObject*)(op) + 1)))
466
467#define _PyUnicode_NONCOMPACT_DATA(op)                  \
468    (assert(((PyUnicodeObject*)(op))->data.any),        \
469     ((((PyUnicodeObject *)(op))->data.any)))
470
471#define PyUnicode_DATA(op) \
472    (assert(PyUnicode_Check(op)), \
473     PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) :   \
474     _PyUnicode_NONCOMPACT_DATA(op))
475
476/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
477   The index is a character index, the result is a size in bytes.
478
479   See also PyUnicode_CHARACTER_SIZE(). */
480#define PyUnicode_KIND_SIZE(kind, index) \
481    (((Py_ssize_t)(index)) << ((kind) - 1))
482
483/* In the access macros below, "kind" may be evaluated more than once.
484   All other macro parameters are evaluated exactly once, so it is safe
485   to put side effects into them (such as increasing the index). */
486
487/* Write into the canonical representation, this macro does not do any sanity
488   checks and is intended for usage in loops.  The caller should cache the
489   kind and data pointers obtained from other macro calls.
490   index is the index in the string (starts at 0) and value is the new
491   code point value which should be written to that location. */
492#define PyUnicode_WRITE(kind, data, index, value) \
493    do { \
494        switch ((kind)) { \
495        case PyUnicode_1BYTE_KIND: { \
496            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
497            break; \
498        } \
499        case PyUnicode_2BYTE_KIND: { \
500            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
501            break; \
502        } \
503        default: { \
504            assert((kind) == PyUnicode_4BYTE_KIND); \
505            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
506        } \
507        } \
508    } while (0)
509
/* Fetch the code point at position `index` of a canonical character buffer
   and return it as Py_UCS4.  `kind` selects how `data` is interpreted:
   Py_UCS1 for PyUnicode_1BYTE_KIND, Py_UCS2 for PyUnicode_2BYTE_KIND and
   Py_UCS4 for every other value.  No checks or ready calls are performed;
   `kind` may be evaluated twice. */
#define PyUnicode_READ(kind, data, index) \
    ((Py_UCS4) \
     ((kind) == PyUnicode_1BYTE_KIND ? ((const Py_UCS1 *)(data))[(index)] : \
      (kind) == PyUnicode_2BYTE_KIND ? ((const Py_UCS2 *)(data))[(index)] : \
                                       ((const Py_UCS4 *)(data))[(index)]))
521
522/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
523   calls PyUnicode_KIND() and might call it twice.  For single reads, use
524   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
525   cache kind and use PyUnicode_READ instead. */
526#define PyUnicode_READ_CHAR(unicode, index) \
527    (assert(PyUnicode_Check(unicode)),          \
528     assert(PyUnicode_IS_READY(unicode)),       \
529     (Py_UCS4)                                  \
530        (PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
531            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] : \
532            (PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
533                ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
534                ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
535            ) \
536        ))
537
/* Returns the length of the unicode string. The caller has to make sure that
   the string has its canonical representation set before calling
   this macro.  Call PyUnicode_READY() to ensure that. */
541#define PyUnicode_GET_LENGTH(op)                \
542    (assert(PyUnicode_Check(op)),               \
543     assert(PyUnicode_IS_READY(op)),            \
544     ((PyASCIIObject *)(op))->length)
545
546
/* Fast check to determine whether an object is ready. Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   Only the state.ready bit is read; no type checks are performed.  op is
   parenthesized before the cast so that an expression argument is cast as
   a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject*)(op))->state.ready)
551
552/* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
553   case.  If the canonical representation is not yet set, it will still call
554   _PyUnicode_Ready().
555   Returns 0 on success and -1 on errors. */
556#define PyUnicode_READY(op)                        \
557    (assert(PyUnicode_Check(op)),                       \
558     (PyUnicode_IS_READY(op) ?                          \
559      0 : _PyUnicode_Ready((PyObject *)(op))))
560
561/* Return a maximum character value which is suitable for creating another
562   string based on op.  This is always an approximation but more efficient
563   than iterating over the string. */
564#define PyUnicode_MAX_CHAR_VALUE(op) \
565    (assert(PyUnicode_IS_READY(op)),                                    \
566     (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f:                            \
567      (PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ?                     \
568       (PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
569        (0x7fU) : (0xffU)                                                 \
570           ) :                                                          \
571       (PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ?                    \
572        (0xffffU) : (0x10ffffU)                                           \
573           ))))
574
575#endif
576
577/* --- Constants ---------------------------------------------------------- */
578
579/* This Unicode character will be used as replacement character during
580   decoding if the errors argument is set to "replace". Note: the
581   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
582   Unicode 3.0. */
583
584#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
585
586/* === Public API ========================================================= */
587
588/* --- Plain Py_UNICODE --------------------------------------------------- */
589
590/* With PEP 393, this is the recommended way to allocate a new unicode object.
591   This function will allocate the object and its buffer in a single memory
592   block.  Objects created using this function are not resizable. */
593#ifndef Py_LIMITED_API
594PyAPI_FUNC(PyObject*) PyUnicode_New(
595    Py_ssize_t size,            /* Number of code points in the new string */
596    Py_UCS4 maxchar             /* maximum code point value in the string */
597    );
598#endif
599
/* Initializes the canonical string representation from the deprecated
601   wstr/Py_UNICODE representation. This function is used to convert Unicode
602   objects which were created using the old API to the new flexible format
603   introduced with PEP 393.
604
605   Don't call this function directly, use the public PyUnicode_READY() macro
606   instead. */
607#ifndef Py_LIMITED_API
608PyAPI_FUNC(int) _PyUnicode_Ready(
609    PyObject *unicode           /* Unicode object */
610    );
611#endif
612
613/* Get a copy of a Unicode string. */
614PyAPI_FUNC(PyObject*) PyUnicode_Copy(
615    PyObject *unicode
616    );
617
/* Copy characters from one unicode object into another. This function performs
   character conversion when necessary and falls back to memcpy if possible.

   Fail if to is too small (smaller than how_many or smaller than
   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
   kind(to), or if to has more than 1 reference.

   Return the number of written characters, or return -1 and raise an exception
   on error.
627
628   Pseudo-code:
629
630       how_many = min(how_many, len(from) - from_start)
631       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
632       return how_many
633
634   Note: The function doesn't write a terminating null character.
635   */
636#ifndef Py_LIMITED_API
637PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
638    PyObject *to,
639    Py_ssize_t to_start,
640    PyObject *from,
641    Py_ssize_t from_start,
642    Py_ssize_t how_many
643    );
644#endif
645
646/* Create a Unicode Object from the Py_UNICODE buffer u of the given
647   size.
648
649   u may be NULL which causes the contents to be undefined. It is the
650   user's responsibility to fill in the needed data afterwards. Note
651   that modifying the Unicode object contents after construction is
652   only allowed if u was set to NULL.
653
654   The buffer is copied into the new object. */
655
656#ifndef Py_LIMITED_API
657PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
658    const Py_UNICODE *u,        /* Unicode buffer */
659    Py_ssize_t size             /* size of buffer */
660    );
661#endif
662
663/* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
664PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
665    const char *u,             /* UTF-8 encoded string */
666    Py_ssize_t size            /* size of buffer */
667    );
668
669/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
670   UTF-8 encoded bytes.  The size is determined with strlen(). */
671PyAPI_FUNC(PyObject*) PyUnicode_FromString(
672    const char *u              /* UTF-8 encoded string */
673    );
674
675/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
676   Scan the string to find the maximum character. */
677#ifndef Py_LIMITED_API
678PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
679    int kind,
680    const void *buffer,
681    Py_ssize_t size);
682#endif
683
684PyAPI_FUNC(PyObject*) PyUnicode_Substring(
685    PyObject *str,
686    Py_ssize_t start,
687    Py_ssize_t end);
688
/* Copy the string into a UCS4 buffer, including the null character if copy_null
   is set. Return NULL and raise an exception on error. Raise a ValueError if
   the buffer is smaller than the string. Return buffer on success.
692
693   buflen is the length of the buffer in (Py_UCS4) characters. */
694PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
695    PyObject *unicode,
696    Py_UCS4* buffer,
697    Py_ssize_t buflen,
698    int copy_null);
699
700/* Copy the string into a UCS4 buffer. A new buffer is allocated using
701 * PyMem_Malloc; if this fails, NULL is returned with a memory error
702   exception set. */
703PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
704
705/* Return a read-only pointer to the Unicode object's internal
706   Py_UNICODE buffer.
707   If the wchar_t/Py_UNICODE representation is not yet available, this
708   function will calculate it. */
709
710#ifndef Py_LIMITED_API
711PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
712    PyObject *unicode           /* Unicode object */
713    );
714#endif
715
716/* Return a read-only pointer to the Unicode object's internal
717   Py_UNICODE buffer and save the length at size.
718   If the wchar_t/Py_UNICODE representation is not yet available, this
719   function will calculate it. */
720
721#ifndef Py_LIMITED_API
722PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
723    PyObject *unicode,          /* Unicode object */
724    Py_ssize_t *size            /* location where to save the length */
725    );
726#endif
727
728/* Get the length of the Unicode object. */
729
730PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
731    PyObject *unicode
732);
733
734/* Get the number of Py_UNICODE units in the
735   string representation. */
736
737PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
738    PyObject *unicode           /* Unicode object */
739    );
740
741/* Read a character from the string. */
742
743PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
744    PyObject *unicode,
745    Py_ssize_t index
746    );
747
748/* Write a character to the string. The string must have been created through
749   PyUnicode_New, must not be shared, and must not have been hashed yet.
750
751   Return 0 on success, -1 on error. */
752
753PyAPI_FUNC(int) PyUnicode_WriteChar(
754    PyObject *unicode,
755    Py_ssize_t index,
756    Py_UCS4 character
757    );
758
759#ifndef Py_LIMITED_API
760/* Get the maximum ordinal for a Unicode character. */
761PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
762#endif
763
/* Resize a Unicode object allocated by the legacy API (e.g.
765   PyUnicode_FromUnicode). Unicode objects allocated by the new API (e.g.
766   PyUnicode_New) cannot be resized by this function.
767
768   The length is a number of Py_UNICODE characters (and not the number of code
769   points).
770
771   *unicode is modified to point to the new (resized) object and 0
772   returned on success.
773
774   If the refcount on the object is 1, the function resizes the string in
775   place, which is usually faster than allocating a new string (and copy
776   characters).
777
778   Error handling is implemented as follows: an exception is set, -1
779   is returned and *unicode left untouched. */
780
781PyAPI_FUNC(int) PyUnicode_Resize(
782    PyObject **unicode,         /* Pointer to the Unicode object */
783    Py_ssize_t length           /* New length */
784    );
785
786/* Coerce obj to a Unicode object and return a reference with
787   *incremented* refcount.
788
789   Coercion is done in the following way:
790
791   1. bytes, bytearray and other char buffer compatible objects are decoded
792      using the given encoding (UTF-8 if encoding is NULL). Error handling
793      is selected by errors, e.g. "strict".
794
795   2. All other objects (including Unicode objects) raise an
796      exception.
797
798   The API returns NULL in case of an error. The caller is responsible
799   for decref'ing the returned objects.
800
801*/
802
803PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
804    PyObject *obj,              /* Object */
805    const char *encoding,       /* encoding */
806    const char *errors          /* error handling */
807    );
808
809/* Coerce obj to a Unicode object and return a reference with
810   *incremented* refcount.
811
812   Unicode objects are passed back as-is (subclasses are converted to
813   true Unicode objects), all other objects are delegated to
814   PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
815   using UTF-8 encoding as basis for decoding the object.
816
817   The API returns NULL in case of an error. The caller is responsible
818   for decref'ing the returned objects.
819
820*/
821
822PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
823    PyObject *obj               /* Object */
824    );
825
826PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
827    const char *format,   /* ASCII-encoded string  */
828    va_list vargs
829    );
830PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
831    const char *format,   /* ASCII-encoded string  */
832    ...
833    );
834
835#ifndef Py_LIMITED_API
836/* Format the object based on the format_spec, as defined in PEP 3101
837   (Advanced String Formatting). */
838PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
839                                                 PyObject *format_spec,
840                                                 Py_ssize_t start,
841                                                 Py_ssize_t end);
842#endif
843
844PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
845PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
846PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
847    const char *u              /* UTF-8 encoded string */
848    );
849#ifndef Py_LIMITED_API
850PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
851#endif
852
853/* Use only if you know it's a string */
854#define PyUnicode_CHECK_INTERNED(op) \
855    (((PyASCIIObject *)(op))->state.interned)
856
857/* --- wchar_t support for platforms which support it --------------------- */
858
859#ifdef HAVE_WCHAR_H
860
861/* Create a Unicode Object from the wchar_t buffer w of the given
862   size.
863
864   The buffer is copied into the new object. */
865
866PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
867    const wchar_t *w,           /* wchar_t buffer (copied, not borrowed) */
868    Py_ssize_t size             /* size of buffer */
869    );
870
871/* Copies the Unicode Object contents into the wchar_t buffer w.  At
872   most size wchar_t characters are copied.
873
874   Note that the resulting wchar_t string may or may not be
875   0-terminated.  It is the responsibility of the caller to make sure
876   that the wchar_t string is 0-terminated in case this is required by
877   the application.
878
879   Returns the number of wchar_t characters copied (excluding a
880   possibly trailing 0-termination character) or -1 in case of an
881   error. */
882
883PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
884    PyObject *unicode,          /* Unicode object */
885    wchar_t *w,                 /* wchar_t buffer (may end up unterminated) */
886    Py_ssize_t size             /* size of buffer */
887    );
888
889/* Convert the Unicode object to a wide character string. The output string
890   always ends with a nul character. If size is not NULL, write the number of
891   wide characters (excluding the null character) into *size.
892
893   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
894   on success. On error, returns NULL, *size is undefined and raises a
895   MemoryError. */
896
897PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
898    PyObject *unicode,          /* Unicode object */
899    Py_ssize_t *size            /* number of characters of the result */
900    );
901
902#ifndef Py_LIMITED_API
903PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
904#endif
905
906#endif
907
908/* --- Unicode ordinals --------------------------------------------------- */
909
910/* Create a Unicode Object from the given Unicode code point ordinal.
911
912   The ordinal must be in range(0x10000) on narrow Python builds
913   (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
914   raised in case it is not.
915
916*/
917
918PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
919
920/* --- Free-list management ----------------------------------------------- */
921
922/* Clear the free list used by the Unicode implementation.
923
924   This can be used to release memory used for objects on the free
925   list back to the Python memory allocator.
926
927*/
928
929PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
930
931/* === Builtin Codecs =====================================================
932
933   Many of these APIs take two arguments encoding and errors. These
934   parameters encoding and errors have the same semantics as the ones
935   of the builtin str() API.
936
937   Setting encoding to NULL causes the default encoding (UTF-8) to be used.
938
939   Error handling is set by errors which may also be set to NULL
940   meaning to use the default handling defined for the codec. Default
941   error handling for all builtin codecs is "strict" (ValueErrors are
942   raised).
943
944   The codecs all use a similar interface. Only deviation from the
945   generic ones are documented.
946
947*/
948
949/* --- Manage the default encoding ---------------------------------------- */
950
951/* Returns a pointer to the default encoding (UTF-8) of the
952   Unicode object unicode and the size of the encoded representation
953   in bytes stored in *size.
954
955   In case of an error, no *size is set.
956
957   This function caches the UTF-8 encoded string in the unicodeobject
958   and subsequent calls will return the same string.  The memory is released
959   when the unicodeobject is deallocated.
960
961   _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
962   support the previous internal function with the same behaviour.
963
964   *** This API is for interpreter INTERNAL USE ONLY and will likely
965   *** be removed or changed in the future.
966
967   *** If you need to access the Unicode object as UTF-8 bytes string,
968   *** please use PyUnicode_AsUTF8String() instead.
969*/
970
971#ifndef Py_LIMITED_API
972PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
973    PyObject *unicode,
974    Py_ssize_t *size);
975#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
976#endif
977
978/* Returns a pointer to the default encoding (UTF-8) of the
979   Unicode object unicode.
980
981   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
982   in the unicodeobject.
983
984   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
985   support the previous internal function with the same behaviour.
986
987   Use of this API is DEPRECATED since no size information can be
988   extracted from the returned data.
989
990   *** This API is for interpreter INTERNAL USE ONLY and will likely
991   *** be removed or changed in the future.
992
993   *** If you need to access the Unicode object as UTF-8 bytes string,
994   *** please use PyUnicode_AsUTF8String() instead.
995
996*/
997
998#ifndef Py_LIMITED_API
999PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
1000#define _PyUnicode_AsString PyUnicode_AsUTF8
1001#endif
1002
1003/* Returns "utf-8".  */
1004
1005PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
1006
1007/* --- Generic Codecs ----------------------------------------------------- */
1008
1009/* Create a Unicode object by decoding the encoded string s of the
1010   given size. */
1011
1012PyAPI_FUNC(PyObject*) PyUnicode_Decode(
1013    const char *s,              /* encoded string */
1014    Py_ssize_t size,            /* size of buffer */
1015    const char *encoding,       /* encoding */
1016    const char *errors          /* error handling */
1017    );
1018
1019/* Decode a Unicode object unicode and return the result as Python
1020   object. */
1021
1022PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
1023    PyObject *unicode,          /* Unicode object */
1024    const char *encoding,       /* encoding */
1025    const char *errors          /* error handling */
1026    );
1027
1028/* Decode a Unicode object unicode and return the result as Unicode
1029   object. */
1030
1031PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
1032    PyObject *unicode,          /* Unicode object */
1033    const char *encoding,       /* encoding */
1034    const char *errors          /* error handling */
1035    );
1036
1037/* Encodes a Py_UNICODE buffer of the given size and returns a
1038   Python string object. */
1039
1040#ifndef Py_LIMITED_API
1041PyAPI_FUNC(PyObject*) PyUnicode_Encode(
1042    const Py_UNICODE *s,        /* Unicode char buffer */
1043    Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
1044    const char *encoding,       /* encoding */
1045    const char *errors          /* error handling */
1046    );
1047#endif
1048
1049/* Encodes a Unicode object and returns the result as Python
1050   object. */
1051
1052PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
1053    PyObject *unicode,          /* Unicode object */
1054    const char *encoding,       /* encoding */
1055    const char *errors          /* error handling */
1056    );
1057
1058/* Encodes a Unicode object and returns the result as Python string
1059   object. */
1060
1061PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
1062    PyObject *unicode,          /* Unicode object */
1063    const char *encoding,       /* encoding */
1064    const char *errors          /* error handling */
1065    );
1066
1067/* Encodes a Unicode object and returns the result as Unicode
1068   object. */
1069
1070PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
1071    PyObject *unicode,          /* Unicode object */
1072    const char *encoding,       /* encoding */
1073    const char *errors          /* error handling */
1074    );
1075
1076/* Build an encoding map. */
1077
1078PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1079    PyObject* string            /* 256 character map */
1080   );
1081
1082/* --- UTF-7 Codecs ------------------------------------------------------- */
1083
1084PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
1085    const char *string,         /* UTF-7 encoded string */
1086    Py_ssize_t length,          /* size of string */
1087    const char *errors          /* error handling */
1088    );
1089
1090PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
1091    const char *string,         /* UTF-7 encoded string */
1092    Py_ssize_t length,          /* size of string */
1093    const char *errors,         /* error handling */
1094    Py_ssize_t *consumed        /* bytes consumed */
1095    );
1096
1097#ifndef Py_LIMITED_API
1098PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
1099    const Py_UNICODE *data,     /* Unicode char buffer */
1100    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1101    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1102    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1103    const char *errors          /* error handling */
1104    );
1105#endif
1106
1107/* --- UTF-8 Codecs ------------------------------------------------------- */
1108
1109PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
1110    const char *string,         /* UTF-8 encoded string */
1111    Py_ssize_t length,          /* size of string */
1112    const char *errors          /* error handling */
1113    );
1114
1115PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
1116    const char *string,         /* UTF-8 encoded string */
1117    Py_ssize_t length,          /* size of string */
1118    const char *errors,         /* error handling */
1119    Py_ssize_t *consumed        /* bytes consumed */
1120    );
1121
1122PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
1123    PyObject *unicode           /* Unicode object */
1124    );
1125
1126#ifndef Py_LIMITED_API
1127PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
1128    PyObject *unicode,
1129    const char *errors);
1130
1131PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
1132    const Py_UNICODE *data,     /* Unicode char buffer */
1133    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1134    const char *errors          /* error handling */
1135    );
1136#endif
1137
1138/* --- UTF-32 Codecs ------------------------------------------------------ */
1139
1140/* Decodes length bytes from a UTF-32 encoded buffer string and returns
1141   the corresponding Unicode object.
1142
1143   errors (if non-NULL) defines the error handling. It defaults
1144   to "strict".
1145
1146   If byteorder is non-NULL, the decoder starts decoding using the
1147   given byte order:
1148
1149    *byteorder == -1: little endian
1150    *byteorder == 0:  native order
1151    *byteorder == 1:  big endian
1152
1153   In native mode, the first four bytes of the stream are checked for a
1154   BOM mark. If found, the BOM mark is analysed, the byte order
1155   adjusted and the BOM skipped.  In the other modes, no BOM mark
1156   interpretation is done. After completion, *byteorder is set to the
1157   current byte order at the end of input data.
1158
1159   If byteorder is NULL, the codec starts in native order mode.
1160
1161*/
1162
1163PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
1164    const char *string,         /* UTF-32 encoded string */
1165    Py_ssize_t length,          /* size of string */
1166    const char *errors,         /* error handling */
1167    int *byteorder              /* pointer to byteorder to use
1168                                   0=native;-1=LE,1=BE; updated on
1169                                   exit */
1170    );
1171
1172PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
1173    const char *string,         /* UTF-32 encoded string */
1174    Py_ssize_t length,          /* size of string */
1175    const char *errors,         /* error handling */
1176    int *byteorder,             /* pointer to byteorder to use
1177                                   0=native;-1=LE,1=BE; updated on
1178                                   exit */
1179    Py_ssize_t *consumed        /* bytes consumed */
1180    );
1181
1182/* Returns a Python string using the UTF-32 encoding in native byte
1183   order. The string always starts with a BOM mark.  */
1184
1185PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
1186    PyObject *unicode           /* Unicode object */
1187    );
1188
1189/* Returns a Python string object holding the UTF-32 encoded value of
1190   the Unicode data.
1191
1192   If byteorder is not 0, output is written according to the following
1193   byte order:
1194
1195   byteorder == -1: little endian
1196   byteorder == 0:  native byte order (writes a BOM mark)
1197   byteorder == 1:  big endian
1198
1199   If byteorder is 0, the output string will always start with the
1200   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1201   prepended.
1202
1203*/
1204
1205#ifndef Py_LIMITED_API
1206PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
1207    const Py_UNICODE *data,     /* Unicode char buffer */
1208    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1209    const char *errors,         /* error handling */
1210    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1211    );
1212#endif
1213
1214/* --- UTF-16 Codecs ------------------------------------------------------ */
1215
1216/* Decodes length bytes from a UTF-16 encoded buffer string and returns
1217   the corresponding Unicode object.
1218
1219   errors (if non-NULL) defines the error handling. It defaults
1220   to "strict".
1221
1222   If byteorder is non-NULL, the decoder starts decoding using the
1223   given byte order:
1224
1225    *byteorder == -1: little endian
1226    *byteorder == 0:  native order
1227    *byteorder == 1:  big endian
1228
1229   In native mode, the first two bytes of the stream are checked for a
1230   BOM mark. If found, the BOM mark is analysed, the byte order
1231   adjusted and the BOM skipped.  In the other modes, no BOM mark
1232   interpretation is done. After completion, *byteorder is set to the
1233   current byte order at the end of input data.
1234
1235   If byteorder is NULL, the codec starts in native order mode.
1236
1237*/
1238
1239PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
1240    const char *string,         /* UTF-16 encoded string */
1241    Py_ssize_t length,          /* size of string */
1242    const char *errors,         /* error handling */
1243    int *byteorder              /* pointer to byteorder to use
1244                                   0=native;-1=LE,1=BE; updated on
1245                                   exit */
1246    );
1247
1248PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
1249    const char *string,         /* UTF-16 encoded string */
1250    Py_ssize_t length,          /* size of string */
1251    const char *errors,         /* error handling */
1252    int *byteorder,             /* pointer to byteorder to use
1253                                   0=native;-1=LE,1=BE; updated on
1254                                   exit */
1255    Py_ssize_t *consumed        /* bytes consumed */
1256    );
1257
1258/* Returns a Python string using the UTF-16 encoding in native byte
1259   order. The string always starts with a BOM mark.  */
1260
1261PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
1262    PyObject *unicode           /* Unicode object */
1263    );
1264
1265/* Returns a Python string object holding the UTF-16 encoded value of
1266   the Unicode data.
1267
1268   If byteorder is not 0, output is written according to the following
1269   byte order:
1270
1271   byteorder == -1: little endian
1272   byteorder == 0:  native byte order (writes a BOM mark)
1273   byteorder == 1:  big endian
1274
1275   If byteorder is 0, the output string will always start with the
1276   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1277   prepended.
1278
1279   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1280   UCS-2. This trick makes it possible to add full UTF-16 capabilities
1281   at a later point without compromising the APIs.
1282
1283*/
1284
1285#ifndef Py_LIMITED_API
1286PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
1287    const Py_UNICODE *data,     /* Unicode char buffer */
1288    Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1289    const char *errors,         /* error handling */
1290    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1291    );
1292#endif
1293
1294/* --- Unicode-Escape Codecs ---------------------------------------------- */
1295
1296PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
1297    const char *string,         /* Unicode-Escape encoded string */
1298    Py_ssize_t length,          /* size of string */
1299    const char *errors          /* error handling */
1300    );
1301
1302PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
1303    PyObject *unicode           /* Unicode object */
1304    );
1305
1306#ifndef Py_LIMITED_API
1307PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
1308    const Py_UNICODE *data,     /* Unicode char buffer */
1309    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1310    );
1311#endif
1312
1313/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1314
1315PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
1316    const char *string,         /* Raw-Unicode-Escape encoded string */
1317    Py_ssize_t length,          /* size of string */
1318    const char *errors          /* error handling */
1319    );
1320
1321PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
1322    PyObject *unicode           /* Unicode object */
1323    );
1324
1325#ifndef Py_LIMITED_API
1326PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
1327    const Py_UNICODE *data,     /* Unicode char buffer */
1328    Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1329    );
1330#endif
1331
1332/* --- Unicode Internal Codec ---------------------------------------------
1333
1334    Only for internal use in _codecsmodule.c */
1335
1336#ifndef Py_LIMITED_API
1337PyObject *_PyUnicode_DecodeUnicodeInternal(
1338    const char *string,
1339    Py_ssize_t length,
1340    const char *errors
1341    );
1342#endif
1343
1344/* --- Latin-1 Codecs -----------------------------------------------------
1345
1346   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1347
1348*/
1349
1350PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
1351    const char *string,         /* Latin-1 encoded string */
1352    Py_ssize_t length,          /* size of string */
1353    const char *errors          /* error handling */
1354    );
1355
1356PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
1357    PyObject *unicode           /* Unicode object */
1358    );
1359
1360#ifndef Py_LIMITED_API
1361PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
1362    PyObject* unicode,
1363    const char* errors);
1364
1365PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
1366    const Py_UNICODE *data,     /* Unicode char buffer */
1367    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1368    const char *errors          /* error handling */
1369    );
1370#endif
1371
1372/* --- ASCII Codecs -------------------------------------------------------
1373
1374   Only 7-bit ASCII data is accepted. All other codes generate errors.
1375
1376*/
1377
1378PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1379    const char *string,         /* ASCII encoded string */
1380    Py_ssize_t length,          /* size of string */
1381    const char *errors          /* error handling */
1382    );
1383
1384PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1385    PyObject *unicode           /* Unicode object */
1386    );
1387
1388#ifndef Py_LIMITED_API
1389PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1390    PyObject* unicode,
1391    const char* errors);
1392
1393PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1394    const Py_UNICODE *data,     /* Unicode char buffer */
1395    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1396    const char *errors          /* error handling */
1397    );
1398#endif
1399
1400/* --- Character Map Codecs -----------------------------------------------
1401
1402   This codec uses mappings to encode and decode characters.
1403
1404   Decoding mappings must map single string characters to single
1405   Unicode characters, integers (which are then interpreted as Unicode
1406   ordinals) or None (meaning "undefined mapping" and causing an
1407   error).
1408
1409   Encoding mappings must map single Unicode characters to single
1410   string characters, integers (which are then interpreted as Latin-1
1411   ordinals) or None (meaning "undefined mapping" and causing an
1412   error).
1413
1414   If a character lookup fails with a LookupError, the character is
1415   copied as-is meaning that its ordinal value will be interpreted as
1416   Unicode or Latin-1 ordinal resp. Because of this mappings only need
1417   to contain those mappings which map characters to different code
1418   points.
1419
1420*/
1421
1422PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1423    const char *string,         /* Encoded string */
1424    Py_ssize_t length,          /* size of string */
1425    PyObject *mapping,          /* character mapping
1426                                   (char ordinal -> unicode ordinal) */
1427    const char *errors          /* error handling */
1428    );
1429
1430PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1431    PyObject *unicode,          /* Unicode object */
1432    PyObject *mapping           /* character mapping
1433                                   (unicode ordinal -> char ordinal) */
1434    );
1435
1436#ifndef Py_LIMITED_API
1437PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1438    const Py_UNICODE *data,     /* Unicode char buffer */
1439    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1440    PyObject *mapping,          /* character mapping
1441                                   (unicode ordinal -> char ordinal) */
1442    const char *errors          /* error handling */
1443    );
1444#endif
1445
1446/* Translate a Py_UNICODE buffer of the given length by applying a
1447   character mapping table to it and return the resulting Unicode
1448   object.
1449
1450   The mapping table must map Unicode ordinal integers to Unicode
1451   ordinal integers or None (causing deletion of the character).
1452
1453   Mapping tables may be dictionaries or sequences. Unmapped character
1454   ordinals (ones which cause a LookupError) are left untouched and
1455   are copied as-is.
1456
1457*/
1458
1459#ifndef Py_LIMITED_API
1460PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1461    const Py_UNICODE *data,     /* Unicode char buffer */
1462    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1463    PyObject *table,            /* Translate table */
1464    const char *errors          /* error handling */
1465    );
1466#endif
1467
1468#ifdef HAVE_MBCS
1469
1470/* --- MBCS codecs for Windows -------------------------------------------- */
1471
1472PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1473    const char *string,         /* MBCS encoded string */
1474    Py_ssize_t length,              /* size of string */
1475    const char *errors          /* error handling */
1476    );
1477
1478PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1479    const char *string,         /* MBCS encoded string */
1480    Py_ssize_t length,          /* size of string */
1481    const char *errors,         /* error handling */
1482    Py_ssize_t *consumed        /* bytes consumed */
1483    );
1484
1485PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1486    PyObject *unicode           /* Unicode object */
1487    );
1488
1489#ifndef Py_LIMITED_API
1490PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1491    const Py_UNICODE *data,     /* Unicode char buffer */
1492    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1493    const char *errors          /* error handling */
1494    );
1495#endif
1496
1497#endif /* HAVE_MBCS */
1498
1499/* --- Decimal Encoder ---------------------------------------------------- */
1500
1501/* Takes a Unicode string holding a decimal value and writes it into
1502   an output buffer using standard ASCII digit codes.
1503
1504   The output buffer has to provide at least length+1 bytes of storage
1505   area. The output string is 0-terminated.
1506
1507   The encoder converts whitespace to ' ', decimal characters to their
1508   corresponding ASCII digit and all other Latin-1 characters except
1509   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1510   are treated as errors. This includes embedded NUL characters.
1511
1512   Error handling is defined by the errors argument:
1513
1514      NULL or "strict": raise a ValueError
1515      "ignore": ignore the wrong characters (these are not copied to the
1516                output buffer)
1517      "replace": replaces illegal characters with '?'
1518
1519   Returns 0 on success, -1 on failure.
1520
1521*/
1522
1523#ifndef Py_LIMITED_API
1524PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1525    Py_UNICODE *s,              /* Unicode buffer */
1526    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1527    char *output,               /* Output buffer; must have size >= length+1 */
1528    const char *errors          /* error handling */
1529    );
1530#endif
1531
1532/* Transforms code points that have decimal digit property to the
1533   corresponding ASCII digit code points.
1534
1535   Returns a new Unicode string on success, NULL on failure.
1536*/
1537
1538#ifndef Py_LIMITED_API
1539PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1540    Py_UNICODE *s,              /* Unicode buffer */
1541    Py_ssize_t length           /* Number of Py_UNICODE chars to transform */
1542    );
1543#endif
1544
1545/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
1546   as argument instead of a raw buffer and length.  This function additionally
1547   transforms spaces to ASCII because this is what the callers in longobject,
1548   floatobject, and complexobject did anyways. */
1549
1550#ifndef Py_LIMITED_API
1551PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1552    PyObject *unicode           /* Unicode object */
1553    );
1554#endif
1555
1556/* --- File system encoding ---------------------------------------------- */
1557
1558/* ParseTuple converter: encode str objects to bytes using
1559   PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
1560
1561PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1562
1563/* ParseTuple converter: decode bytes objects to unicode using
1564   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1565
1566PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1567
1568/* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1569   and the "surrogateescape" error handler.
1570
1571   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1572   encoding.
1573
1574   Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
1575*/
1576
1577PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1578    const char *s               /* encoded string */
1579    );
1580
1581/* Decode a string using Py_FileSystemDefaultEncoding
1582   and the "surrogateescape" error handler.
1583
1584   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1585   encoding.
1586*/
1587
1588PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1589    const char *s,               /* encoded string */
1590    Py_ssize_t size              /* size */
1591    );
1592
1593/* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
1594   "surrogateescape" error handler, and return bytes.
1595
1596   If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1597   encoding.
1598*/
1599
1600PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1601    PyObject *unicode
1602    );
1603
1604/* --- Methods & Slots ----------------------------------------------------
1605
1606   These are capable of handling Unicode objects and strings on input
1607   (we refer to them as strings in the descriptions) and return
1608   Unicode objects or integers as appropriate. */
1609
1610/* Concat two strings giving a new Unicode string. */
1611
1612PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1613    PyObject *left,             /* Left string */
1614    PyObject *right             /* Right string */
1615    );
1616
1617/* Concat two strings and put the result in *pleft
1618   (sets *pleft to NULL on error) */
1619
1620PyAPI_FUNC(void) PyUnicode_Append(
1621    PyObject **pleft,           /* Pointer to left string */
1622    PyObject *right             /* Right string */
1623    );
1624
1625/* Concat two strings, put the result in *pleft and drop the right object
1626   (sets *pleft to NULL on error) */
1627
1628PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1629    PyObject **pleft,           /* Pointer to left string */
1630    PyObject *right             /* Right string */
1631    );
1632
1633/* Split a string, giving a list of Unicode strings.
1634
1635   If sep is NULL, splitting will be done at all whitespace
1636   substrings. Otherwise, splits occur at the given separator.
1637
1638   At most maxsplit splits will be done; a negative maxsplit means no limit.
1639
1640   Separators are not included in the resulting list.
1641
1642*/
1643
1644PyAPI_FUNC(PyObject*) PyUnicode_Split(
1645    PyObject *s,                /* String to split */
1646    PyObject *sep,              /* String separator, or NULL for whitespace */
1647    Py_ssize_t maxsplit         /* Maxsplit count; negative = no limit */
1648    );
1649
1650/* Split a string at line breaks, giving a list of Unicode strings.
1651
1652   CRLF is considered to be one line break. Line breaks are not
1653   included in the resulting list unless keepends is true. */
1654
1655PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1656    PyObject *s,                /* String to split */
1657    int keepends                /* If true, line end markers are included */
1658    );
1659
1660/* Partition a string using a given separator.
   NOTE(review): the shape of the returned object is not described in this
   header; presumably a 3-tuple like str.partition() -- confirm. */
1661
1662PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1663    PyObject *s,                /* String to partition */
1664    PyObject *sep               /* String separator */
1665    );
1666
1667/* Partition a string using a given separator, searching from the end of the
1668   string.
   NOTE(review): the shape of the returned object is not described in this
   header; presumably a 3-tuple like str.rpartition() -- confirm. */
1669
1670PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1671    PyObject *s,                /* String to partition */
1672    PyObject *sep               /* String separator */
1673    );
1674
1675/* Split a string, giving a list of Unicode strings.
1676
1677   If sep is NULL, splitting will be done at all whitespace
1678   substrings. Otherwise, splits occur at the given separator.
1679
1680   At most maxsplit splits will be done; if maxsplit is negative, no
1681   limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1682   the end of the string.
1683
1684   Separators are not included in the resulting list.
1685
1686*/
1687
1688PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1689    PyObject *s,                /* String to split */
1690    PyObject *sep,              /* String separator, or NULL for whitespace */
1691    Py_ssize_t maxsplit         /* Maxsplit count; negative = no limit */
1692    );
1693
1694/* Translate a string by applying a character mapping table to it and
1695   return the resulting Unicode object.
1696
1697   The mapping table must map Unicode ordinal integers to Unicode
1698   ordinal integers or None (causing deletion of the character).
1699
1700   Mapping tables may be dictionaries or sequences. Unmapped character
1701   ordinals (ones which cause a LookupError) are left untouched and
1702   are copied as-is.
1703
1704*/
1705
1706PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1707    PyObject *str,              /* String to translate */
1708    PyObject *table,            /* Translate table (dictionary or sequence) */
1709    const char *errors          /* Error handling */
1710    );
1711
1712/* Join a sequence of strings using the given separator and return
1713   the resulting Unicode string. */
1714
1715PyAPI_FUNC(PyObject*) PyUnicode_Join(
1716    PyObject *separator,        /* Separator string */
1717    PyObject *seq               /* Sequence of strings to join */
1718    );
1719
1720/* Return 1 if substr matches str[start:end] at the given tail end, 0
1721   otherwise. A direction of -1 tests for a prefix match, +1 for a suffix
   match.
   NOTE(review): the error return is not stated in this header; presumably
   -1 with an exception set -- confirm against the implementation. */
1722
1723PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1724    PyObject *str,              /* String */
1725    PyObject *substr,           /* Prefix or Suffix string */
1726    Py_ssize_t start,           /* Start index */
1727    Py_ssize_t end,             /* Stop index */
1728    int direction               /* Tail end: -1 prefix, +1 suffix */
1729    );
1730
1731/* Return the first position of substr in str[start:end] using the
1732   given search direction, or -1 if not found. -2 is returned if an
1733   error occurred; an exception is set in that case. */
1734
1735PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1736    PyObject *str,              /* String */
1737    PyObject *substr,           /* Substring to find */
1738    Py_ssize_t start,           /* Start index */
1739    Py_ssize_t end,             /* Stop index */
1740    int direction               /* Find direction: +1 forward, -1 backward */
1741    );
1742
1743/* Like PyUnicode_Find, but search for a single character only. */
1744PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
1745    PyObject *str,              /* String */
1746    Py_UCS4 ch,                 /* Character to find */
1747    Py_ssize_t start,           /* Start index */
1748    Py_ssize_t end,             /* Stop index */
1749    int direction               /* Find direction, as for PyUnicode_Find */
1750    );
1751
1752/* Count the number of occurrences of substr in str[start:end].
   NOTE(review): the error return is not stated in this header; presumably
   a negative value with an exception set -- confirm. */
1753
1754PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1755    PyObject *str,              /* String */
1756    PyObject *substr,           /* Substring to count */
1757    Py_ssize_t start,           /* Start index */
1758    Py_ssize_t end              /* Stop index */
1759    );
1760
1761/* Replace at most maxcount occurrences of substr in str with replstr
1762   and return the resulting Unicode object. */
1763
1764PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1765    PyObject *str,              /* String */
1766    PyObject *substr,           /* Substring to find */
1767    PyObject *replstr,          /* Replacement string */
1768    Py_ssize_t maxcount         /* Max. number of replacements to apply;
1769                                   -1 = all */
1770    );
1771
1772/* Compare two strings and return -1, 0, 1 for less than, equal,
1773   greater than, respectively.
   NOTE(review): no distinct error value is described here, so -1 may also
   signal failure; callers presumably need PyErr_Occurred() -- confirm. */
1774
1775PyAPI_FUNC(int) PyUnicode_Compare(
1776    PyObject *left,             /* Left string */
1777    PyObject *right             /* Right string */
1778    );
1779
/* Compare a string object with an ASCII-encoded C string.
   NOTE(review): the result presumably follows the -1/0/1 convention of
   PyUnicode_Compare -- confirm against the implementation. */
1780PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1781    PyObject *left,             /* Left string */
1782    const char *right           /* ASCII-encoded string */
1783    );
1784
1785/* Rich compare two strings and return one of the following:
1786
1787   - NULL in case an exception was raised
1788   - Py_True or Py_False for successful comparisons
1789   - Py_NotImplemented in case the type combination is unknown
1790
1791   Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1792   case the conversion of the arguments to Unicode fails with a
1793   UnicodeDecodeError.
1794
1795   Possible values for op:
1796
1797     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1798
1799*/
1800
1801PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1802    PyObject *left,             /* Left string */
1803    PyObject *right,            /* Right string */
1804    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1805    );
1806
1807/* Apply an argument tuple or dictionary to a format string and return
1808   the resulting Unicode string. */
1809
1810PyAPI_FUNC(PyObject *) PyUnicode_Format(
1811    PyObject *format,           /* Format string */
1812    PyObject *args              /* Argument tuple or dictionary */
1813    );
1814
1815/* Check whether element is contained in container and return 1/0
1816   accordingly.
1817
1818   element has to coerce to a one-element Unicode string. -1 is
1819   returned in case of an error. */
1820
1821PyAPI_FUNC(int) PyUnicode_Contains(
1822    PyObject *container,        /* Container string */
1823    PyObject *element           /* Element string */
1824    );
1825
1826/* Check whether the argument is a valid identifier.
   NOTE(review): the int result is presumably 1 for a valid identifier and 0
   otherwise -- confirm against the implementation. */
1827
1828PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1829
1830#ifndef Py_LIMITED_API
1831/* Externally visible for str.strip(unicode); not part of the limited API.
   NOTE(review): striptype presumably selects which end(s) to strip and
   sepobj the characters to remove -- confirm against the implementation. */
1832PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1833    PyUnicodeObject *self,
1834    int striptype,
1835    PyObject *sepobj
1836    );
1837#endif
1838
1839/* Using the current locale, insert the thousands grouping
1840   into the string pointed to by buffer.  For the argument descriptions,
1841   see Objects/stringlib/localeutil.h.
   Only declared when Py_LIMITED_API is not defined. */
1842
1843#ifndef Py_LIMITED_API
1844PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1845                                                   Py_ssize_t n_buffer,
1846                                                   Py_UNICODE *digits,
1847                                                   Py_ssize_t n_digits,
1848                                                   Py_ssize_t min_width);
1849#endif
1850
1851/* Using explicit passed-in values, insert the thousands grouping
1852   into the string pointed to by buffer.  For the argument descriptions,
1853   see Objects/stringlib/localeutil.h.
   Only declared when Py_LIMITED_API is not defined.
   NOTE(review): the "explicit values" are presumably the grouping and
   thousands_sep arguments -- confirm in localeutil.h. */
1854#ifndef Py_LIMITED_API
1855PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1856    PyObject *unicode,
1857    int kind,
1858    void *buffer,
1859    Py_ssize_t n_buffer,
1860    void *digits,
1861    Py_ssize_t n_digits,
1862    Py_ssize_t min_width,
1863    const char *grouping,
1864    const char *thousands_sep);
1865#endif
1866/* === Characters Type APIs =============================================== */
1867
1868/* Helper array used by Py_UNICODE_ISSPACE().
   Declared here without a size; the definition lives elsewhere.
   NOTE(review): presumably indexed by ASCII code -- confirm at the
   definition. */
1869
1870#ifndef Py_LIMITED_API
1871PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1872
1873/* These should not be used directly. Use the Py_UNICODE_IS* and
1874   Py_UNICODE_TO* macros instead.
1875
1876   These APIs are implemented in Objects/unicodectype.c.
1877
1878*/
1879
/* Character predicates: each takes a single Py_UCS4 character and returns
   an int.
   NOTE(review): the exact truth values (presumably nonzero/0) are not
   stated in this header -- confirm in Objects/unicodectype.c. */
1880PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1881    Py_UCS4 ch       /* Unicode character to test */
1882    );
1883
1884PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1885    Py_UCS4 ch       /* Unicode character to test */
1886    );
1887
1888PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1889    Py_UCS4 ch       /* Unicode character to test */
1890    );
1891
1892PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1893    Py_UCS4 ch       /* Unicode character to test */
1894    );
1895
1896PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1897    Py_UCS4 ch       /* Unicode character to test */
1898    );
1899
/* Whitespace predicate for a single Unicode character; returns an int.
   ch is passed by value, so a top-level const qualifier would have no effect
   on the function type; none is written, matching the sibling declarations. */
1900PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1901    Py_UCS4 ch       /* Unicode character */
1902    );
1903
/* Line-break predicate for a single Unicode character; returns an int.
   ch is passed by value, so a top-level const qualifier would have no effect
   on the function type; none is written, matching the sibling declarations. */
1904PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1905    Py_UCS4 ch       /* Unicode character */
1906    );
1907
/* Character conversions. The case conversions return a Py_UCS4; the digit
   conversions return an int and the numeric conversion a double.
   NOTE(review): the value returned for a character that has no such
   property is not stated in this header (presumably negative for the
   int/double variants) -- confirm in Objects/unicodectype.c. */
1908PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
1909    Py_UCS4 ch       /* Unicode character to convert */
1910    );
1911
1912PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
1913    Py_UCS4 ch       /* Unicode character to convert */
1914    );
1915
1916PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
1917    Py_UCS4 ch       /* Unicode character to convert */
1918    );
1919
1920PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1921    Py_UCS4 ch       /* Unicode character to convert */
1922    );
1923
1924PyAPI_FUNC(int) _PyUnicode_ToDigit(
1925    Py_UCS4 ch       /* Unicode character to convert */
1926    );
1927
1928PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1929    Py_UCS4 ch       /* Unicode character to convert */
1930    );
1931
/* Further character predicates: each takes a single Py_UCS4 character and
   returns an int.
   NOTE(review): the exact truth values (presumably nonzero/0) are not
   stated in this header -- confirm in Objects/unicodectype.c. */
1932PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1933    Py_UCS4 ch       /* Unicode character to test */
1934    );
1935
1936PyAPI_FUNC(int) _PyUnicode_IsDigit(
1937    Py_UCS4 ch       /* Unicode character to test */
1938    );
1939
1940PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1941    Py_UCS4 ch       /* Unicode character to test */
1942    );
1943
1944PyAPI_FUNC(int) _PyUnicode_IsPrintable(
1945    Py_UCS4 ch       /* Unicode character to test */
1946    );
1947
1948PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1949    Py_UCS4 ch       /* Unicode character to test */
1950    );
1951
/* Helpers on Py_UNICODE buffers whose names and signatures mirror the C
   <string.h> functions strlen, strcpy, strcat, strncpy, strcmp, strncmp,
   strchr and strrchr.
   NOTE(review): presumably they operate on nul-terminated Py_UNICODE
   buffers with the same contracts as their char counterparts -- confirm
   against the implementation. */
1952PyAPI_FUNC(size_t) Py_UNICODE_strlen(
1953    const Py_UNICODE *u
1954    );
1955
1956PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
1957    Py_UNICODE *s1,
1958    const Py_UNICODE *s2);
1959
1960PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
1961    Py_UNICODE *s1, const Py_UNICODE *s2);
1962
1963PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
1964    Py_UNICODE *s1,
1965    const Py_UNICODE *s2,
1966    size_t n);
1967
1968PyAPI_FUNC(int) Py_UNICODE_strcmp(
1969    const Py_UNICODE *s1,
1970    const Py_UNICODE *s2
1971    );
1972
1973PyAPI_FUNC(int) Py_UNICODE_strncmp(
1974    const Py_UNICODE *s1,
1975    const Py_UNICODE *s2,
1976    size_t n
1977    );
1978
1979PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
1980    const Py_UNICODE *s,
1981    Py_UNICODE c
1982    );
1983
1984PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
1985    const Py_UNICODE *s,
1986    Py_UNICODE c
1987    );
1988
/* Helpers on Py_UCS4 buffers whose names and signatures mirror the C
   <string.h> functions strlen, strcpy, strcat, strncpy, strcmp, strncmp,
   strchr and strrchr.
   NOTE(review): presumably they operate on nul-terminated Py_UCS4 buffers
   with the same contracts as their char counterparts -- confirm against
   the implementation. */
1989PyAPI_FUNC(size_t) Py_UCS4_strlen(
1990    const Py_UCS4 *u
1991    );
1992
1993PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
1994    Py_UCS4 *s1,
1995    const Py_UCS4 *s2);
1996
1997PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
1998    Py_UCS4 *s1, const Py_UCS4 *s2);
1999
2000PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
2001    Py_UCS4 *s1,
2002    const Py_UCS4 *s2,
2003    size_t n);
2004
2005PyAPI_FUNC(int) Py_UCS4_strcmp(
2006    const Py_UCS4 *s1,
2007    const Py_UCS4 *s2
2008    );
2009
2010PyAPI_FUNC(int) Py_UCS4_strncmp(
2011    const Py_UCS4 *s1,
2012    const Py_UCS4 *s2,
2013    size_t n
2014    );
2015
2016PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
2017    const Py_UCS4 *s,
2018    Py_UCS4 c
2019    );
2020
2021PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
2022    const Py_UCS4 *s,
2023    Py_UCS4 c
2024    );
2025
2026/* Create a copy of a unicode string ending with a nul character. Return NULL
2027   and raise a MemoryError exception on memory allocation failure, otherwise
2028   return a newly allocated buffer (use PyMem_Free() to free the buffer). */
2029
2030PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
2031    PyObject *unicode           /* Unicode string to copy */
2032    );
2033#endif /* Py_LIMITED_API */
2034
2035#if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
2036/* Only declared in Py_DEBUG builds outside the limited API.
   NOTE(review): the meaning of the int result and of check_content is not
   stated in this header -- confirm against the implementation.
   FIXME: use PyObject* type for op */
2037PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
2038    void *op,
2039    int check_content);
2040#endif
2041
2042#ifdef __cplusplus
2043}
2044#endif
2045#endif /* !Py_UNICODEOBJECT_H */
2046