unicodeobject.h revision f03e74126e5702edab33148140e84d21471424ce
1#ifndef Py_UNICODEOBJECT_H
2#define Py_UNICODEOBJECT_H
3
4/*
5
6Unicode implementation based on original code by Fredrik Lundh,
7modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8Unicode Integration Proposal (see file Misc/unicode.txt).
9
10(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
11
12
13 Original header:
14 --------------------------------------------------------------------
15
16 * Yet another Unicode string type for Python.  This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
18 *
19 * Written by Fredrik Lundh, January 1999.
20 *
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
23 *
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
26 *
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
29 *
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
32 *
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
36 *
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
45 *
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
54
55#include "ctype.h"
56
57/* === Internal API ======================================================= */
58
59/* --- Internal Unicode Format -------------------------------------------- */
60
61/* Set these flags if the platform has "wchar.h", "wctype.h" and the
62   wchar_t type is a 16-bit unsigned type */
63/* #define HAVE_WCHAR_H */
64/* #define HAVE_USABLE_WCHAR_T */
65
66/* Defaults for various platforms */
67#ifndef HAVE_USABLE_WCHAR_T
68
69/* Windows has a usable wchar_t type */
70# if defined(MS_WIN32)
71#  define HAVE_USABLE_WCHAR_T
72# endif
73
74#endif
75
76/* If the compiler provides a wchar_t type we try to support it
77   through the interface functions PyUnicode_FromWideChar() and
78   PyUnicode_AsWideChar(). */
79
80#ifdef HAVE_USABLE_WCHAR_T
81# define HAVE_WCHAR_H
82#endif
83
84#ifdef HAVE_WCHAR_H
85/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
86# ifdef _HAVE_BSDI
87#  include <time.h>
88# endif
89# include "wchar.h"
90#endif
91
92#ifdef HAVE_USABLE_WCHAR_T
93
94/* If the compiler defines wchar_t as a 16-bit unsigned type we can
95   use the compiler type directly.  Works fine with all modern Windows
96   platforms. */
97
98typedef wchar_t Py_UNICODE;
99
100#else
101
102/* Use if you have a standard ANSI compiler, without wchar_t support.
103   If a short is not 16 bits on your platform, you have to fix the
104   typedef below, or the module initialization code will complain. */
105
106typedef unsigned short Py_UNICODE;
107
108#endif
109
110/* --- Internal Unicode Operations ---------------------------------------- */
111
112/* If you want Python to use the compiler's wctype.h functions instead
113   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
114   configure Python using --with-ctype-functions.  This reduces the
115   interpreter's code size. */
116
/* Character classification and case mapping macros.

   The macros in this first group always forward to the _PyUnicode_*
   helpers, whichever implementation is selected below. */

#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

/* The remaining macros use the C library's wctype.h routines when
   wchar_t is usable as Py_UNICODE and WANT_WCTYPE_FUNCTIONS is defined;
   otherwise they use the _PyUnicode_* helpers as well. */

#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

#include "wctype.h"

#define Py_UNICODE_ISSPACE(ch) iswspace(ch)
#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
#define Py_UNICODE_TOLOWER(ch) towlower(ch)
#define Py_UNICODE_TOUPPER(ch) towupper(ch)
#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)

#else

#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

#endif
166
/* True when ch is alphabetic or is classified as a decimal, digit or
   numeric character.  ch may be evaluated up to four times. */
#define Py_UNICODE_ISALNUM(ch)      \
    (Py_UNICODE_ISALPHA(ch)   ||    \
     Py_UNICODE_ISDECIMAL(ch) ||    \
     Py_UNICODE_ISDIGIT(ch)   ||    \
     Py_UNICODE_ISNUMERIC(ch))
172
/* Copy length Py_UNICODE items from source to target with memcpy(), so
   the two buffers must not overlap.  Evaluates to memcpy()'s result. */
#define Py_UNICODE_COPY(target, source, length) \
    (memcpy((target), (source), sizeof(Py_UNICODE) * (length)))
175
/* Store value into the first length items of the Py_UNICODE buffer target.

   Every argument is evaluated exactly once, and the temporaries use names
   with a trailing underscore so they cannot capture a caller variable such
   as i that appears in one of the arguments.  Expands to one statement. */
#define Py_UNICODE_FILL(target, value, length) do\
    {Py_UNICODE *fill_t_ = (target); Py_UNICODE fill_v_ = (value);\
     int fill_n_ = (length); int fill_i_;\
     for (fill_i_ = 0; fill_i_ < fill_n_; fill_i_++)\
         fill_t_[fill_i_] = fill_v_;}\
    while (0)
179
/* Non-zero when the data of substring equals the data of string starting
   at index offset.

   The first characters are compared before memcmp() is called (presumably
   as a cheap pre-check).  No bounds checking is done by the macro. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    ((string)->str[(offset)] == (substring)->str[0] && \
     memcmp(&(string)->str[(offset)], (substring)->str, \
            sizeof(Py_UNICODE) * (substring)->length) == 0)
184
185#ifdef __cplusplus
186extern "C" {
187#endif
188
189/* --- Unicode Type ------------------------------------------------------- */
190
191typedef struct {
192    PyObject_HEAD
193    int length;			/* Length of raw Unicode data in buffer */
194    Py_UNICODE *str;		/* Raw Unicode buffer */
195    long hash;			/* Hash value; -1 if not set */
196    PyObject *utf8str;		/* UTF-8 encoded version as Python string,
197				   or NULL */
198} PyUnicodeObject;
199
200extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
201
/* Exact type test: true only when op's ob_type is PyUnicode_Type itself. */
#define PyUnicode_Check(op) ((op)->ob_type == &PyUnicode_Type)
203
/* Fast access macros.  op is cast to PyUnicodeObject * without any type
   check. */

/* Length in Py_UNICODE items. */
#define PyUnicode_GET_SIZE(op) (((PyUnicodeObject *)(op))->length)

/* Length in bytes. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))

/* The internal buffer as Py_UNICODE *. */
#define PyUnicode_AS_UNICODE(op) (((PyUnicodeObject *)(op))->str)

/* The internal buffer as const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)((PyUnicodeObject *)(op))->str)
213
214/* --- Constants ---------------------------------------------------------- */
215
/* Character used as the replacement during decoding when the errors
   argument is set to "replace".  U+FFFD is the official REPLACEMENT
   CHARACTER in Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE)0xFFFD)
222
223/* === Public API ========================================================= */
224
225/* --- Plain Py_UNICODE --------------------------------------------------- */
226
227/* Create a Unicode Object from the Py_UNICODE buffer u of the given
228   size. u may be NULL which causes the contents to be undefined. It
229   is the user's responsibility to fill in the needed data.
230
231   The buffer is copied into the new object. */
232
233extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
234    const Py_UNICODE *u,        /* Unicode buffer */
235    int size                    /* size of buffer */
236    );
237
238/* Return a read-only pointer to the Unicode object's internal
239   Py_UNICODE buffer. */
240
241extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
242    PyObject *unicode	 	/* Unicode object */
243    );
244
245/* Get the length of the Unicode object. */
246
247extern DL_IMPORT(int) PyUnicode_GetSize(
248    PyObject *unicode	 	/* Unicode object */
249    );
250
251/* Resize an already allocated Unicode object to the new size length.
252
253   *unicode is modified to point to the new (resized) object and 0
254   returned on success.
255
256   This API may only be called by the function which also called the
257   Unicode constructor. The refcount on the object must be 1. Otherwise,
258   an error is returned.
259
260   Error handling is implemented as follows: an exception is set, -1
261   is returned and *unicode left untouched.
262
263*/
264
265extern DL_IMPORT(int) PyUnicode_Resize(
266    PyObject **unicode,		/* Pointer to the Unicode object */
267    int length			/* New length */
268    );
269
270/* Coerce obj to a Unicode object and return a reference with
271   *incremented* refcount.
272
273   Coercion is done in the following way:
274
275   1. Unicode objects are passed back as-is with incremented
276      refcount.
277
278   2. String and other char buffer compatible objects are decoded
279      under the assumptions that they contain data using the current
280      default encoding. Decoding is done in "strict" mode.
281
282   3. All other objects raise an exception.
283
284   The API returns NULL in case of an error. The caller is responsible
285   for decref'ing the returned objects.
286
287*/
288
289extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
290    register PyObject *obj 	/* Object */
291    );
292
293/* --- wchar_t support for platforms which support it --------------------- */
294
295#ifdef HAVE_WCHAR_H
296
297/* Create a Unicode Object from the wchar_t buffer w of the given
298   size.
299
300   The buffer is copied into the new object. */
301
302extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
303    register const wchar_t *w,  /* wchar_t buffer */
304    int size                    /* size of buffer */
305    );
306
307/* Copies the Unicode Object contents into the wchar_t buffer w.  At
308   most size wchar_t characters are copied.
309
310   Returns the number of wchar_t characters copied or -1 in case of an
311   error. */
312
313extern DL_IMPORT(int) PyUnicode_AsWideChar(
314    PyUnicodeObject *unicode,   /* Unicode object */
315    register wchar_t *w,        /* wchar_t buffer */
316    int size                    /* size of buffer */
317    );
318
319#endif
320
321/* === Builtin Codecs =====================================================
322
323   Many of these APIs take two arguments encoding and errors. These
324   parameters encoding and errors have the same semantics as the ones
325   of the builtin unicode() API.
326
327   Setting encoding to NULL causes the default encoding to be used.
328
329   Error handling is set by errors which may also be set to NULL
330   meaning to use the default handling defined for the codec. Default
331   error handling for all builtin codecs is "strict" (ValueErrors are
332   raised).
333
334   The codecs all use a similar interface. Only deviations from the
335   generic ones are documented.
336
337*/
338
339/* --- Manage the default encoding ---------------------------------------- */
340
341/* Returns the currently active default encoding.
342
343   The default encoding is currently implemented as run-time settable
344   process global.  This may change in future versions of the
345   interpreter to become a parameter which is managed on a per-thread
346   basis.
347
348 */
349
350extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding();
351
352/* Sets the currently active default encoding.
353
354   Returns 0 on success, -1 in case of an error.
355
356 */
357
358extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
359    const char *encoding	/* Encoding name in standard form */
360    );
361
362/* --- Generic Codecs ----------------------------------------------------- */
363
364/* Create a Unicode object by decoding the encoded string s of the
365   given size. */
366
367extern DL_IMPORT(PyObject*) PyUnicode_Decode(
368    const char *s,              /* encoded string */
369    int size,                   /* size of buffer */
370    const char *encoding,       /* encoding */
371    const char *errors          /* error handling */
372    );
373
374/* Encodes a Py_UNICODE buffer of the given size and returns a
375   Python string object. */
376
377extern DL_IMPORT(PyObject*) PyUnicode_Encode(
378    const Py_UNICODE *s,        /* Unicode char buffer */
379    int size,                   /* number of Py_UNICODE chars to encode */
380    const char *encoding,       /* encoding */
381    const char *errors          /* error handling */
382    );
383
384/* Encodes a Unicode object and returns the result as Python string
385   object. */
386
387extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
388    PyObject *unicode,	 	/* Unicode object */
389    const char *encoding,	/* encoding */
390    const char *errors		/* error handling */
391    );
392
393/* --- UTF-8 Codecs ------------------------------------------------------- */
394
395extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
396    const char *string, 	/* UTF-8 encoded string */
397    int length,	 		/* size of string */
398    const char *errors		/* error handling */
399    );
400
401extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
402    PyObject *unicode	 	/* Unicode object */
403    );
404
405extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
406    const Py_UNICODE *data, 	/* Unicode char buffer */
407    int length,	 		/* number of Py_UNICODE chars to encode */
408    const char *errors		/* error handling */
409    );
410
411/* --- UTF-16 Codecs ------------------------------------------------------ */
412
413/* Decodes length bytes from a UTF-16 encoded buffer string and returns
414   the corresponding Unicode object.
415
416   errors (if non-NULL) defines the error handling. It defaults
417   to "strict".
418
419   If byteorder is non-NULL, the decoder starts decoding using the
420   given byte order:
421
422	*byteorder == -1: little endian
423	*byteorder == 0:  native order
424	*byteorder == 1:  big endian
425
426   and then switches according to all BOM marks it finds in the input
427   data. BOM marks are not copied into the resulting Unicode string.
428   After completion, *byteorder is set to the current byte order at
429   the end of input data.
430
431   If byteorder is NULL, the codec starts in native order mode.
432
433*/
434
435extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
436    const char *string, 	/* UTF-16 encoded string */
437    int length,	 		/* size of string */
438    const char *errors,		/* error handling */
439    int *byteorder		/* pointer to byteorder to use
440				   0=native;-1=LE,1=BE; updated on
441				   exit */
442    );
443
444/* Returns a Python string using the UTF-16 encoding in native byte
445   order. The string always starts with a BOM mark.  */
446
447extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
448    PyObject *unicode	 	/* Unicode object */
449    );
450
451/* Returns a Python string object holding the UTF-16 encoded value of
452   the Unicode data.
453
454   If byteorder is not 0, output is written according to the following
455   byte order:
456
457   byteorder == -1: little endian
458   byteorder == 0:  native byte order (writes a BOM mark)
459   byteorder == 1:  big endian
460
461   If byteorder is 0, the output string will always start with the
462   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
463   prepended.
464
465   Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
466   UCS-2. This trick makes it possible to add full UTF-16 capabilities
467   at a later point without compromising the APIs.
468
469*/
470
471extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
472    const Py_UNICODE *data, 	/* Unicode char buffer */
473    int length,	 		/* number of Py_UNICODE chars to encode */
474    const char *errors,		/* error handling */
475    int byteorder		/* byteorder to use 0=BOM+native;-1=LE,1=BE */
476    );
477
478/* --- Unicode-Escape Codecs ---------------------------------------------- */
479
480extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
481    const char *string, 	/* Unicode-Escape encoded string */
482    int length,	 		/* size of string */
483    const char *errors		/* error handling */
484    );
485
486extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
487    PyObject *unicode	 	/* Unicode object */
488    );
489
490extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
491    const Py_UNICODE *data, 	/* Unicode char buffer */
492    int length	 		/* Number of Py_UNICODE chars to encode */
493    );
494
495/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
496
497extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
498    const char *string, 	/* Raw-Unicode-Escape encoded string */
499    int length,	 		/* size of string */
500    const char *errors		/* error handling */
501    );
502
503extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
504    PyObject *unicode	 	/* Unicode object */
505    );
506
507extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
508    const Py_UNICODE *data, 	/* Unicode char buffer */
509    int length	 		/* Number of Py_UNICODE chars to encode */
510    );
511
512/* --- Latin-1 Codecs -----------------------------------------------------
513
514   Note: Latin-1 corresponds to the first 256 Unicode ordinals.
515
516*/
517
518extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
519    const char *string, 	/* Latin-1 encoded string */
520    int length,	 		/* size of string */
521    const char *errors		/* error handling */
522    );
523
524extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
525    PyObject *unicode	 	/* Unicode object */
526    );
527
528extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
529    const Py_UNICODE *data, 	/* Unicode char buffer */
530    int length,	 		/* Number of Py_UNICODE chars to encode */
531    const char *errors		/* error handling */
532    );
533
534/* --- ASCII Codecs -------------------------------------------------------
535
536   Only 7-bit ASCII data is accepted. All other codes generate errors.
537
538*/
539
540extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
541    const char *string, 	/* ASCII encoded string */
542    int length,	 		/* size of string */
543    const char *errors		/* error handling */
544    );
545
546extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
547    PyObject *unicode	 	/* Unicode object */
548    );
549
550extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
551    const Py_UNICODE *data, 	/* Unicode char buffer */
552    int length,	 		/* Number of Py_UNICODE chars to encode */
553    const char *errors		/* error handling */
554    );
555
556/* --- Character Map Codecs -----------------------------------------------
557
558   This codec uses mappings to encode and decode characters.
559
560   Decoding mappings must map single string characters to single
561   Unicode characters, integers (which are then interpreted as Unicode
562   ordinals) or None (meaning "undefined mapping" and causing an
563   error).
564
565   Encoding mappings must map single Unicode characters to single
566   string characters, integers (which are then interpreted as Latin-1
567   ordinals) or None (meaning "undefined mapping" and causing an
568   error).
569
570   If a character lookup fails with a LookupError, the character is
571   copied as-is meaning that its ordinal value will be interpreted as
572   Unicode or Latin-1 ordinal resp. Because of this mappings only need
573   to contain those mappings which map characters to different code
574   points.
575
576*/
577
578extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
579    const char *string, 	/* Encoded string */
580    int length,	 		/* size of string */
581    PyObject *mapping,		/* character mapping
582				   (char ordinal -> unicode ordinal) */
583    const char *errors		/* error handling */
584    );
585
586extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
587    PyObject *unicode,	 	/* Unicode object */
588    PyObject *mapping		/* character mapping
589				   (unicode ordinal -> char ordinal) */
590    );
591
592extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
593    const Py_UNICODE *data, 	/* Unicode char buffer */
594    int length,	 		/* Number of Py_UNICODE chars to encode */
595    PyObject *mapping,		/* character mapping
596				   (unicode ordinal -> char ordinal) */
597    const char *errors		/* error handling */
598    );
599
600/* Translate a Py_UNICODE buffer of the given length by applying a
601   character mapping table to it and return the resulting Unicode
602   object.
603
604   The mapping table must map Unicode ordinal integers to Unicode
605   ordinal integers or None (causing deletion of the character).
606
607   Mapping tables may be dictionaries or sequences. Unmapped character
608   ordinals (ones which cause a LookupError) are left untouched and
609   are copied as-is.
610
611*/
612
613extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
614    const Py_UNICODE *data, 	/* Unicode char buffer */
615    int length,	 		/* Number of Py_UNICODE chars to encode */
616    PyObject *table,		/* Translate table */
617    const char *errors		/* error handling */
618    );
619
620#ifdef MS_WIN32
621
622/* --- MBCS codecs for Windows -------------------------------------------- */
623
624extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
625    const char *string,         /* MBCS encoded string */
626    int length,                 /* size of string */
627    const char *errors          /* error handling */
628    );
629
630extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
631    PyObject *unicode           /* Unicode object */
632    );
633
634extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
635    const Py_UNICODE *data,     /* Unicode char buffer */
636    int length,                 /* Number of Py_UNICODE chars to encode */
637    const char *errors          /* error handling */
638    );
639
640#endif /* MS_WIN32 */
641
642/* --- Decimal Encoder ---------------------------------------------------- */
643
644/* Takes a Unicode string holding a decimal value and writes it into
645   an output buffer using standard ASCII digit codes.
646
647   The output buffer has to provide at least length+1 bytes of storage
648   area. The output string is 0-terminated.
649
650   The encoder converts whitespace to ' ', decimal characters to their
651   corresponding ASCII digit and all other Latin-1 characters except
652   \0 as-is. Characters outside this range (Unicode ordinals 1-255)
653   are treated as errors. This includes embedded NUL characters.
654
655   Error handling is defined by the errors argument:
656
657      NULL or "strict": raise a ValueError
658      "ignore": ignore the wrong characters (these are not copied to the
659		output buffer)
660      "replace": replaces illegal characters with '?'
661
662   Returns 0 on success, -1 on failure.
663
664*/
665
666extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
667    Py_UNICODE *s,		/* Unicode buffer */
668    int length,			/* Number of Py_UNICODE chars to encode */
669    char *output,		/* Output buffer; must have size >= length+1 */
670    const char *errors		/* error handling */
671    );
672
673/* --- Methods & Slots ----------------------------------------------------
674
675   These are capable of handling Unicode objects and strings on input
676   (we refer to them as strings in the descriptions) and return
677   Unicode objects or integers as appropriate. */
678
679/* Concat two strings giving a new Unicode string. */
680
681extern DL_IMPORT(PyObject*) PyUnicode_Concat(
682    PyObject *left,	 	/* Left string */
683    PyObject *right	 	/* Right string */
684    );
685
686/* Split a string giving a list of Unicode strings.
687
688   If sep is NULL, splitting will be done at all whitespace
689   substrings. Otherwise, splits occur at the given separator.
690
691   At most maxsplit splits will be done. If negative, no limit is set.
692
693   Separators are not included in the resulting list.
694
695*/
696
697extern DL_IMPORT(PyObject*) PyUnicode_Split(
698    PyObject *s,		/* String to split */
699    PyObject *sep,		/* String separator */
700    int maxsplit		/* Maxsplit count */
701    );
702
703/* Ditto, but split at line breaks.
704
705   CRLF is considered to be one line break. Line breaks are not
706   included in the resulting list unless keepends is true. */
707
708extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
709    PyObject *s,		/* String to split */
710    int keepends		/* If true, line end markers are included */
711    );
712
713/* Translate a string by applying a character mapping table to it and
714   return the resulting Unicode object.
715
716   The mapping table must map Unicode ordinal integers to Unicode
717   ordinal integers or None (causing deletion of the character).
718
719   Mapping tables may be dictionaries or sequences. Unmapped character
720   ordinals (ones which cause a LookupError) are left untouched and
721   are copied as-is.
722
723*/
724
725extern DL_IMPORT(PyObject *) PyUnicode_Translate(
726    PyObject *str,		/* String */
727    PyObject *table,		/* Translate table */
728    const char *errors		/* error handling */
729    );
730
731/* Join a sequence of strings using the given separator and return
732   the resulting Unicode string. */
733
734extern DL_IMPORT(PyObject*) PyUnicode_Join(
735    PyObject *separator, 	/* Separator string */
736    PyObject *seq	 	/* Sequence object */
737    );
738
739/* Return 1 if substr matches str[start:end] at the given tail end, 0
740   otherwise. */
741
742extern DL_IMPORT(int) PyUnicode_Tailmatch(
743    PyObject *str,		/* String */
744    PyObject *substr,		/* Prefix or Suffix string */
745    int start,			/* Start index */
746    int end,			/* Stop index */
747    int direction		/* Tail end: -1 prefix, +1 suffix */
748    );
749
750/* Return the first position of substr in str[start:end] using the
751   given search direction or -1 if not found. */
752
753extern DL_IMPORT(int) PyUnicode_Find(
754    PyObject *str,		/* String */
755    PyObject *substr,		/* Substring to find */
756    int start,			/* Start index */
757    int end,			/* Stop index */
758    int direction		/* Find direction: +1 forward, -1 backward */
759    );
760
761/* Count the number of occurrences of substr in str[start:end]. */
762
763extern DL_IMPORT(int) PyUnicode_Count(
764    PyObject *str,		/* String */
765    PyObject *substr,		/* Substring to count */
766    int start,			/* Start index */
767    int end			/* Stop index */
768    );
769
770/* Replace at most maxcount occurrences of substr in str with replstr
771   and return the resulting Unicode object. */
772
773extern DL_IMPORT(PyObject *) PyUnicode_Replace(
774    PyObject *str,		/* String */
775    PyObject *substr,		/* Substring to find */
776    PyObject *replstr,		/* Substring to replace */
777    int maxcount		/* Max. number of replacements to apply;
778				   -1 = all */
779    );
780
781/* Compare two strings and return -1, 0, 1 for less than, equal,
782   greater than resp. */
783
784extern DL_IMPORT(int) PyUnicode_Compare(
785    PyObject *left,		/* Left string */
786    PyObject *right		/* Right string */
787    );
788
789/* Apply an argument tuple or dictionary to a format string and return
790   the resulting Unicode string. */
791
792extern DL_IMPORT(PyObject *) PyUnicode_Format(
793    PyObject *format,		/* Format string */
794    PyObject *args		/* Argument tuple or dictionary */
795    );
796
797/* Checks whether element is contained in container and return 1/0
798   accordingly.
799
800   element has to coerce to a one-element Unicode string. -1 is
801   returned in case of an error. */
802
803extern DL_IMPORT(int) PyUnicode_Contains(
804    PyObject *container,	/* Container string */
805    PyObject *element		/* Element string */
806    );
807
808/* === Characters Type APIs =============================================== */
809
810/* These should not be used directly. Use the Py_UNICODE_IS* and
811   Py_UNICODE_TO* macros instead.
812
813   These APIs are implemented in Objects/unicodectype.c.
814
815*/
816
817extern DL_IMPORT(int) _PyUnicode_IsLowercase(
818    register const Py_UNICODE ch 	/* Unicode character */
819    );
820
821extern DL_IMPORT(int) _PyUnicode_IsUppercase(
822    register const Py_UNICODE ch 	/* Unicode character */
823    );
824
825extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
826    register const Py_UNICODE ch 	/* Unicode character */
827    );
828
829extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
830    register const Py_UNICODE ch 	/* Unicode character */
831    );
832
833extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
834    register const Py_UNICODE ch 	/* Unicode character */
835    );
836
837extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
838    register const Py_UNICODE ch 	/* Unicode character */
839    );
840
841extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
842    register const Py_UNICODE ch 	/* Unicode character */
843    );
844
845extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
846    register const Py_UNICODE ch 	/* Unicode character */
847    );
848
849extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
850    register const Py_UNICODE ch 	/* Unicode character */
851    );
852
853extern DL_IMPORT(int) _PyUnicode_ToDigit(
854    register const Py_UNICODE ch 	/* Unicode character */
855    );
856
857extern DL_IMPORT(double) _PyUnicode_ToNumeric(
858    register const Py_UNICODE ch 	/* Unicode character */
859    );
860
861extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
862    register const Py_UNICODE ch 	/* Unicode character */
863    );
864
865extern DL_IMPORT(int) _PyUnicode_IsDigit(
866    register const Py_UNICODE ch 	/* Unicode character */
867    );
868
869extern DL_IMPORT(int) _PyUnicode_IsNumeric(
870    register const Py_UNICODE ch 	/* Unicode character */
871    );
872
873extern DL_IMPORT(int) _PyUnicode_IsAlpha(
874    register const Py_UNICODE ch 	/* Unicode character */
875    );
876
877#ifdef __cplusplus
878}
879#endif
880#endif /* !Py_UNICODEOBJECT_H */
881