unicodedata.c revision 7b7dd107b3654926fb75215805d6c0c8a15bf89e
1/* ------------------------------------------------------------------------
2
3   unicodedata -- Provides access to the Unicode 3.0 data base.
4
5   Data was extracted from the Unicode 3.0 UnicodeData.txt file.
6
7   Written by Marc-Andre Lemburg (mal@lemburg.com).
8   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9
10   Copyright (c) Corporation for National Research Initiatives.
11
12   ------------------------------------------------------------------------ */
13
14#include "Python.h"
15#include "unicodedatabase.h"
16
/* One entry of the _PyUnicode_Database_Records table.  All four
   properties are stored as single unsigned bytes. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;
25
26/* data file generated by Tools/unicode/makeunicodedata.py */
27#include "unicodedata_db.h"
28
29static const _PyUnicode_DatabaseRecord*
30getrecord(PyUnicodeObject* v)
31{
32    int code;
33    int index;
34
35    code = (int) *PyUnicode_AS_UNICODE(v);
36
37    if (code < 0 || code >= 65536)
38        index = 0;
39    else {
40        index = index1[(code>>SHIFT)];
41        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
42    }
43
44    return &_PyUnicode_Database_Records[index];
45}
46
47/* --- Module API --------------------------------------------------------- */
48
49static PyObject *
50unicodedata_decimal(PyObject *self, PyObject *args)
51{
52    PyUnicodeObject *v;
53    PyObject *defobj = NULL;
54    long rc;
55
56    if (!PyArg_ParseTuple(args, "O!|O:decimal",
57			  &PyUnicode_Type, &v, &defobj))
58        return NULL;
59    if (PyUnicode_GET_SIZE(v) != 1) {
60	PyErr_SetString(PyExc_TypeError,
61			"need a single Unicode character as parameter");
62        return NULL;
63    }
64    rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
65    if (rc < 0) {
66	if (defobj == NULL) {
67	    PyErr_SetString(PyExc_ValueError,
68			    "not a decimal");
69            return NULL;
70	}
71	else {
72	    Py_INCREF(defobj);
73	    return defobj;
74	}
75    }
76    return PyInt_FromLong(rc);
77}
78
79static PyObject *
80unicodedata_digit(PyObject *self, PyObject *args)
81{
82    PyUnicodeObject *v;
83    PyObject *defobj = NULL;
84    long rc;
85
86    if (!PyArg_ParseTuple(args, "O!|O:digit",
87			  &PyUnicode_Type, &v, &defobj))
88        return NULL;
89    if (PyUnicode_GET_SIZE(v) != 1) {
90	PyErr_SetString(PyExc_TypeError,
91			"need a single Unicode character as parameter");
92        return NULL;
93    }
94    rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
95    if (rc < 0) {
96	if (defobj == NULL) {
97	    PyErr_SetString(PyExc_ValueError,
98			    "not a digit");
99            return NULL;
100	}
101	else {
102	    Py_INCREF(defobj);
103	    return defobj;
104	}
105    }
106    return PyInt_FromLong(rc);
107}
108
109static PyObject *
110unicodedata_numeric(PyObject *self, PyObject *args)
111{
112    PyUnicodeObject *v;
113    PyObject *defobj = NULL;
114    double rc;
115
116    if (!PyArg_ParseTuple(args, "O!|O:numeric",
117			  &PyUnicode_Type, &v, &defobj))
118        return NULL;
119    if (PyUnicode_GET_SIZE(v) != 1) {
120	PyErr_SetString(PyExc_TypeError,
121			"need a single Unicode character as parameter");
122	return NULL;
123    }
124    rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
125    if (rc < 0) {
126	if (defobj == NULL) {
127	    PyErr_SetString(PyExc_ValueError,
128			    "not a numeric character");
129	    return NULL;
130	}
131	else {
132	    Py_INCREF(defobj);
133	    return defobj;
134	}
135    }
136    return PyFloat_FromDouble(rc);
137}
138
139static PyObject *
140unicodedata_category(PyObject *self, PyObject *args)
141{
142    PyUnicodeObject *v;
143    int index;
144
145    if (!PyArg_ParseTuple(args, "O!:category",
146			  &PyUnicode_Type, &v))
147	return NULL;
148    if (PyUnicode_GET_SIZE(v) != 1) {
149	PyErr_SetString(PyExc_TypeError,
150			"need a single Unicode character as parameter");
151	return NULL;
152    }
153    index = (int) getrecord(v)->category;
154    return PyString_FromString(_PyUnicode_CategoryNames[index]);
155}
156
157static PyObject *
158unicodedata_bidirectional(PyObject *self, PyObject *args)
159{
160    PyUnicodeObject *v;
161    int index;
162
163    if (!PyArg_ParseTuple(args, "O!:bidirectional",
164			  &PyUnicode_Type, &v))
165	return NULL;
166    if (PyUnicode_GET_SIZE(v) != 1) {
167	PyErr_SetString(PyExc_TypeError,
168			"need a single Unicode character as parameter");
169	return NULL;
170    }
171    index = (int) getrecord(v)->bidirectional;
172    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
173}
174
175static PyObject *
176unicodedata_combining(PyObject *self, PyObject *args)
177{
178    PyUnicodeObject *v;
179
180    if (!PyArg_ParseTuple(args, "O!:combining",
181			  &PyUnicode_Type, &v))
182	return NULL;
183    if (PyUnicode_GET_SIZE(v) != 1) {
184	PyErr_SetString(PyExc_TypeError,
185			"need a single Unicode character as parameter");
186	return NULL;
187    }
188    return PyInt_FromLong((int) getrecord(v)->combining);
189}
190
191static PyObject *
192unicodedata_mirrored(PyObject *self, PyObject *args)
193{
194    PyUnicodeObject *v;
195
196    if (!PyArg_ParseTuple(args, "O!:mirrored",
197			  &PyUnicode_Type, &v))
198	return NULL;
199    if (PyUnicode_GET_SIZE(v) != 1) {
200	PyErr_SetString(PyExc_TypeError,
201			"need a single Unicode character as parameter");
202	return NULL;
203    }
204    return PyInt_FromLong((int) getrecord(v)->mirrored);
205}
206
207static PyObject *
208unicodedata_decomposition(PyObject *self, PyObject *args)
209{
210    PyUnicodeObject *v;
211    char decomp[256];
212    int code, index, count, i;
213
214    if (!PyArg_ParseTuple(args, "O!:decomposition",
215			  &PyUnicode_Type, &v))
216	return NULL;
217    if (PyUnicode_GET_SIZE(v) != 1) {
218	PyErr_SetString(PyExc_TypeError,
219			"need a single Unicode character as parameter");
220	return NULL;
221    }
222
223    code = (int) *PyUnicode_AS_UNICODE(v);
224
225    if (code < 0 || code >= 65536)
226        index = 0;
227    else {
228        index = decomp_index1[(code>>DECOMP_SHIFT)];
229        index = decomp_index2[(index<<DECOMP_SHIFT)+
230                             (code&((1<<DECOMP_SHIFT)-1))];
231    }
232
233    /* high byte is of hex bytes (usually one or two), low byte
234       is prefix code (from*/
235    count = decomp_data[index] >> 8;
236
237    /* XXX: could allocate the PyString up front instead
238       (strlen(prefix) + 5 * count + 1 bytes) */
239
240    /* copy prefix */
241    i = strlen(decomp_prefix[decomp_data[index] & 255]);
242    memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
243
244    while (count-- > 0) {
245        if (i)
246            decomp[i++] = ' ';
247        sprintf(decomp + i, "%04X", decomp_data[++index]);
248        i += strlen(decomp + i);
249    }
250
251    decomp[i] = '\0';
252
253    return PyString_FromString(decomp);
254}
255
256/* XXX Add doc strings. */
257
258static PyMethodDef unicodedata_functions[] = {
259    {"decimal",		unicodedata_decimal,			1},
260    {"digit",		unicodedata_digit,			1},
261    {"numeric",		unicodedata_numeric,			1},
262    {"category",	unicodedata_category,			1},
263    {"bidirectional",	unicodedata_bidirectional,		1},
264    {"combining",	unicodedata_combining,			1},
265    {"mirrored",	unicodedata_mirrored,			1},
266    {"decomposition",	unicodedata_decomposition,		1},
267    {NULL, NULL}		/* sentinel */
268};
269
270DL_EXPORT(void)
271initunicodedata(void)
272{
273    Py_InitModule("unicodedata", unicodedata_functions);
274}
275