unicodedata.c revision f585bef5043972b042cb324e75c82ee07fd56fa2
1/* ------------------------------------------------------------------------
2
3   unicodedata -- Provides access to the Unicode 3.0 data base.
4
5   Data was extracted from the Unicode 3.0 UnicodeData.txt file.
6
7   Written by Marc-Andre Lemburg (mal@lemburg.com).
8   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9
10   Copyright (c) Corporation for National Research Initiatives.
11
12   ------------------------------------------------------------------------ */
13
14#include "Python.h"
15#include "ucnhash.h"
16
17/* character properties */
18
/* One record of per-character properties.  _getrecord() returns a
   pointer to an entry of this type from _PyUnicode_Database_Records.
   Every field is a single unsigned byte. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;

    /* combining class value 0 - 255 */
    const unsigned char combining;

    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;

    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;
27
28/* data file generated by Tools/unicode/makeunicodedata.py */
29#include "unicodedata_db.h"
30
31static const _PyUnicode_DatabaseRecord*
32_getrecord(PyUnicodeObject* v)
33{
34    int code;
35    int index;
36
37    code = (int) *PyUnicode_AS_UNICODE(v);
38
39    if (code < 0 || code >= 65536)
40        index = 0;
41    else {
42        index = index1[(code>>SHIFT)];
43        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
44    }
45
46    return &_PyUnicode_Database_Records[index];
47}
48
49/* --- Module API --------------------------------------------------------- */
50
51static PyObject *
52unicodedata_decimal(PyObject *self, PyObject *args)
53{
54    PyUnicodeObject *v;
55    PyObject *defobj = NULL;
56    long rc;
57
58    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
59        return NULL;
60    if (PyUnicode_GET_SIZE(v) != 1) {
61	PyErr_SetString(PyExc_TypeError,
62			"need a single Unicode character as parameter");
63        return NULL;
64    }
65    rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
66    if (rc < 0) {
67	if (defobj == NULL) {
68	    PyErr_SetString(PyExc_ValueError,
69			    "not a decimal");
70            return NULL;
71	}
72	else {
73	    Py_INCREF(defobj);
74	    return defobj;
75	}
76    }
77    return PyInt_FromLong(rc);
78}
79
80static PyObject *
81unicodedata_digit(PyObject *self, PyObject *args)
82{
83    PyUnicodeObject *v;
84    PyObject *defobj = NULL;
85    long rc;
86
87    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
88        return NULL;
89    if (PyUnicode_GET_SIZE(v) != 1) {
90	PyErr_SetString(PyExc_TypeError,
91			"need a single Unicode character as parameter");
92        return NULL;
93    }
94    rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
95    if (rc < 0) {
96	if (defobj == NULL) {
97	    PyErr_SetString(PyExc_ValueError, "not a digit");
98            return NULL;
99	}
100	else {
101	    Py_INCREF(defobj);
102	    return defobj;
103	}
104    }
105    return PyInt_FromLong(rc);
106}
107
108static PyObject *
109unicodedata_numeric(PyObject *self, PyObject *args)
110{
111    PyUnicodeObject *v;
112    PyObject *defobj = NULL;
113    double rc;
114
115    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
116        return NULL;
117    if (PyUnicode_GET_SIZE(v) != 1) {
118	PyErr_SetString(PyExc_TypeError,
119			"need a single Unicode character as parameter");
120	return NULL;
121    }
122    rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
123    if (rc < 0) {
124	if (defobj == NULL) {
125	    PyErr_SetString(PyExc_ValueError, "not a numeric character");
126	    return NULL;
127	}
128	else {
129	    Py_INCREF(defobj);
130	    return defobj;
131	}
132    }
133    return PyFloat_FromDouble(rc);
134}
135
136static PyObject *
137unicodedata_category(PyObject *self, PyObject *args)
138{
139    PyUnicodeObject *v;
140    int index;
141
142    if (!PyArg_ParseTuple(args, "O!:category",
143			  &PyUnicode_Type, &v))
144	return NULL;
145    if (PyUnicode_GET_SIZE(v) != 1) {
146	PyErr_SetString(PyExc_TypeError,
147			"need a single Unicode character as parameter");
148	return NULL;
149    }
150    index = (int) _getrecord(v)->category;
151    return PyString_FromString(_PyUnicode_CategoryNames[index]);
152}
153
154static PyObject *
155unicodedata_bidirectional(PyObject *self, PyObject *args)
156{
157    PyUnicodeObject *v;
158    int index;
159
160    if (!PyArg_ParseTuple(args, "O!:bidirectional",
161			  &PyUnicode_Type, &v))
162	return NULL;
163    if (PyUnicode_GET_SIZE(v) != 1) {
164	PyErr_SetString(PyExc_TypeError,
165			"need a single Unicode character as parameter");
166	return NULL;
167    }
168    index = (int) _getrecord(v)->bidirectional;
169    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
170}
171
172static PyObject *
173unicodedata_combining(PyObject *self, PyObject *args)
174{
175    PyUnicodeObject *v;
176
177    if (!PyArg_ParseTuple(args, "O!:combining",
178			  &PyUnicode_Type, &v))
179	return NULL;
180    if (PyUnicode_GET_SIZE(v) != 1) {
181	PyErr_SetString(PyExc_TypeError,
182			"need a single Unicode character as parameter");
183	return NULL;
184    }
185    return PyInt_FromLong((int) _getrecord(v)->combining);
186}
187
188static PyObject *
189unicodedata_mirrored(PyObject *self, PyObject *args)
190{
191    PyUnicodeObject *v;
192
193    if (!PyArg_ParseTuple(args, "O!:mirrored",
194			  &PyUnicode_Type, &v))
195	return NULL;
196    if (PyUnicode_GET_SIZE(v) != 1) {
197	PyErr_SetString(PyExc_TypeError,
198			"need a single Unicode character as parameter");
199	return NULL;
200    }
201    return PyInt_FromLong((int) _getrecord(v)->mirrored);
202}
203
204static PyObject *
205unicodedata_decomposition(PyObject *self, PyObject *args)
206{
207    PyUnicodeObject *v;
208    char decomp[256];
209    int code, index, count, i;
210
211    if (!PyArg_ParseTuple(args, "O!:decomposition",
212			  &PyUnicode_Type, &v))
213	return NULL;
214    if (PyUnicode_GET_SIZE(v) != 1) {
215	PyErr_SetString(PyExc_TypeError,
216			"need a single Unicode character as parameter");
217	return NULL;
218    }
219
220    code = (int) *PyUnicode_AS_UNICODE(v);
221
222    if (code < 0 || code >= 65536)
223        index = 0;
224    else {
225        index = decomp_index1[(code>>DECOMP_SHIFT)];
226        index = decomp_index2[(index<<DECOMP_SHIFT)+
227                             (code&((1<<DECOMP_SHIFT)-1))];
228    }
229
230    /* high byte is of hex bytes (usually one or two), low byte
231       is prefix code (from*/
232    count = decomp_data[index] >> 8;
233
234    /* XXX: could allocate the PyString up front instead
235       (strlen(prefix) + 5 * count + 1 bytes) */
236
237    /* copy prefix */
238    i = strlen(decomp_prefix[decomp_data[index] & 255]);
239    memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
240
241    while (count-- > 0) {
242        if (i)
243            decomp[i++] = ' ';
244        sprintf(decomp + i, "%04X", decomp_data[++index]);
245        i += strlen(decomp + i);
246    }
247
248    decomp[i] = '\0';
249
250    return PyString_FromString(decomp);
251}
252
253/* -------------------------------------------------------------------- */
254/* unicode character name tables */
255
256/* data file generated by Tools/unicode/makeunicodedata.py */
257#include "unicodename_db.h"
258
259/* -------------------------------------------------------------------- */
260/* database code (cut and pasted from the unidb package) */
261
/* Compute the case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and folded into the running hash as
   h = h * scale + byte.  Whenever any of bits 24..31 of the hash
   become set, that byte is XOR-ed back into the low bits and the hash
   is truncated to 24 bits.

   s      bytes to hash (need not be NUL-terminated)
   len    number of bytes to hash; a value <= 0 yields 0
   scale  multiplier applied to the running hash before each byte

   Returns the hash value. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* toupper() is only defined for EOF and values representable
           as unsigned char; convert first so that bytes >= 0x80 are
           not passed as negative ints where plain char is signed */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
276
277static int
278_getname(Py_UCS4 code, char* buffer, int buflen)
279{
280    int offset;
281    int i;
282    int word;
283    unsigned char* w;
284
285    if (code < 0 || code >= 65536)
286        return 0;
287
288    /* get offset into phrasebook */
289    offset = phrasebook_offset1[(code>>phrasebook_shift)];
290    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
291                               (code&((1<<phrasebook_shift)-1))];
292    if (!offset)
293        return 0;
294
295    i = 0;
296
297    for (;;) {
298        /* get word index */
299        word = phrasebook[offset] - phrasebook_short;
300        if (word >= 0) {
301            word = (word << 8) + phrasebook[offset+1];
302            offset += 2;
303        } else
304            word = phrasebook[offset++];
305        if (i) {
306            if (i > buflen)
307                return 0; /* buffer overflow */
308            buffer[i++] = ' ';
309        }
310        /* copy word string from lexicon.  the last character in the
311           word has bit 7 set.  the last word in a string ends with
312           0x80 */
313        w = lexicon + lexicon_offset[word];
314        while (*w < 128) {
315            if (i >= buflen)
316                return 0; /* buffer overflow */
317            buffer[i++] = *w++;
318        }
319        if (i >= buflen)
320            return 0; /* buffer overflow */
321        buffer[i++] = *w & 127;
322        if (*w == 128)
323            break; /* end of word */
324    }
325
326    return 1;
327}
328
329static int
330_cmpname(int code, const char* name, int namelen)
331{
332    /* check if code corresponds to the given name */
333    int i;
334    char buffer[NAME_MAXLEN];
335    if (!_getname(code, buffer, sizeof(buffer)))
336        return 0;
337    for (i = 0; i < namelen; i++) {
338        if (toupper(name[i]) != buffer[i])
339            return 0;
340    }
341    return buffer[namelen] == '\0';
342}
343
344static int
345_getcode(const char* name, int namelen, Py_UCS4* code)
346{
347    unsigned int h, v;
348    unsigned int mask = code_size-1;
349    unsigned int i, incr;
350
351    /* the following is the same as python's dictionary lookup, with
352       only minor changes.  see the makeunicodedata script for more
353       details */
354
355    h = (unsigned int) _gethash(name, namelen, code_magic);
356    i = (~h) & mask;
357    v = code_hash[i];
358    if (!v)
359        return 0;
360    if (_cmpname(v, name, namelen)) {
361        *code = v;
362        return 1;
363    }
364    incr = (h ^ (h >> 3)) & mask;
365    if (!incr)
366        incr = mask;
367    for (;;) {
368        i = (i + incr) & mask;
369        v = code_hash[i];
370        if (!v)
371            return 0;
372        if (_cmpname(v, name, namelen)) {
373            *code = v;
374            return 1;
375        }
376        incr = incr << 1;
377        if (incr > mask)
378            incr = incr ^ code_poly;
379    }
380}
381
382static const _PyUnicode_Name_CAPI hashAPI =
383{
384    sizeof(_PyUnicode_Name_CAPI),
385    _getname,
386    _getcode
387};
388
389/* -------------------------------------------------------------------- */
390/* Python bindings */
391
392static PyObject *
393unicodedata_name(PyObject* self, PyObject* args)
394{
395    char name[NAME_MAXLEN];
396
397    PyUnicodeObject* v;
398    PyObject* defobj = NULL;
399    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
400        return NULL;
401
402    if (PyUnicode_GET_SIZE(v) != 1) {
403	PyErr_SetString(PyExc_TypeError,
404			"need a single Unicode character as parameter");
405	return NULL;
406    }
407
408    if (!_getname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
409                             name, sizeof(name))) {
410	if (defobj == NULL) {
411	    PyErr_SetString(PyExc_ValueError, "no such name");
412            return NULL;
413	}
414	else {
415	    Py_INCREF(defobj);
416	    return defobj;
417	}
418    }
419
420    return Py_BuildValue("s", name);
421}
422
423static PyObject *
424unicodedata_lookup(PyObject* self, PyObject* args)
425{
426    Py_UCS4 code;
427    Py_UNICODE str[1];
428
429    char* name;
430    int namelen;
431    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
432        return NULL;
433
434    if (!_getcode(name, namelen, &code)) {
435        PyErr_SetString(PyExc_KeyError, "undefined character name");
436        return NULL;
437    }
438
439    str[0] = (Py_UNICODE) code;
440    return PyUnicode_FromUnicode(str, 1);
441}
442
443/* XXX Add doc strings. */
444
445static PyMethodDef unicodedata_functions[] = {
446    {"decimal", unicodedata_decimal, METH_VARARGS},
447    {"digit", unicodedata_digit, METH_VARARGS},
448    {"numeric", unicodedata_numeric, METH_VARARGS},
449    {"category", unicodedata_category, METH_VARARGS},
450    {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
451    {"combining", unicodedata_combining, METH_VARARGS},
452    {"mirrored", unicodedata_mirrored, METH_VARARGS},
453    {"decomposition",unicodedata_decomposition, METH_VARARGS},
454    {"name", unicodedata_name, METH_VARARGS},
455    {"lookup", unicodedata_lookup, METH_VARARGS},
456    {NULL, NULL}		/* sentinel */
457};
458
459static char *unicodedata_docstring = "unicode character database";
460
461DL_EXPORT(void)
462initunicodedata(void)
463{
464    PyObject *m, *d, *v;
465
466    m = Py_InitModule3(
467        "unicodedata", unicodedata_functions, unicodedata_docstring);
468    if (!m)
469        return;
470
471    d = PyModule_GetDict(m);
472    if (!d)
473        return;
474
475    /* Export C API */
476    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
477    if (v != NULL) {
478        PyDict_SetItemString(d, "ucnhash_CAPI", v);
479        Py_DECREF(v);
480    }
481}
482