/* unicodedata.c revision f585bef5043972b042cb324e75c82ee07fd56fa2 */
1/* ------------------------------------------------------------------------ 2 3 unicodedata -- Provides access to the Unicode 3.0 data base. 4 5 Data was extracted from the Unicode 3.0 UnicodeData.txt file. 6 7 Written by Marc-Andre Lemburg (mal@lemburg.com). 8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) 9 10 Copyright (c) Corporation for National Research Initiatives. 11 12 ------------------------------------------------------------------------ */ 13 14#include "Python.h" 15#include "ucnhash.h" 16 17/* character properties */ 18 19typedef struct { 20 const unsigned char category; /* index into 21 _PyUnicode_CategoryNames */ 22 const unsigned char combining; /* combining class value 0 - 255 */ 23 const unsigned char bidirectional; /* index into 24 _PyUnicode_BidirectionalNames */ 25 const unsigned char mirrored; /* true if mirrored in bidir mode */ 26} _PyUnicode_DatabaseRecord; 27 28/* data file generated by Tools/unicode/makeunicodedata.py */ 29#include "unicodedata_db.h" 30 31static const _PyUnicode_DatabaseRecord* 32_getrecord(PyUnicodeObject* v) 33{ 34 int code; 35 int index; 36 37 code = (int) *PyUnicode_AS_UNICODE(v); 38 39 if (code < 0 || code >= 65536) 40 index = 0; 41 else { 42 index = index1[(code>>SHIFT)]; 43 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; 44 } 45 46 return &_PyUnicode_Database_Records[index]; 47} 48 49/* --- Module API --------------------------------------------------------- */ 50 51static PyObject * 52unicodedata_decimal(PyObject *self, PyObject *args) 53{ 54 PyUnicodeObject *v; 55 PyObject *defobj = NULL; 56 long rc; 57 58 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj)) 59 return NULL; 60 if (PyUnicode_GET_SIZE(v) != 1) { 61 PyErr_SetString(PyExc_TypeError, 62 "need a single Unicode character as parameter"); 63 return NULL; 64 } 65 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v)); 66 if (rc < 0) { 67 if (defobj == NULL) { 68 PyErr_SetString(PyExc_ValueError, 69 "not a 
decimal"); 70 return NULL; 71 } 72 else { 73 Py_INCREF(defobj); 74 return defobj; 75 } 76 } 77 return PyInt_FromLong(rc); 78} 79 80static PyObject * 81unicodedata_digit(PyObject *self, PyObject *args) 82{ 83 PyUnicodeObject *v; 84 PyObject *defobj = NULL; 85 long rc; 86 87 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj)) 88 return NULL; 89 if (PyUnicode_GET_SIZE(v) != 1) { 90 PyErr_SetString(PyExc_TypeError, 91 "need a single Unicode character as parameter"); 92 return NULL; 93 } 94 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v)); 95 if (rc < 0) { 96 if (defobj == NULL) { 97 PyErr_SetString(PyExc_ValueError, "not a digit"); 98 return NULL; 99 } 100 else { 101 Py_INCREF(defobj); 102 return defobj; 103 } 104 } 105 return PyInt_FromLong(rc); 106} 107 108static PyObject * 109unicodedata_numeric(PyObject *self, PyObject *args) 110{ 111 PyUnicodeObject *v; 112 PyObject *defobj = NULL; 113 double rc; 114 115 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj)) 116 return NULL; 117 if (PyUnicode_GET_SIZE(v) != 1) { 118 PyErr_SetString(PyExc_TypeError, 119 "need a single Unicode character as parameter"); 120 return NULL; 121 } 122 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v)); 123 if (rc < 0) { 124 if (defobj == NULL) { 125 PyErr_SetString(PyExc_ValueError, "not a numeric character"); 126 return NULL; 127 } 128 else { 129 Py_INCREF(defobj); 130 return defobj; 131 } 132 } 133 return PyFloat_FromDouble(rc); 134} 135 136static PyObject * 137unicodedata_category(PyObject *self, PyObject *args) 138{ 139 PyUnicodeObject *v; 140 int index; 141 142 if (!PyArg_ParseTuple(args, "O!:category", 143 &PyUnicode_Type, &v)) 144 return NULL; 145 if (PyUnicode_GET_SIZE(v) != 1) { 146 PyErr_SetString(PyExc_TypeError, 147 "need a single Unicode character as parameter"); 148 return NULL; 149 } 150 index = (int) _getrecord(v)->category; 151 return PyString_FromString(_PyUnicode_CategoryNames[index]); 152} 153 154static PyObject * 
155unicodedata_bidirectional(PyObject *self, PyObject *args) 156{ 157 PyUnicodeObject *v; 158 int index; 159 160 if (!PyArg_ParseTuple(args, "O!:bidirectional", 161 &PyUnicode_Type, &v)) 162 return NULL; 163 if (PyUnicode_GET_SIZE(v) != 1) { 164 PyErr_SetString(PyExc_TypeError, 165 "need a single Unicode character as parameter"); 166 return NULL; 167 } 168 index = (int) _getrecord(v)->bidirectional; 169 return PyString_FromString(_PyUnicode_BidirectionalNames[index]); 170} 171 172static PyObject * 173unicodedata_combining(PyObject *self, PyObject *args) 174{ 175 PyUnicodeObject *v; 176 177 if (!PyArg_ParseTuple(args, "O!:combining", 178 &PyUnicode_Type, &v)) 179 return NULL; 180 if (PyUnicode_GET_SIZE(v) != 1) { 181 PyErr_SetString(PyExc_TypeError, 182 "need a single Unicode character as parameter"); 183 return NULL; 184 } 185 return PyInt_FromLong((int) _getrecord(v)->combining); 186} 187 188static PyObject * 189unicodedata_mirrored(PyObject *self, PyObject *args) 190{ 191 PyUnicodeObject *v; 192 193 if (!PyArg_ParseTuple(args, "O!:mirrored", 194 &PyUnicode_Type, &v)) 195 return NULL; 196 if (PyUnicode_GET_SIZE(v) != 1) { 197 PyErr_SetString(PyExc_TypeError, 198 "need a single Unicode character as parameter"); 199 return NULL; 200 } 201 return PyInt_FromLong((int) _getrecord(v)->mirrored); 202} 203 204static PyObject * 205unicodedata_decomposition(PyObject *self, PyObject *args) 206{ 207 PyUnicodeObject *v; 208 char decomp[256]; 209 int code, index, count, i; 210 211 if (!PyArg_ParseTuple(args, "O!:decomposition", 212 &PyUnicode_Type, &v)) 213 return NULL; 214 if (PyUnicode_GET_SIZE(v) != 1) { 215 PyErr_SetString(PyExc_TypeError, 216 "need a single Unicode character as parameter"); 217 return NULL; 218 } 219 220 code = (int) *PyUnicode_AS_UNICODE(v); 221 222 if (code < 0 || code >= 65536) 223 index = 0; 224 else { 225 index = decomp_index1[(code>>DECOMP_SHIFT)]; 226 index = decomp_index2[(index<<DECOMP_SHIFT)+ 227 (code&((1<<DECOMP_SHIFT)-1))]; 228 } 229 230 /* 
high byte is of hex bytes (usually one or two), low byte 231 is prefix code (from*/ 232 count = decomp_data[index] >> 8; 233 234 /* XXX: could allocate the PyString up front instead 235 (strlen(prefix) + 5 * count + 1 bytes) */ 236 237 /* copy prefix */ 238 i = strlen(decomp_prefix[decomp_data[index] & 255]); 239 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i); 240 241 while (count-- > 0) { 242 if (i) 243 decomp[i++] = ' '; 244 sprintf(decomp + i, "%04X", decomp_data[++index]); 245 i += strlen(decomp + i); 246 } 247 248 decomp[i] = '\0'; 249 250 return PyString_FromString(decomp); 251} 252 253/* -------------------------------------------------------------------- */ 254/* unicode character name tables */ 255 256/* data file generated by Tools/unicode/makeunicodedata.py */ 257#include "unicodename_db.h" 258 259/* -------------------------------------------------------------------- */ 260/* database code (cut and pasted from the unidb package) */ 261 262static unsigned long 263_gethash(const char *s, int len, int scale) 264{ 265 int i; 266 unsigned long h = 0; 267 unsigned long ix; 268 for (i = 0; i < len; i++) { 269 h = (h * scale) + (unsigned char) toupper(s[i]); 270 ix = h & 0xff000000; 271 if (ix) 272 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff; 273 } 274 return h; 275} 276 277static int 278_getname(Py_UCS4 code, char* buffer, int buflen) 279{ 280 int offset; 281 int i; 282 int word; 283 unsigned char* w; 284 285 if (code < 0 || code >= 65536) 286 return 0; 287 288 /* get offset into phrasebook */ 289 offset = phrasebook_offset1[(code>>phrasebook_shift)]; 290 offset = phrasebook_offset2[(offset<<phrasebook_shift) + 291 (code&((1<<phrasebook_shift)-1))]; 292 if (!offset) 293 return 0; 294 295 i = 0; 296 297 for (;;) { 298 /* get word index */ 299 word = phrasebook[offset] - phrasebook_short; 300 if (word >= 0) { 301 word = (word << 8) + phrasebook[offset+1]; 302 offset += 2; 303 } else 304 word = phrasebook[offset++]; 305 if (i) { 306 if (i > buflen) 307 
return 0; /* buffer overflow */ 308 buffer[i++] = ' '; 309 } 310 /* copy word string from lexicon. the last character in the 311 word has bit 7 set. the last word in a string ends with 312 0x80 */ 313 w = lexicon + lexicon_offset[word]; 314 while (*w < 128) { 315 if (i >= buflen) 316 return 0; /* buffer overflow */ 317 buffer[i++] = *w++; 318 } 319 if (i >= buflen) 320 return 0; /* buffer overflow */ 321 buffer[i++] = *w & 127; 322 if (*w == 128) 323 break; /* end of word */ 324 } 325 326 return 1; 327} 328 329static int 330_cmpname(int code, const char* name, int namelen) 331{ 332 /* check if code corresponds to the given name */ 333 int i; 334 char buffer[NAME_MAXLEN]; 335 if (!_getname(code, buffer, sizeof(buffer))) 336 return 0; 337 for (i = 0; i < namelen; i++) { 338 if (toupper(name[i]) != buffer[i]) 339 return 0; 340 } 341 return buffer[namelen] == '\0'; 342} 343 344static int 345_getcode(const char* name, int namelen, Py_UCS4* code) 346{ 347 unsigned int h, v; 348 unsigned int mask = code_size-1; 349 unsigned int i, incr; 350 351 /* the following is the same as python's dictionary lookup, with 352 only minor changes. 
see the makeunicodedata script for more 353 details */ 354 355 h = (unsigned int) _gethash(name, namelen, code_magic); 356 i = (~h) & mask; 357 v = code_hash[i]; 358 if (!v) 359 return 0; 360 if (_cmpname(v, name, namelen)) { 361 *code = v; 362 return 1; 363 } 364 incr = (h ^ (h >> 3)) & mask; 365 if (!incr) 366 incr = mask; 367 for (;;) { 368 i = (i + incr) & mask; 369 v = code_hash[i]; 370 if (!v) 371 return 0; 372 if (_cmpname(v, name, namelen)) { 373 *code = v; 374 return 1; 375 } 376 incr = incr << 1; 377 if (incr > mask) 378 incr = incr ^ code_poly; 379 } 380} 381 382static const _PyUnicode_Name_CAPI hashAPI = 383{ 384 sizeof(_PyUnicode_Name_CAPI), 385 _getname, 386 _getcode 387}; 388 389/* -------------------------------------------------------------------- */ 390/* Python bindings */ 391 392static PyObject * 393unicodedata_name(PyObject* self, PyObject* args) 394{ 395 char name[NAME_MAXLEN]; 396 397 PyUnicodeObject* v; 398 PyObject* defobj = NULL; 399 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj)) 400 return NULL; 401 402 if (PyUnicode_GET_SIZE(v) != 1) { 403 PyErr_SetString(PyExc_TypeError, 404 "need a single Unicode character as parameter"); 405 return NULL; 406 } 407 408 if (!_getname((Py_UCS4) *PyUnicode_AS_UNICODE(v), 409 name, sizeof(name))) { 410 if (defobj == NULL) { 411 PyErr_SetString(PyExc_ValueError, "no such name"); 412 return NULL; 413 } 414 else { 415 Py_INCREF(defobj); 416 return defobj; 417 } 418 } 419 420 return Py_BuildValue("s", name); 421} 422 423static PyObject * 424unicodedata_lookup(PyObject* self, PyObject* args) 425{ 426 Py_UCS4 code; 427 Py_UNICODE str[1]; 428 429 char* name; 430 int namelen; 431 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) 432 return NULL; 433 434 if (!_getcode(name, namelen, &code)) { 435 PyErr_SetString(PyExc_KeyError, "undefined character name"); 436 return NULL; 437 } 438 439 str[0] = (Py_UNICODE) code; 440 return PyUnicode_FromUnicode(str, 1); 441} 442 443/* XXX Add doc 
strings. */ 444 445static PyMethodDef unicodedata_functions[] = { 446 {"decimal", unicodedata_decimal, METH_VARARGS}, 447 {"digit", unicodedata_digit, METH_VARARGS}, 448 {"numeric", unicodedata_numeric, METH_VARARGS}, 449 {"category", unicodedata_category, METH_VARARGS}, 450 {"bidirectional", unicodedata_bidirectional, METH_VARARGS}, 451 {"combining", unicodedata_combining, METH_VARARGS}, 452 {"mirrored", unicodedata_mirrored, METH_VARARGS}, 453 {"decomposition",unicodedata_decomposition, METH_VARARGS}, 454 {"name", unicodedata_name, METH_VARARGS}, 455 {"lookup", unicodedata_lookup, METH_VARARGS}, 456 {NULL, NULL} /* sentinel */ 457}; 458 459static char *unicodedata_docstring = "unicode character database"; 460 461DL_EXPORT(void) 462initunicodedata(void) 463{ 464 PyObject *m, *d, *v; 465 466 m = Py_InitModule3( 467 "unicodedata", unicodedata_functions, unicodedata_docstring); 468 if (!m) 469 return; 470 471 d = PyModule_GetDict(m); 472 if (!d) 473 return; 474 475 /* Export C API */ 476 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); 477 if (v != NULL) { 478 PyDict_SetItemString(d, "ucnhash_CAPI", v); 479 Py_DECREF(v); 480 } 481} 482