/* unicodedata.c revision 6fc2382883516434c5876e768fabd75a0c633d84 */
1/* ------------------------------------------------------------------------ 2 3 unicodedata -- Provides access to the Unicode 4.1 data base. 4 5 Data was extracted from the Unicode 4.1 UnicodeData.txt file. 6 7 Written by Marc-Andre Lemburg (mal@lemburg.com). 8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) 9 Modified by Martin v. L�wis (martin@v.loewis.de) 10 11 Copyright (c) Corporation for National Research Initiatives. 12 13 ------------------------------------------------------------------------ */ 14 15#include "Python.h" 16#include "ucnhash.h" 17#include "structmember.h" 18 19/* character properties */ 20 21typedef struct { 22 const unsigned char category; /* index into 23 _PyUnicode_CategoryNames */ 24 const unsigned char combining; /* combining class value 0 - 255 */ 25 const unsigned char bidirectional; /* index into 26 _PyUnicode_BidirectionalNames */ 27 const unsigned char mirrored; /* true if mirrored in bidir mode */ 28 const unsigned char east_asian_width; /* index into 29 _PyUnicode_EastAsianWidth */ 30} _PyUnicode_DatabaseRecord; 31 32typedef struct change_record { 33 /* sequence of fields should be the same as in merge_old_version */ 34 const unsigned char bidir_changed; 35 const unsigned char category_changed; 36 const unsigned char decimal_changed; 37 const int numeric_changed; 38} change_record; 39 40/* data file generated by Tools/unicode/makeunicodedata.py */ 41#include "unicodedata_db.h" 42 43static const _PyUnicode_DatabaseRecord* 44_getrecord_ex(Py_UCS4 code) 45{ 46 int index; 47 if (code >= 0x110000) 48 index = 0; 49 else { 50 index = index1[(code>>SHIFT)]; 51 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; 52 } 53 54 return &_PyUnicode_Database_Records[index]; 55} 56 57static const _PyUnicode_DatabaseRecord* 58_getrecord(PyUnicodeObject* v) 59{ 60 return _getrecord_ex(*PyUnicode_AS_UNICODE(v)); 61} 62 63/* ------------- Previous-version API ------------------------------------- */ 64typedef struct 
previous_version { 65 PyObject_HEAD 66 const char *name; 67 const change_record* (*getrecord)(Py_UCS4); 68 Py_UCS4 (*normalization)(Py_UCS4); 69} PreviousDBVersion; 70 71#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v)) 72 73static PyMemberDef DB_members[] = { 74 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY}, 75 {NULL} 76}; 77 78/* forward declaration */ 79static PyTypeObject UCD_Type; 80 81static PyObject* 82new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4), 83 Py_UCS4 (*normalization)(Py_UCS4)) 84{ 85 PreviousDBVersion *self; 86 self = PyObject_New(PreviousDBVersion, &UCD_Type); 87 if (self == NULL) 88 return NULL; 89 self->name = name; 90 self->getrecord = getrecord; 91 self->normalization = normalization; 92 return (PyObject*)self; 93} 94 95/* --- Module API --------------------------------------------------------- */ 96 97PyDoc_STRVAR(unicodedata_decimal__doc__, 98"decimal(unichr[, default])\n\ 99\n\ 100Returns the decimal value assigned to the Unicode character unichr\n\ 101as integer. 
If no such value is defined, default is returned, or, if\n\ 102not given, ValueError is raised."); 103 104static PyObject * 105unicodedata_decimal(PyObject *self, PyObject *args) 106{ 107 PyUnicodeObject *v; 108 PyObject *defobj = NULL; 109 int have_old = 0; 110 long rc; 111 112 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj)) 113 return NULL; 114 if (PyUnicode_GET_SIZE(v) != 1) { 115 PyErr_SetString(PyExc_TypeError, 116 "need a single Unicode character as parameter"); 117 return NULL; 118 } 119 120 if (self) { 121 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); 122 if (old->category_changed == 0) { 123 /* unassigned */ 124 have_old = 1; 125 rc = -1; 126 } 127 else if (old->decimal_changed != 0xFF) { 128 have_old = 1; 129 rc = old->decimal_changed; 130 } 131 } 132 133 if (!have_old) 134 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v)); 135 if (rc < 0) { 136 if (defobj == NULL) { 137 PyErr_SetString(PyExc_ValueError, 138 "not a decimal"); 139 return NULL; 140 } 141 else { 142 Py_INCREF(defobj); 143 return defobj; 144 } 145 } 146 return PyInt_FromLong(rc); 147} 148 149PyDoc_STRVAR(unicodedata_digit__doc__, 150"digit(unichr[, default])\n\ 151\n\ 152Returns the digit value assigned to the Unicode character unichr as\n\ 153integer. 
If no such value is defined, default is returned, or, if\n\ 154not given, ValueError is raised."); 155 156static PyObject * 157unicodedata_digit(PyObject *self, PyObject *args) 158{ 159 PyUnicodeObject *v; 160 PyObject *defobj = NULL; 161 long rc; 162 163 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj)) 164 return NULL; 165 if (PyUnicode_GET_SIZE(v) != 1) { 166 PyErr_SetString(PyExc_TypeError, 167 "need a single Unicode character as parameter"); 168 return NULL; 169 } 170 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v)); 171 if (rc < 0) { 172 if (defobj == NULL) { 173 PyErr_SetString(PyExc_ValueError, "not a digit"); 174 return NULL; 175 } 176 else { 177 Py_INCREF(defobj); 178 return defobj; 179 } 180 } 181 return PyInt_FromLong(rc); 182} 183 184PyDoc_STRVAR(unicodedata_numeric__doc__, 185"numeric(unichr[, default])\n\ 186\n\ 187Returns the numeric value assigned to the Unicode character unichr\n\ 188as float. If no such value is defined, default is returned, or, if\n\ 189not given, ValueError is raised."); 190 191static PyObject * 192unicodedata_numeric(PyObject *self, PyObject *args) 193{ 194 PyUnicodeObject *v; 195 PyObject *defobj = NULL; 196 int have_old = 0; 197 double rc; 198 199 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj)) 200 return NULL; 201 if (PyUnicode_GET_SIZE(v) != 1) { 202 PyErr_SetString(PyExc_TypeError, 203 "need a single Unicode character as parameter"); 204 return NULL; 205 } 206 207 if (self) { 208 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); 209 if (old->category_changed == 0) { 210 /* unassigned */ 211 have_old = 1; 212 rc = -1.0; 213 } 214 else if (old->decimal_changed != 0xFF) { 215 have_old = 1; 216 rc = old->decimal_changed; 217 } 218 } 219 220 if (!have_old) 221 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v)); 222 if (rc == -1.0) { 223 if (defobj == NULL) { 224 PyErr_SetString(PyExc_ValueError, "not a numeric character"); 225 return NULL; 226 } 227 
else { 228 Py_INCREF(defobj); 229 return defobj; 230 } 231 } 232 return PyFloat_FromDouble(rc); 233} 234 235PyDoc_STRVAR(unicodedata_category__doc__, 236"category(unichr)\n\ 237\n\ 238Returns the general category assigned to the Unicode character\n\ 239unichr as string."); 240 241static PyObject * 242unicodedata_category(PyObject *self, PyObject *args) 243{ 244 PyUnicodeObject *v; 245 int index; 246 247 if (!PyArg_ParseTuple(args, "O!:category", 248 &PyUnicode_Type, &v)) 249 return NULL; 250 if (PyUnicode_GET_SIZE(v) != 1) { 251 PyErr_SetString(PyExc_TypeError, 252 "need a single Unicode character as parameter"); 253 return NULL; 254 } 255 index = (int) _getrecord(v)->category; 256 if (self) { 257 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); 258 if (old->category_changed != 0xFF) 259 index = old->category_changed; 260 } 261 return PyString_FromString(_PyUnicode_CategoryNames[index]); 262} 263 264PyDoc_STRVAR(unicodedata_bidirectional__doc__, 265"bidirectional(unichr)\n\ 266\n\ 267Returns the bidirectional category assigned to the Unicode character\n\ 268unichr as string. 
If no such value is defined, an empty string is\n\ 269returned."); 270 271static PyObject * 272unicodedata_bidirectional(PyObject *self, PyObject *args) 273{ 274 PyUnicodeObject *v; 275 int index; 276 277 if (!PyArg_ParseTuple(args, "O!:bidirectional", 278 &PyUnicode_Type, &v)) 279 return NULL; 280 if (PyUnicode_GET_SIZE(v) != 1) { 281 PyErr_SetString(PyExc_TypeError, 282 "need a single Unicode character as parameter"); 283 return NULL; 284 } 285 index = (int) _getrecord(v)->bidirectional; 286 if (self) { 287 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); 288 if (old->category_changed == 0) 289 index = 0; /* unassigned */ 290 else if (old->bidir_changed != 0xFF) 291 index = old->bidir_changed; 292 } 293 return PyString_FromString(_PyUnicode_BidirectionalNames[index]); 294} 295 296PyDoc_STRVAR(unicodedata_combining__doc__, 297"combining(unichr)\n\ 298\n\ 299Returns the canonical combining class assigned to the Unicode\n\ 300character unichr as integer. Returns 0 if no combining class is\n\ 301defined."); 302 303static PyObject * 304unicodedata_combining(PyObject *self, PyObject *args) 305{ 306 PyUnicodeObject *v; 307 int index; 308 309 if (!PyArg_ParseTuple(args, "O!:combining", 310 &PyUnicode_Type, &v)) 311 return NULL; 312 if (PyUnicode_GET_SIZE(v) != 1) { 313 PyErr_SetString(PyExc_TypeError, 314 "need a single Unicode character as parameter"); 315 return NULL; 316 } 317 index = (int) _getrecord(v)->combining; 318 if (self) { 319 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); 320 if (old->category_changed == 0) 321 index = 0; /* unassigned */ 322 } 323 return PyInt_FromLong(index); 324} 325 326PyDoc_STRVAR(unicodedata_mirrored__doc__, 327"mirrored(unichr)\n\ 328\n\ 329Returns the mirrored property assigned to the Unicode character\n\ 330unichr as integer. 
Returns 1 if the character has been identified as\n\ 331a \"mirrored\" character in bidirectional text, 0 otherwise."); 332 333static PyObject * 334unicodedata_mirrored(PyObject *self, PyObject *args) 335{ 336 PyUnicodeObject *v; 337 int index; 338 339 if (!PyArg_ParseTuple(args, "O!:mirrored", 340 &PyUnicode_Type, &v)) 341 return NULL; 342 if (PyUnicode_GET_SIZE(v) != 1) { 343 PyErr_SetString(PyExc_TypeError, 344 "need a single Unicode character as parameter"); 345 return NULL; 346 } 347 index = (int) _getrecord(v)->mirrored; 348 if (self) { 349 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); 350 if (old->category_changed == 0) 351 index = 0; /* unassigned */ 352 } 353 return PyInt_FromLong(index); 354} 355 356PyDoc_STRVAR(unicodedata_east_asian_width__doc__, 357"east_asian_width(unichr)\n\ 358\n\ 359Returns the east asian width assigned to the Unicode character\n\ 360unichr as string."); 361 362static PyObject * 363unicodedata_east_asian_width(PyObject *self, PyObject *args) 364{ 365 PyUnicodeObject *v; 366 int index; 367 368 if (!PyArg_ParseTuple(args, "O!:east_asian_width", 369 &PyUnicode_Type, &v)) 370 return NULL; 371 if (PyUnicode_GET_SIZE(v) != 1) { 372 PyErr_SetString(PyExc_TypeError, 373 "need a single Unicode character as parameter"); 374 return NULL; 375 } 376 index = (int) _getrecord(v)->east_asian_width; 377 if (self) { 378 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); 379 if (old->category_changed == 0) 380 index = 0; /* unassigned */ 381 } 382 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]); 383} 384 385PyDoc_STRVAR(unicodedata_decomposition__doc__, 386"decomposition(unichr)\n\ 387\n\ 388Returns the character decomposition mapping assigned to the Unicode\n\ 389character unichr as string. 
An empty string is returned in case no\n\ 390such mapping is defined."); 391 392static PyObject * 393unicodedata_decomposition(PyObject *self, PyObject *args) 394{ 395 PyUnicodeObject *v; 396 char decomp[256]; 397 int code, index, count, i; 398 unsigned int prefix_index; 399 400 if (!PyArg_ParseTuple(args, "O!:decomposition", 401 &PyUnicode_Type, &v)) 402 return NULL; 403 if (PyUnicode_GET_SIZE(v) != 1) { 404 PyErr_SetString(PyExc_TypeError, 405 "need a single Unicode character as parameter"); 406 return NULL; 407 } 408 409 code = (int) *PyUnicode_AS_UNICODE(v); 410 411 if (self) { 412 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); 413 if (old->category_changed == 0) 414 return PyString_FromString(""); /* unassigned */ 415 } 416 417 if (code < 0 || code >= 0x110000) 418 index = 0; 419 else { 420 index = decomp_index1[(code>>DECOMP_SHIFT)]; 421 index = decomp_index2[(index<<DECOMP_SHIFT)+ 422 (code&((1<<DECOMP_SHIFT)-1))]; 423 } 424 425 /* high byte is number of hex bytes (usually one or two), low byte 426 is prefix code (from*/ 427 count = decomp_data[index] >> 8; 428 429 /* XXX: could allocate the PyString up front instead 430 (strlen(prefix) + 5 * count + 1 bytes) */ 431 432 /* Based on how index is calculated above and decomp_data is generated 433 from Tools/unicode/makeunicodedata.py, it should not be possible 434 to overflow decomp_prefix. 
*/ 435 prefix_index = decomp_data[index] & 255; 436 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix))); 437 438 /* copy prefix */ 439 i = strlen(decomp_prefix[prefix_index]); 440 memcpy(decomp, decomp_prefix[prefix_index], i); 441 442 while (count-- > 0) { 443 if (i) 444 decomp[i++] = ' '; 445 assert((size_t)i < sizeof(decomp)); 446 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X", 447 decomp_data[++index]); 448 i += strlen(decomp + i); 449 } 450 451 decomp[i] = '\0'; 452 453 return PyString_FromString(decomp); 454} 455 456static void 457get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count) 458{ 459 if (code >= 0x110000) { 460 *index = 0; 461 } else if (self && get_old_record(self, code)->category_changed==0) { 462 /* unassigned in old version */ 463 *index = 0; 464 } 465 else { 466 *index = decomp_index1[(code>>DECOMP_SHIFT)]; 467 *index = decomp_index2[(*index<<DECOMP_SHIFT)+ 468 (code&((1<<DECOMP_SHIFT)-1))]; 469 } 470 471 /* high byte is number of hex bytes (usually one or two), low byte 472 is prefix code (from*/ 473 *count = decomp_data[*index] >> 8; 474 *prefix = decomp_data[*index] & 255; 475 476 (*index)++; 477} 478 479#define SBase 0xAC00 480#define LBase 0x1100 481#define VBase 0x1161 482#define TBase 0x11A7 483#define LCount 19 484#define VCount 21 485#define TCount 28 486#define NCount (VCount*TCount) 487#define SCount (LCount*NCount) 488 489static PyObject* 490nfd_nfkd(PyObject *self, PyObject *input, int k) 491{ 492 PyObject *result; 493 Py_UNICODE *i, *end, *o; 494 /* Longest decomposition in Unicode 3.2: U+FDFA */ 495 Py_UNICODE stack[20]; 496 Py_ssize_t space, isize; 497 int index, prefix, count, stackptr; 498 unsigned char prev, cur; 499 500 stackptr = 0; 501 isize = PyUnicode_GET_SIZE(input); 502 /* Overallocate atmost 10 characters. */ 503 space = (isize > 10 ? 
10 : isize) + isize; 504 result = PyUnicode_FromUnicode(NULL, space); 505 if (!result) 506 return NULL; 507 i = PyUnicode_AS_UNICODE(input); 508 end = i + isize; 509 o = PyUnicode_AS_UNICODE(result); 510 511 while (i < end) { 512 stack[stackptr++] = *i++; 513 while(stackptr) { 514 Py_UNICODE code = stack[--stackptr]; 515 /* Hangul Decomposition adds three characters in 516 a single step, so we need atleast that much room. */ 517 if (space < 3) { 518 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10; 519 space += 10; 520 if (PyUnicode_Resize(&result, newsize) == -1) 521 return NULL; 522 o = PyUnicode_AS_UNICODE(result) + newsize - space; 523 } 524 /* Hangul Decomposition. */ 525 if (SBase <= code && code < (SBase+SCount)) { 526 int SIndex = code - SBase; 527 int L = LBase + SIndex / NCount; 528 int V = VBase + (SIndex % NCount) / TCount; 529 int T = TBase + SIndex % TCount; 530 *o++ = L; 531 *o++ = V; 532 space -= 2; 533 if (T != TBase) { 534 *o++ = T; 535 space --; 536 } 537 continue; 538 } 539 /* normalization changes */ 540 if (self) { 541 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code); 542 if (value != 0) { 543 stack[stackptr++] = value; 544 continue; 545 } 546 } 547 548 /* Other decompositions. */ 549 get_decomp_record(self, code, &index, &prefix, &count); 550 551 /* Copy character if it is not decomposable, or has a 552 compatibility decomposition, but we do NFD. */ 553 if (!count || (prefix && !k)) { 554 *o++ = code; 555 space--; 556 continue; 557 } 558 /* Copy decomposition onto the stack, in reverse 559 order. */ 560 while(count) { 561 code = decomp_data[index + (--count)]; 562 stack[stackptr++] = code; 563 } 564 } 565 } 566 567 /* Drop overallocation. Cannot fail. */ 568 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space); 569 570 /* Sort canonically. 
*/ 571 i = PyUnicode_AS_UNICODE(result); 572 prev = _getrecord_ex(*i)->combining; 573 end = i + PyUnicode_GET_SIZE(result); 574 for (i++; i < end; i++) { 575 cur = _getrecord_ex(*i)->combining; 576 if (prev == 0 || cur == 0 || prev <= cur) { 577 prev = cur; 578 continue; 579 } 580 /* Non-canonical order. Need to switch *i with previous. */ 581 o = i - 1; 582 while (1) { 583 Py_UNICODE tmp = o[1]; 584 o[1] = o[0]; 585 o[0] = tmp; 586 o--; 587 if (o < PyUnicode_AS_UNICODE(result)) 588 break; 589 prev = _getrecord_ex(*o)->combining; 590 if (prev == 0 || prev <= cur) 591 break; 592 } 593 prev = _getrecord_ex(*i)->combining; 594 } 595 return result; 596} 597 598static int 599find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code) 600{ 601 int index; 602 for (index = 0; nfc[index].start; index++) { 603 int start = nfc[index].start; 604 if (code < start) 605 return -1; 606 if (code <= start + nfc[index].count) { 607 int delta = code - start; 608 return nfc[index].index + delta; 609 } 610 } 611 return -1; 612} 613 614static PyObject* 615nfc_nfkc(PyObject *self, PyObject *input, int k) 616{ 617 PyObject *result; 618 Py_UNICODE *i, *i1, *o, *end; 619 int f,l,index,index1,comb; 620 Py_UNICODE code; 621 Py_UNICODE *skipped[20]; 622 int cskipped = 0; 623 624 result = nfd_nfkd(self, input, k); 625 if (!result) 626 return NULL; 627 628 /* We are going to modify result in-place. 629 If nfd_nfkd is changed to sometimes return the input, 630 this code needs to be reviewed. */ 631 assert(result != input); 632 633 i = PyUnicode_AS_UNICODE(result); 634 end = i + PyUnicode_GET_SIZE(result); 635 o = PyUnicode_AS_UNICODE(result); 636 637 again: 638 while (i < end) { 639 for (index = 0; index < cskipped; index++) { 640 if (skipped[index] == i) { 641 /* *i character is skipped. 642 Remove from list. */ 643 skipped[index] = skipped[cskipped-1]; 644 cskipped--; 645 i++; 646 goto again; /* continue while */ 647 } 648 } 649 /* Hangul Composition. 
We don't need to check for <LV,T> 650 pairs, since we always have decomposed data. */ 651 if (LBase <= *i && *i < (LBase+LCount) && 652 i + 1 < end && 653 VBase <= i[1] && i[1] <= (VBase+VCount)) { 654 int LIndex, VIndex; 655 LIndex = i[0] - LBase; 656 VIndex = i[1] - VBase; 657 code = SBase + (LIndex*VCount+VIndex)*TCount; 658 i+=2; 659 if (i < end && 660 TBase <= *i && *i <= (TBase+TCount)) { 661 code += *i-TBase; 662 i++; 663 } 664 *o++ = code; 665 continue; 666 } 667 668 f = find_nfc_index(self, nfc_first, *i); 669 if (f == -1) { 670 *o++ = *i++; 671 continue; 672 } 673 /* Find next unblocked character. */ 674 i1 = i+1; 675 comb = 0; 676 while (i1 < end) { 677 int comb1 = _getrecord_ex(*i1)->combining; 678 if (comb1 && comb == comb1) { 679 /* Character is blocked. */ 680 i1++; 681 continue; 682 } 683 l = find_nfc_index(self, nfc_last, *i1); 684 /* *i1 cannot be combined with *i. If *i1 685 is a starter, we don't need to look further. 686 Otherwise, record the combining class. */ 687 if (l == -1) { 688 not_combinable: 689 if (comb1 == 0) 690 break; 691 comb = comb1; 692 i1++; 693 continue; 694 } 695 index = f*TOTAL_LAST + l; 696 index1 = comp_index[index >> COMP_SHIFT]; 697 code = comp_data[(index1<<COMP_SHIFT)+ 698 (index&((1<<COMP_SHIFT)-1))]; 699 if (code == 0) 700 goto not_combinable; 701 702 /* Replace the original character. */ 703 *i = code; 704 /* Mark the second character unused. */ 705 skipped[cskipped++] = i1; 706 i1++; 707 f = find_nfc_index(self, nfc_first, *i); 708 if (f == -1) 709 break; 710 } 711 *o++ = *i++; 712 } 713 if (o != end) 714 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result)); 715 return result; 716} 717 718PyDoc_STRVAR(unicodedata_normalize__doc__, 719"normalize(form, unistr)\n\ 720\n\ 721Return the normal form 'form' for the Unicode string unistr. 
Valid\n\ 722values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'."); 723 724static PyObject* 725unicodedata_normalize(PyObject *self, PyObject *args) 726{ 727 char *form; 728 PyObject *input; 729 730 if(!PyArg_ParseTuple(args, "sO!:normalize", 731 &form, &PyUnicode_Type, &input)) 732 return NULL; 733 734 if (PyUnicode_GetSize(input) == 0) { 735 /* Special case empty input strings, since resizing 736 them later would cause internal errors. */ 737 Py_INCREF(input); 738 return input; 739 } 740 741 if (strcmp(form, "NFC") == 0) 742 return nfc_nfkc(self, input, 0); 743 if (strcmp(form, "NFKC") == 0) 744 return nfc_nfkc(self, input, 1); 745 if (strcmp(form, "NFD") == 0) 746 return nfd_nfkd(self, input, 0); 747 if (strcmp(form, "NFKD") == 0) 748 return nfd_nfkd(self, input, 1); 749 PyErr_SetString(PyExc_ValueError, "invalid normalization form"); 750 return NULL; 751} 752 753/* -------------------------------------------------------------------- */ 754/* unicode character name tables */ 755 756/* data file generated by Tools/unicode/makeunicodedata.py */ 757#include "unicodename_db.h" 758 759/* -------------------------------------------------------------------- */ 760/* database code (cut and pasted from the unidb package) */ 761 762static unsigned long 763_gethash(const char *s, int len, int scale) 764{ 765 int i; 766 unsigned long h = 0; 767 unsigned long ix; 768 for (i = 0; i < len; i++) { 769 h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i])); 770 ix = h & 0xff000000; 771 if (ix) 772 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff; 773 } 774 return h; 775} 776 777static char *hangul_syllables[][3] = { 778 { "G", "A", "" }, 779 { "GG", "AE", "G" }, 780 { "N", "YA", "GG" }, 781 { "D", "YAE", "GS" }, 782 { "DD", "EO", "N", }, 783 { "R", "E", "NJ" }, 784 { "M", "YEO", "NH" }, 785 { "B", "YE", "D" }, 786 { "BB", "O", "L" }, 787 { "S", "WA", "LG" }, 788 { "SS", "WAE", "LM" }, 789 { "", "OE", "LB" }, 790 { "J", "YO", "LS" }, 791 { "JJ", "U", "LT" }, 792 { "C", "WEO", "LP" 
}, 793 { "K", "WE", "LH" }, 794 { "T", "WI", "M" }, 795 { "P", "YU", "B" }, 796 { "H", "EU", "BS" }, 797 { 0, "YI", "S" }, 798 { 0, "I", "SS" }, 799 { 0, 0, "NG" }, 800 { 0, 0, "J" }, 801 { 0, 0, "C" }, 802 { 0, 0, "K" }, 803 { 0, 0, "T" }, 804 { 0, 0, "P" }, 805 { 0, 0, "H" } 806}; 807 808static int 809is_unified_ideograph(Py_UCS4 code) 810{ 811 return ( 812 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */ 813 (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */ 814 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */ 815} 816 817static int 818_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen) 819{ 820 int offset; 821 int i; 822 int word; 823 unsigned char* w; 824 825 if (code >= 0x110000) 826 return 0; 827 828 if (self) { 829 const change_record *old = get_old_record(self, code); 830 if (old->category_changed == 0) { 831 /* unassigned */ 832 return 0; 833 } 834 } 835 836 if (SBase <= code && code < SBase+SCount) { 837 /* Hangul syllable. */ 838 int SIndex = code - SBase; 839 int L = SIndex / NCount; 840 int V = (SIndex % NCount) / TCount; 841 int T = SIndex % TCount; 842 843 if (buflen < 27) 844 /* Worst case: HANGUL SYLLABLE <10chars>. 
*/ 845 return 0; 846 strcpy(buffer, "HANGUL SYLLABLE "); 847 buffer += 16; 848 strcpy(buffer, hangul_syllables[L][0]); 849 buffer += strlen(hangul_syllables[L][0]); 850 strcpy(buffer, hangul_syllables[V][1]); 851 buffer += strlen(hangul_syllables[V][1]); 852 strcpy(buffer, hangul_syllables[T][2]); 853 buffer += strlen(hangul_syllables[T][2]); 854 *buffer = '\0'; 855 return 1; 856 } 857 858 if (is_unified_ideograph(code)) { 859 if (buflen < 28) 860 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */ 861 return 0; 862 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code); 863 return 1; 864 } 865 866 /* get offset into phrasebook */ 867 offset = phrasebook_offset1[(code>>phrasebook_shift)]; 868 offset = phrasebook_offset2[(offset<<phrasebook_shift) + 869 (code&((1<<phrasebook_shift)-1))]; 870 if (!offset) 871 return 0; 872 873 i = 0; 874 875 for (;;) { 876 /* get word index */ 877 word = phrasebook[offset] - phrasebook_short; 878 if (word >= 0) { 879 word = (word << 8) + phrasebook[offset+1]; 880 offset += 2; 881 } else 882 word = phrasebook[offset++]; 883 if (i) { 884 if (i > buflen) 885 return 0; /* buffer overflow */ 886 buffer[i++] = ' '; 887 } 888 /* copy word string from lexicon. the last character in the 889 word has bit 7 set. 
the last word in a string ends with 890 0x80 */ 891 w = lexicon + lexicon_offset[word]; 892 while (*w < 128) { 893 if (i >= buflen) 894 return 0; /* buffer overflow */ 895 buffer[i++] = *w++; 896 } 897 if (i >= buflen) 898 return 0; /* buffer overflow */ 899 buffer[i++] = *w & 127; 900 if (*w == 128) 901 break; /* end of word */ 902 } 903 904 return 1; 905} 906 907static int 908_cmpname(PyObject *self, int code, const char* name, int namelen) 909{ 910 /* check if code corresponds to the given name */ 911 int i; 912 char buffer[NAME_MAXLEN]; 913 if (!_getucname(self, code, buffer, sizeof(buffer))) 914 return 0; 915 for (i = 0; i < namelen; i++) { 916 if (toupper(Py_CHARMASK(name[i])) != buffer[i]) 917 return 0; 918 } 919 return buffer[namelen] == '\0'; 920} 921 922static void 923find_syllable(const char *str, int *len, int *pos, int count, int column) 924{ 925 int i, len1; 926 *len = -1; 927 for (i = 0; i < count; i++) { 928 char *s = hangul_syllables[i][column]; 929 len1 = strlen(s); 930 if (len1 <= *len) 931 continue; 932 if (strncmp(str, s, len1) == 0) { 933 *len = len1; 934 *pos = i; 935 } 936 } 937 if (*len == -1) { 938 *len = 0; 939 } 940} 941 942static int 943_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code) 944{ 945 unsigned int h, v; 946 unsigned int mask = code_size-1; 947 unsigned int i, incr; 948 949 /* Check for hangul syllables. */ 950 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) { 951 int len, L = -1, V = -1, T = -1; 952 const char *pos = name + 16; 953 find_syllable(pos, &len, &L, LCount, 0); 954 pos += len; 955 find_syllable(pos, &len, &V, VCount, 1); 956 pos += len; 957 find_syllable(pos, &len, &T, TCount, 2); 958 pos += len; 959 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) { 960 *code = SBase + (L*VCount+V)*TCount + T; 961 return 1; 962 } 963 /* Otherwise, it's an illegal syllable name. */ 964 return 0; 965 } 966 967 /* Check for unified ideographs. 
*/ 968 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) { 969 /* Four or five hexdigits must follow. */ 970 v = 0; 971 name += 22; 972 namelen -= 22; 973 if (namelen != 4 && namelen != 5) 974 return 0; 975 while (namelen--) { 976 v *= 16; 977 if (*name >= '0' && *name <= '9') 978 v += *name - '0'; 979 else if (*name >= 'A' && *name <= 'F') 980 v += *name - 'A' + 10; 981 else 982 return 0; 983 name++; 984 } 985 if (!is_unified_ideograph(v)) 986 return 0; 987 *code = v; 988 return 1; 989 } 990 991 /* the following is the same as python's dictionary lookup, with 992 only minor changes. see the makeunicodedata script for more 993 details */ 994 995 h = (unsigned int) _gethash(name, namelen, code_magic); 996 i = (~h) & mask; 997 v = code_hash[i]; 998 if (!v) 999 return 0; 1000 if (_cmpname(self, v, name, namelen)) { 1001 *code = v; 1002 return 1; 1003 } 1004 incr = (h ^ (h >> 3)) & mask; 1005 if (!incr) 1006 incr = mask; 1007 for (;;) { 1008 i = (i + incr) & mask; 1009 v = code_hash[i]; 1010 if (!v) 1011 return 0; 1012 if (_cmpname(self, v, name, namelen)) { 1013 *code = v; 1014 return 1; 1015 } 1016 incr = incr << 1; 1017 if (incr > mask) 1018 incr = incr ^ code_poly; 1019 } 1020} 1021 1022static const _PyUnicode_Name_CAPI hashAPI = 1023{ 1024 sizeof(_PyUnicode_Name_CAPI), 1025 _getucname, 1026 _getcode 1027}; 1028 1029/* -------------------------------------------------------------------- */ 1030/* Python bindings */ 1031 1032PyDoc_STRVAR(unicodedata_name__doc__, 1033"name(unichr[, default])\n\ 1034Returns the name assigned to the Unicode character unichr as a\n\ 1035string. 
If no name is defined, default is returned, or, if not\n\ 1036given, ValueError is raised."); 1037 1038static PyObject * 1039unicodedata_name(PyObject* self, PyObject* args) 1040{ 1041 char name[NAME_MAXLEN]; 1042 1043 PyUnicodeObject* v; 1044 PyObject* defobj = NULL; 1045 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj)) 1046 return NULL; 1047 1048 if (PyUnicode_GET_SIZE(v) != 1) { 1049 PyErr_SetString(PyExc_TypeError, 1050 "need a single Unicode character as parameter"); 1051 return NULL; 1052 } 1053 1054 if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v), 1055 name, sizeof(name))) { 1056 if (defobj == NULL) { 1057 PyErr_SetString(PyExc_ValueError, "no such name"); 1058 return NULL; 1059 } 1060 else { 1061 Py_INCREF(defobj); 1062 return defobj; 1063 } 1064 } 1065 1066 return Py_BuildValue("s", name); 1067} 1068 1069PyDoc_STRVAR(unicodedata_lookup__doc__, 1070"lookup(name)\n\ 1071\n\ 1072Look up character by name. If a character with the\n\ 1073given name is found, return the corresponding Unicode\n\ 1074character. If not found, KeyError is raised."); 1075 1076static PyObject * 1077unicodedata_lookup(PyObject* self, PyObject* args) 1078{ 1079 Py_UCS4 code; 1080 Py_UNICODE str[1]; 1081 char errbuf[256]; 1082 1083 char* name; 1084 int namelen; 1085 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) 1086 return NULL; 1087 1088 if (!_getcode(self, name, namelen, &code)) { 1089 /* XXX(nnorwitz): why are we allocating for the error msg? 1090 Why not always use snprintf? 
*/ 1091 char fmt[] = "undefined character name '%s'"; 1092 char *buf = PyMem_MALLOC(sizeof(fmt) + namelen); 1093 if (buf) 1094 sprintf(buf, fmt, name); 1095 else { 1096 buf = errbuf; 1097 PyOS_snprintf(buf, sizeof(errbuf), fmt, name); 1098 } 1099 PyErr_SetString(PyExc_KeyError, buf); 1100 if (buf != errbuf) 1101 PyMem_FREE(buf); 1102 return NULL; 1103 } 1104 1105 str[0] = (Py_UNICODE) code; 1106 return PyUnicode_FromUnicode(str, 1); 1107} 1108 1109/* XXX Add doc strings. */ 1110 1111static PyMethodDef unicodedata_functions[] = { 1112 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__}, 1113 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__}, 1114 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__}, 1115 {"category", unicodedata_category, METH_VARARGS, 1116 unicodedata_category__doc__}, 1117 {"bidirectional", unicodedata_bidirectional, METH_VARARGS, 1118 unicodedata_bidirectional__doc__}, 1119 {"combining", unicodedata_combining, METH_VARARGS, 1120 unicodedata_combining__doc__}, 1121 {"mirrored", unicodedata_mirrored, METH_VARARGS, 1122 unicodedata_mirrored__doc__}, 1123 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS, 1124 unicodedata_east_asian_width__doc__}, 1125 {"decomposition", unicodedata_decomposition, METH_VARARGS, 1126 unicodedata_decomposition__doc__}, 1127 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__}, 1128 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__}, 1129 {"normalize", unicodedata_normalize, METH_VARARGS, 1130 unicodedata_normalize__doc__}, 1131 {NULL, NULL} /* sentinel */ 1132}; 1133 1134static PyTypeObject UCD_Type = { 1135 /* The ob_type field must be initialized in the module init function 1136 * to be portable to Windows without using C++. 
*/ 1137 PyObject_HEAD_INIT(NULL) 1138 0, /*ob_size*/ 1139 "unicodedata.UCD", /*tp_name*/ 1140 sizeof(PreviousDBVersion), /*tp_basicsize*/ 1141 0, /*tp_itemsize*/ 1142 /* methods */ 1143 (destructor)PyObject_Del, /*tp_dealloc*/ 1144 0, /*tp_print*/ 1145 0, /*tp_getattr*/ 1146 0, /*tp_setattr*/ 1147 0, /*tp_compare*/ 1148 0, /*tp_repr*/ 1149 0, /*tp_as_number*/ 1150 0, /*tp_as_sequence*/ 1151 0, /*tp_as_mapping*/ 1152 0, /*tp_hash*/ 1153 0, /*tp_call*/ 1154 0, /*tp_str*/ 1155 PyObject_GenericGetAttr,/*tp_getattro*/ 1156 0, /*tp_setattro*/ 1157 0, /*tp_as_buffer*/ 1158 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 1159 0, /*tp_doc*/ 1160 0, /*tp_traverse*/ 1161 0, /*tp_clear*/ 1162 0, /*tp_richcompare*/ 1163 0, /*tp_weaklistoffset*/ 1164 0, /*tp_iter*/ 1165 0, /*tp_iternext*/ 1166 unicodedata_functions, /*tp_methods*/ 1167 DB_members, /*tp_members*/ 1168 0, /*tp_getset*/ 1169 0, /*tp_base*/ 1170 0, /*tp_dict*/ 1171 0, /*tp_descr_get*/ 1172 0, /*tp_descr_set*/ 1173 0, /*tp_dictoffset*/ 1174 0, /*tp_init*/ 1175 0, /*tp_alloc*/ 1176 0, /*tp_new*/ 1177 0, /*tp_free*/ 1178 0, /*tp_is_gc*/ 1179}; 1180 1181PyDoc_STRVAR(unicodedata_docstring, 1182"This module provides access to the Unicode Character Database which\n\ 1183defines character properties for all Unicode characters. 
The data in\n\ 1184this database is based on the UnicodeData.txt file version\n\ 11854.1.0 which is publically available from ftp://ftp.unicode.org/.\n\ 1186\n\ 1187The module uses the same names and symbols as defined by the\n\ 1188UnicodeData File Format 4.1.0 (see\n\ 1189http://www.unicode.org/Public/4.1.0/ucd/UCD.html)."); 1190 1191PyMODINIT_FUNC 1192initunicodedata(void) 1193{ 1194 PyObject *m, *v; 1195 1196 UCD_Type.ob_type = &PyType_Type; 1197 1198 m = Py_InitModule3( 1199 "unicodedata", unicodedata_functions, unicodedata_docstring); 1200 if (!m) 1201 return; 1202 1203 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION); 1204 Py_INCREF(&UCD_Type); 1205 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type); 1206 1207 /* Previous versions */ 1208 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0); 1209 if (v != NULL) 1210 PyModule_AddObject(m, "ucd_3_2_0", v); 1211 1212 /* Export C API */ 1213 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); 1214 if (v != NULL) 1215 PyModule_AddObject(m, "ucnhash_CAPI", v); 1216} 1217 1218/* 1219Local variables: 1220c-basic-offset: 4 1221indent-tabs-mode: nil 1222End: 1223*/ 1224