unicodedata.c revision e988e286b2831382deb7c69b26c74ed185f51696
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.1 data base.

   Data was extracted from the Unicode 5.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Loewis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"

/* character properties */

/* One per-code-point record of the compressed property database
   (instances live in unicodedata_db.h, generated by makeunicodedata.py). */
typedef struct {
    const unsigned char category;           /* index into
                                               _PyUnicode_CategoryNames */
    const unsigned char combining;          /* combining class value 0 - 255 */
    const unsigned char bidirectional;      /* index into
                                               _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;           /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;   /* index into
                                               _PyUnicode_EastAsianWidth */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;

/* Delta record describing how a code point's properties differed in an
   older Unicode version; 0xFF in a field means "unchanged". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const int numeric_changed;
} change_record;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

/* Look up the property record for a code point via the two-level trie;
   out-of-range code points map to record 0 (the "unassigned" record). */
static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}

/* ------------- Previous-version API ------------------------------------- */
/* A UCD object exposing the database as of an older Unicode version;
   lookups are answered by patching current data with change records. */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;                               /* e.g. "3.2.0" */
    const change_record* (*getrecord)(Py_UCS4);     /* delta lookup */
    Py_UCS4 (*normalization)(Py_UCS4);              /* normalization override */
} PreviousDBVersion;

#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))

static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;

/* Create a UCD instance describing an older database version.
   `name` must outlive the object (callers pass string literals). */
static PyObject*
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}

/* Return the code point of a single-character unicode object (joining a
   surrogate pair on narrow builds), or (Py_UCS4)-1 with TypeError set. */
static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
    Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);

    if (PyUnicode_GET_SIZE(obj) == 1)
        return *v;
#ifndef Py_UNICODE_WIDE
    else if ((PyUnicode_GET_SIZE(obj) == 2) &&
             (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
             (0xDC00 <= v[1] && v[1] <= 0xDFFF))
        return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
#endif
    PyErr_SetString(PyExc_TypeError,
                    "need a single Unicode character as parameter");
    return (Py_UCS4)-1;
}

/* --- Module API --------------------------------------------------------- */

PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. \
If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

/* decimal(unichr[, default]) -> int
   Consults the old-version change record when self is a ucd_X_Y_Z object. */
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        /* Older database requested: apply its change record first. */
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned in the old version */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

/* digit(unichr[, default]) -> int */
static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    rc = Py_UNICODE_TODIGIT(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

/* numeric(unichr[, default]) -> float */
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned in the old version */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            /* NOTE(review): this reads decimal_changed although
               change_record also declares a numeric_changed field that is
               never consulted here — verify against merge_old_version in
               Tools/unicode/makeunicodedata.py. */
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}

PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category \
assigned to the Unicode character\n\
unichr as string.");

/* category(unichr) -> str, e.g. "Lu"; honors old-version change records. */
static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->category;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
}

PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional category assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");

/* bidirectional(unichr) -> str, e.g. "L"; "" for undefined. */
static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->bidirectional;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}

PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");

/* combining(unichr) -> int (canonical combining class, 0-255). */
static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->combining;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");

/* mirrored(unichr) -> 0 or 1. */
static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->mirrored;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->mirrored_changed != 0xFF)
            index = old->mirrored_changed;
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the east asian width assigned to the Unicode character\n\
unichr as string.");

/* east_asian_width(unichr) -> str, e.g. "W", "Na". */
static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index =
        (int) _getrecord_ex(c)->east_asian_width;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}

PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");

/* decomposition(unichr) -> str such as "<compat> 0020 0308" ("" if none). */
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;
    unsigned int prefix_index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    code = (int)c;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }

    /* Two-level trie lookup, mirroring _getrecord_ex. */
    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                              (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (from*/
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix.
*/ 442 prefix_index = decomp_data[index] & 255; 443 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix))); 444 445 /* copy prefix */ 446 i = strlen(decomp_prefix[prefix_index]); 447 memcpy(decomp, decomp_prefix[prefix_index], i); 448 449 while (count-- > 0) { 450 if (i) 451 decomp[i++] = ' '; 452 assert((size_t)i < sizeof(decomp)); 453 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X", 454 decomp_data[++index]); 455 i += strlen(decomp + i); 456 } 457 458 decomp[i] = '\0'; 459 460 return PyString_FromString(decomp); 461} 462 463static void 464get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count) 465{ 466 if (code >= 0x110000) { 467 *index = 0; 468 } else if (self && get_old_record(self, code)->category_changed==0) { 469 /* unassigned in old version */ 470 *index = 0; 471 } 472 else { 473 *index = decomp_index1[(code>>DECOMP_SHIFT)]; 474 *index = decomp_index2[(*index<<DECOMP_SHIFT)+ 475 (code&((1<<DECOMP_SHIFT)-1))]; 476 } 477 478 /* high byte is number of hex bytes (usually one or two), low byte 479 is prefix code (from*/ 480 *count = decomp_data[*index] >> 8; 481 *prefix = decomp_data[*index] & 255; 482 483 (*index)++; 484} 485 486#define SBase 0xAC00 487#define LBase 0x1100 488#define VBase 0x1161 489#define TBase 0x11A7 490#define LCount 19 491#define VCount 21 492#define TCount 28 493#define NCount (VCount*TCount) 494#define SCount (LCount*NCount) 495 496static PyObject* 497nfd_nfkd(PyObject *self, PyObject *input, int k) 498{ 499 PyObject *result; 500 Py_UNICODE *i, *end, *o; 501 /* Longest decomposition in Unicode 3.2: U+FDFA */ 502 Py_UNICODE stack[20]; 503 Py_ssize_t space, isize; 504 int index, prefix, count, stackptr; 505 unsigned char prev, cur; 506 507 stackptr = 0; 508 isize = PyUnicode_GET_SIZE(input); 509 /* Overallocate atmost 10 characters. */ 510 space = (isize > 10 ? 
10 : isize) + isize; 511 result = PyUnicode_FromUnicode(NULL, space); 512 if (!result) 513 return NULL; 514 i = PyUnicode_AS_UNICODE(input); 515 end = i + isize; 516 o = PyUnicode_AS_UNICODE(result); 517 518 while (i < end) { 519 stack[stackptr++] = *i++; 520 while(stackptr) { 521 Py_UNICODE code = stack[--stackptr]; 522 /* Hangul Decomposition adds three characters in 523 a single step, so we need atleast that much room. */ 524 if (space < 3) { 525 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10; 526 space += 10; 527 if (PyUnicode_Resize(&result, newsize) == -1) 528 return NULL; 529 o = PyUnicode_AS_UNICODE(result) + newsize - space; 530 } 531 /* Hangul Decomposition. */ 532 if (SBase <= code && code < (SBase+SCount)) { 533 int SIndex = code - SBase; 534 int L = LBase + SIndex / NCount; 535 int V = VBase + (SIndex % NCount) / TCount; 536 int T = TBase + SIndex % TCount; 537 *o++ = L; 538 *o++ = V; 539 space -= 2; 540 if (T != TBase) { 541 *o++ = T; 542 space --; 543 } 544 continue; 545 } 546 /* normalization changes */ 547 if (self) { 548 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code); 549 if (value != 0) { 550 stack[stackptr++] = value; 551 continue; 552 } 553 } 554 555 /* Other decompositions. */ 556 get_decomp_record(self, code, &index, &prefix, &count); 557 558 /* Copy character if it is not decomposable, or has a 559 compatibility decomposition, but we do NFD. */ 560 if (!count || (prefix && !k)) { 561 *o++ = code; 562 space--; 563 continue; 564 } 565 /* Copy decomposition onto the stack, in reverse 566 order. */ 567 while(count) { 568 code = decomp_data[index + (--count)]; 569 stack[stackptr++] = code; 570 } 571 } 572 } 573 574 /* Drop overallocation. Cannot fail. */ 575 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space); 576 577 /* Sort canonically. 
*/ 578 i = PyUnicode_AS_UNICODE(result); 579 prev = _getrecord_ex(*i)->combining; 580 end = i + PyUnicode_GET_SIZE(result); 581 for (i++; i < end; i++) { 582 cur = _getrecord_ex(*i)->combining; 583 if (prev == 0 || cur == 0 || prev <= cur) { 584 prev = cur; 585 continue; 586 } 587 /* Non-canonical order. Need to switch *i with previous. */ 588 o = i - 1; 589 while (1) { 590 Py_UNICODE tmp = o[1]; 591 o[1] = o[0]; 592 o[0] = tmp; 593 o--; 594 if (o < PyUnicode_AS_UNICODE(result)) 595 break; 596 prev = _getrecord_ex(*o)->combining; 597 if (prev == 0 || prev <= cur) 598 break; 599 } 600 prev = _getrecord_ex(*i)->combining; 601 } 602 return result; 603} 604 605static int 606find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code) 607{ 608 int index; 609 for (index = 0; nfc[index].start; index++) { 610 int start = nfc[index].start; 611 if (code < start) 612 return -1; 613 if (code <= start + nfc[index].count) { 614 int delta = code - start; 615 return nfc[index].index + delta; 616 } 617 } 618 return -1; 619} 620 621static PyObject* 622nfc_nfkc(PyObject *self, PyObject *input, int k) 623{ 624 PyObject *result; 625 Py_UNICODE *i, *i1, *o, *end; 626 int f,l,index,index1,comb; 627 Py_UNICODE code; 628 Py_UNICODE *skipped[20]; 629 int cskipped = 0; 630 631 result = nfd_nfkd(self, input, k); 632 if (!result) 633 return NULL; 634 635 /* We are going to modify result in-place. 636 If nfd_nfkd is changed to sometimes return the input, 637 this code needs to be reviewed. */ 638 assert(result != input); 639 640 i = PyUnicode_AS_UNICODE(result); 641 end = i + PyUnicode_GET_SIZE(result); 642 o = PyUnicode_AS_UNICODE(result); 643 644 again: 645 while (i < end) { 646 for (index = 0; index < cskipped; index++) { 647 if (skipped[index] == i) { 648 /* *i character is skipped. 649 Remove from list. */ 650 skipped[index] = skipped[cskipped-1]; 651 cskipped--; 652 i++; 653 goto again; /* continue while */ 654 } 655 } 656 /* Hangul Composition. 
We don't need to check for <LV,T> 657 pairs, since we always have decomposed data. */ 658 if (LBase <= *i && *i < (LBase+LCount) && 659 i + 1 < end && 660 VBase <= i[1] && i[1] <= (VBase+VCount)) { 661 int LIndex, VIndex; 662 LIndex = i[0] - LBase; 663 VIndex = i[1] - VBase; 664 code = SBase + (LIndex*VCount+VIndex)*TCount; 665 i+=2; 666 if (i < end && 667 TBase <= *i && *i <= (TBase+TCount)) { 668 code += *i-TBase; 669 i++; 670 } 671 *o++ = code; 672 continue; 673 } 674 675 f = find_nfc_index(self, nfc_first, *i); 676 if (f == -1) { 677 *o++ = *i++; 678 continue; 679 } 680 /* Find next unblocked character. */ 681 i1 = i+1; 682 comb = 0; 683 while (i1 < end) { 684 int comb1 = _getrecord_ex(*i1)->combining; 685 if (comb1 && comb == comb1) { 686 /* Character is blocked. */ 687 i1++; 688 continue; 689 } 690 l = find_nfc_index(self, nfc_last, *i1); 691 /* *i1 cannot be combined with *i. If *i1 692 is a starter, we don't need to look further. 693 Otherwise, record the combining class. */ 694 if (l == -1) { 695 not_combinable: 696 if (comb1 == 0) 697 break; 698 comb = comb1; 699 i1++; 700 continue; 701 } 702 index = f*TOTAL_LAST + l; 703 index1 = comp_index[index >> COMP_SHIFT]; 704 code = comp_data[(index1<<COMP_SHIFT)+ 705 (index&((1<<COMP_SHIFT)-1))]; 706 if (code == 0) 707 goto not_combinable; 708 709 /* Replace the original character. */ 710 *i = code; 711 /* Mark the second character unused. */ 712 skipped[cskipped++] = i1; 713 i1++; 714 f = find_nfc_index(self, nfc_first, *i); 715 if (f == -1) 716 break; 717 } 718 *o++ = *i++; 719 } 720 if (o != end) 721 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result)); 722 return result; 723} 724 725/* Return 1 if the input is certainly normalized, 0 if it might not be. 
   */
static int
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
{
    Py_UNICODE *i, *end;
    unsigned char prev_combining = 0, quickcheck_mask;

    /* An older version of the database is requested, quickchecks must be
       disabled. */
    if (self != NULL)
        return 0;

    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
       as described in http://unicode.org/reports/tr15/#Annex8. */
    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));

    i = PyUnicode_AS_UNICODE(input);
    end = i + PyUnicode_GET_SIZE(input);
    while (i < end) {
        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
        unsigned char combining = record->combining;
        unsigned char quickcheck = record->normalization_quick_check;

        if (quickcheck & quickcheck_mask)
            return 0; /* this string might need normalization */
        if (combining && prev_combining > combining)
            return 0; /* non-canonical sort order, not normalized */
        prev_combining = combining;
    }
    return 1; /* certainly normalized */
}

PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr. Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");

/* normalize(form, unistr) -> unicode; dispatches on the form name and
   short-circuits (returning the input) when the quickcheck proves it is
   already normalized. */
static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    if(!PyArg_ParseTuple(args, "sO!:normalize",
                         &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_GetSize(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0) {
        if (is_normalized(self, input, 1, 0)) {
            Py_INCREF(input);
            return input;
        }
        return nfc_nfkc(self, input, 0);
    }
    if (strcmp(form, "NFKC") == 0) {
        if (is_normalized(self, input, 1, 1)) {
            Py_INCREF(input);
            return input;
        }
        return nfc_nfkc(self, input, 1);
    }
    if (strcmp(form, "NFD") == 0) {
        if (is_normalized(self, input, 0, 0)) {
            Py_INCREF(input);
            return input;
        }
        return nfd_nfkd(self, input, 0);
    }
    if (strcmp(form, "NFKD") == 0) {
        if (is_normalized(self, input, 0, 1)) {
            Py_INCREF(input);
            return input;
        }
        return nfd_nfkd(self, input, 1);
    }
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

/* Case-folding hash over a character name, matching the hash used by
   makeunicodedata.py to build code_hash. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}

/* Romanized jamo spellings, one row per index; columns are
   leading consonant (L), vowel (V), trailing consonant (T). */
static char *hangul_syllables[][3] = {
    { "G", "A", "" },
    { "GG", "AE", "G" },
    { "N", "YA", "GG" },
    { "D", "YAE", "GS" },
    { "DD", "EO", "N", },
    { "R", "E", "NJ" },
    { "M", "YEO", "NH" },
    { "B", "YE", "D" },
    { "BB", "O", "L" },
    { "S", "WA", "LG" },
    { "SS", "WAE", "LM" },
    { "", "OE", "LB" },
    { "J", "YO", "LS" },
    { "JJ", "U", "LT" },
    { "C", "WEO", "LP" },
    { "K", "WE", "LH" },
    { "T", "WI", "M"
    },
    { "P", "YU", "B" },
    { "H", "EU", "BS" },
    { 0, "YI", "S" },
    { 0, "I", "SS" },
    { 0, 0, "NG" },
    { 0, 0, "J" },
    { 0, 0, "C" },
    { 0, 0, "K" },
    { 0, 0, "T" },
    { 0, 0, "P" },
    { 0, 0, "H" }
};

/* True for the CJK unified ideograph ranges whose names are algorithmic
   ("CJK UNIFIED IDEOGRAPH-XXXX"). */
static int
is_unified_ideograph(Py_UCS4 code)
{
    return (
        (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
}

/* Write the name of `code` into buffer (NUL-terminated, at most buflen
   bytes).  Returns 1 on success, 0 if the character has no name or the
   buffer is too small.  Hangul and CJK names are generated; everything
   else comes from the compressed phrasebook. */
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            /* NOTE(review): when i == buflen this check passes and the
               separator is written at buffer[buflen] — looks like an
               off-by-one; confirm intended bound. */
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set. the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}

/* Case-insensitive check that `name` is the full name of code point
   `code` (namelen characters, not NUL-terminated). */
static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (toupper(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}

/* Longest-prefix match of str against column `column` of
   hangul_syllables; stores the matched length in *len (0 if none)
   and the row index in *pos. */
static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        char *s = hangul_syllables[i][column];
        len1 = strlen(s);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}

/* Resolve a character name to a code point (inverse of _getucname).
   Handles generated Hangul/CJK names first, then probes the name hash
   table built by makeunicodedata.py. */
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs.
       */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes. see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
    /* Open addressing with a polynomial probe sequence. */
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}

/* ucnhash C API exported for \N{...} escape resolution. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};

/* -------------------------------------------------------------------- */
/* Python bindings */

PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");

/* name(unichr[, default]) -> str */
static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];
    Py_UCS4 c;

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (!_getucname(self, c, name, sizeof(name))) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return Py_BuildValue("s", name);
}

PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name. If a character with the\n\
given name is found, return the corresponding Unicode\n\
character. If not found, KeyError is raised.");

/* lookup(name) -> unicode (length 1, or 2 on narrow builds for
   non-BMP characters encoded as a surrogate pair). */
static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[2];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
                     name);
        return NULL;
    }

#ifndef Py_UNICODE_WIDE
    if (code >= 0x10000) {
        /* narrow build: return a surrogate pair */
        str[0] = 0xd800 + ((code - 0x10000) >> 10);
        str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
        return PyUnicode_FromUnicode(str, 2);
    }
#endif
    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}

/* XXX Add doc strings.
   */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
     unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
     unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
     unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
     unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
     unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
     unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
     unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};

/* The UCD type: instances represent snapshots of older database
   versions; the same methods are also exposed as module functions
   (with self == NULL meaning "current version"). */
static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,    /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};

PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 5.1.0 (see\n\
http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");

/* Module initialization: registers the functions, the UCD type, the
   3.2.0 snapshot object and the ucnhash C API pointer. */
PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    Py_TYPE(&UCD_Type) = &PyType_Type;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}

/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/