unicodedata.c revision e988e286b2831382deb7c69b26c74ed185f51696
1/* ------------------------------------------------------------------------
2
3   unicodedata -- Provides access to the Unicode 5.1 data base.
4
5   Data was extracted from the Unicode 5.1 UnicodeData.txt file.
6
7   Written by Marc-Andre Lemburg (mal@lemburg.com).
8   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9   Modified by Martin v. Löwis (martin@v.loewis.de)
10
11   Copyright (c) Corporation for National Research Initiatives.
12
13   ------------------------------------------------------------------------ */
14
15#include "Python.h"
16#include "ucnhash.h"
17#include "structmember.h"
18
19/* character properties */
20
/* Per-character property record; all six fields are single bytes. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidth */
    const unsigned char east_asian_width;
    /* quick-check bits; see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
32
/* Differences of one character relative to the current database.
   The sequence of fields should be the same as in merge_old_version.
   The accessors in this file treat 0xFF in a byte field as "no change"
   and category_changed == 0 as "unassigned in the old version". */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const int numeric_changed;
} change_record;
41
42/* data file generated by Tools/unicode/makeunicodedata.py */
43#include "unicodedata_db.h"
44
45static const _PyUnicode_DatabaseRecord*
46_getrecord_ex(Py_UCS4 code)
47{
48    int index;
49    if (code >= 0x110000)
50        index = 0;
51    else {
52        index = index1[(code>>SHIFT)];
53        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54    }
55
56    return &_PyUnicode_Database_Records[index];
57}
58
59/* ------------- Previous-version API ------------------------------------- */
60typedef struct previous_version {
61    PyObject_HEAD
62    const char *name;
63    const change_record* (*getrecord)(Py_UCS4);
64    Py_UCS4 (*normalization)(Py_UCS4);
65} PreviousDBVersion;
66
/* Fetch the change record of code point `v` from the old-version object
   `self`.  Both arguments are parenthesized so that arbitrary
   expressions can be passed safely. */
#define get_old_record(self, v) \
    ((((PreviousDBVersion*)(self))->getrecord)(v))
68
69static PyMemberDef DB_members[] = {
70	{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
71        {NULL}
72};
73
74/* forward declaration */
75static PyTypeObject UCD_Type;
76
77static PyObject*
78new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
79                     Py_UCS4 (*normalization)(Py_UCS4))
80{
81	PreviousDBVersion *self;
82	self = PyObject_New(PreviousDBVersion, &UCD_Type);
83	if (self == NULL)
84		return NULL;
85	self->name = name;
86	self->getrecord = getrecord;
87        self->normalization = normalization;
88	return (PyObject*)self;
89}
90
91
92static Py_UCS4 getuchar(PyUnicodeObject *obj)
93{
94    Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
95
96    if (PyUnicode_GET_SIZE(obj) == 1)
97	return *v;
98#ifndef Py_UNICODE_WIDE
99    else if ((PyUnicode_GET_SIZE(obj) == 2) &&
100             (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
101             (0xDC00 <= v[1] && v[1] <= 0xDFFF))
102	return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
103#endif
104    PyErr_SetString(PyExc_TypeError,
105                    "need a single Unicode character as parameter");
106    return (Py_UCS4)-1;
107}
108
109/* --- Module API --------------------------------------------------------- */
110
111PyDoc_STRVAR(unicodedata_decimal__doc__,
112"decimal(unichr[, default])\n\
113\n\
114Returns the decimal value assigned to the Unicode character unichr\n\
115as integer. If no such value is defined, default is returned, or, if\n\
116not given, ValueError is raised.");
117
118static PyObject *
119unicodedata_decimal(PyObject *self, PyObject *args)
120{
121    PyUnicodeObject *v;
122    PyObject *defobj = NULL;
123    int have_old = 0;
124    long rc;
125    Py_UCS4 c;
126
127    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
128        return NULL;
129    c = getuchar(v);
130    if (c == (Py_UCS4)-1)
131        return NULL;
132
133    if (self) {
134        const change_record *old = get_old_record(self, c);
135        if (old->category_changed == 0) {
136            /* unassigned */
137            have_old = 1;
138            rc = -1;
139        }
140        else if (old->decimal_changed != 0xFF) {
141            have_old = 1;
142            rc = old->decimal_changed;
143        }
144    }
145
146    if (!have_old)
147        rc = Py_UNICODE_TODECIMAL(c);
148    if (rc < 0) {
149	if (defobj == NULL) {
150	    PyErr_SetString(PyExc_ValueError,
151			    "not a decimal");
152            return NULL;
153	}
154	else {
155	    Py_INCREF(defobj);
156	    return defobj;
157	}
158    }
159    return PyInt_FromLong(rc);
160}
161
162PyDoc_STRVAR(unicodedata_digit__doc__,
163"digit(unichr[, default])\n\
164\n\
165Returns the digit value assigned to the Unicode character unichr as\n\
166integer. If no such value is defined, default is returned, or, if\n\
167not given, ValueError is raised.");
168
169static PyObject *
170unicodedata_digit(PyObject *self, PyObject *args)
171{
172    PyUnicodeObject *v;
173    PyObject *defobj = NULL;
174    long rc;
175    Py_UCS4 c;
176
177    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
178        return NULL;
179    c = getuchar(v);
180    if (c == (Py_UCS4)-1)
181        return NULL;
182    rc = Py_UNICODE_TODIGIT(c);
183    if (rc < 0) {
184	if (defobj == NULL) {
185	    PyErr_SetString(PyExc_ValueError, "not a digit");
186            return NULL;
187	}
188	else {
189	    Py_INCREF(defobj);
190	    return defobj;
191	}
192    }
193    return PyInt_FromLong(rc);
194}
195
196PyDoc_STRVAR(unicodedata_numeric__doc__,
197"numeric(unichr[, default])\n\
198\n\
199Returns the numeric value assigned to the Unicode character unichr\n\
200as float. If no such value is defined, default is returned, or, if\n\
201not given, ValueError is raised.");
202
203static PyObject *
204unicodedata_numeric(PyObject *self, PyObject *args)
205{
206    PyUnicodeObject *v;
207    PyObject *defobj = NULL;
208    int have_old = 0;
209    double rc;
210    Py_UCS4 c;
211
212    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
213        return NULL;
214    c = getuchar(v);
215    if (c == (Py_UCS4)-1)
216        return NULL;
217
218    if (self) {
219        const change_record *old = get_old_record(self, c);
220        if (old->category_changed == 0) {
221            /* unassigned */
222            have_old = 1;
223            rc = -1.0;
224        }
225        else if (old->decimal_changed != 0xFF) {
226            have_old = 1;
227            rc = old->decimal_changed;
228        }
229    }
230
231    if (!have_old)
232        rc = Py_UNICODE_TONUMERIC(c);
233    if (rc == -1.0) {
234	if (defobj == NULL) {
235	    PyErr_SetString(PyExc_ValueError, "not a numeric character");
236	    return NULL;
237	}
238	else {
239	    Py_INCREF(defobj);
240	    return defobj;
241	}
242    }
243    return PyFloat_FromDouble(rc);
244}
245
246PyDoc_STRVAR(unicodedata_category__doc__,
247"category(unichr)\n\
248\n\
249Returns the general category assigned to the Unicode character\n\
250unichr as string.");
251
252static PyObject *
253unicodedata_category(PyObject *self, PyObject *args)
254{
255    PyUnicodeObject *v;
256    int index;
257    Py_UCS4 c;
258
259    if (!PyArg_ParseTuple(args, "O!:category",
260			  &PyUnicode_Type, &v))
261	return NULL;
262    c = getuchar(v);
263    if (c == (Py_UCS4)-1)
264        return NULL;
265    index = (int) _getrecord_ex(c)->category;
266    if (self) {
267        const change_record *old = get_old_record(self, c);
268        if (old->category_changed != 0xFF)
269            index = old->category_changed;
270    }
271    return PyString_FromString(_PyUnicode_CategoryNames[index]);
272}
273
274PyDoc_STRVAR(unicodedata_bidirectional__doc__,
275"bidirectional(unichr)\n\
276\n\
277Returns the bidirectional category assigned to the Unicode character\n\
278unichr as string. If no such value is defined, an empty string is\n\
279returned.");
280
281static PyObject *
282unicodedata_bidirectional(PyObject *self, PyObject *args)
283{
284    PyUnicodeObject *v;
285    int index;
286    Py_UCS4 c;
287
288    if (!PyArg_ParseTuple(args, "O!:bidirectional",
289			  &PyUnicode_Type, &v))
290	return NULL;
291    c = getuchar(v);
292    if (c == (Py_UCS4)-1)
293        return NULL;
294    index = (int) _getrecord_ex(c)->bidirectional;
295    if (self) {
296        const change_record *old = get_old_record(self, c);
297        if (old->category_changed == 0)
298            index = 0; /* unassigned */
299        else if (old->bidir_changed != 0xFF)
300            index = old->bidir_changed;
301    }
302    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
303}
304
305PyDoc_STRVAR(unicodedata_combining__doc__,
306"combining(unichr)\n\
307\n\
308Returns the canonical combining class assigned to the Unicode\n\
309character unichr as integer. Returns 0 if no combining class is\n\
310defined.");
311
312static PyObject *
313unicodedata_combining(PyObject *self, PyObject *args)
314{
315    PyUnicodeObject *v;
316    int index;
317    Py_UCS4 c;
318
319    if (!PyArg_ParseTuple(args, "O!:combining",
320			  &PyUnicode_Type, &v))
321	return NULL;
322    c = getuchar(v);
323    if (c == (Py_UCS4)-1)
324        return NULL;
325    index = (int) _getrecord_ex(c)->combining;
326    if (self) {
327        const change_record *old = get_old_record(self, c);
328        if (old->category_changed == 0)
329            index = 0; /* unassigned */
330    }
331    return PyInt_FromLong(index);
332}
333
334PyDoc_STRVAR(unicodedata_mirrored__doc__,
335"mirrored(unichr)\n\
336\n\
337Returns the mirrored property assigned to the Unicode character\n\
338unichr as integer. Returns 1 if the character has been identified as\n\
339a \"mirrored\" character in bidirectional text, 0 otherwise.");
340
341static PyObject *
342unicodedata_mirrored(PyObject *self, PyObject *args)
343{
344    PyUnicodeObject *v;
345    int index;
346    Py_UCS4 c;
347
348    if (!PyArg_ParseTuple(args, "O!:mirrored",
349			  &PyUnicode_Type, &v))
350	return NULL;
351    c = getuchar(v);
352    if (c == (Py_UCS4)-1)
353        return NULL;
354    index = (int) _getrecord_ex(c)->mirrored;
355    if (self) {
356        const change_record *old = get_old_record(self, c);
357        if (old->category_changed == 0)
358            index = 0; /* unassigned */
359        else if (old->mirrored_changed != 0xFF)
360            index = old->mirrored_changed;
361    }
362    return PyInt_FromLong(index);
363}
364
365PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
366"east_asian_width(unichr)\n\
367\n\
368Returns the east asian width assigned to the Unicode character\n\
369unichr as string.");
370
371static PyObject *
372unicodedata_east_asian_width(PyObject *self, PyObject *args)
373{
374    PyUnicodeObject *v;
375    int index;
376    Py_UCS4 c;
377
378    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
379			  &PyUnicode_Type, &v))
380	return NULL;
381    c = getuchar(v);
382    if (c == (Py_UCS4)-1)
383        return NULL;
384    index = (int) _getrecord_ex(c)->east_asian_width;
385    if (self) {
386        const change_record *old = get_old_record(self, c);
387        if (old->category_changed == 0)
388            index = 0; /* unassigned */
389    }
390    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
391}
392
393PyDoc_STRVAR(unicodedata_decomposition__doc__,
394"decomposition(unichr)\n\
395\n\
396Returns the character decomposition mapping assigned to the Unicode\n\
397character unichr as string. An empty string is returned in case no\n\
398such mapping is defined.");
399
400static PyObject *
401unicodedata_decomposition(PyObject *self, PyObject *args)
402{
403    PyUnicodeObject *v;
404    char decomp[256];
405    int code, index, count, i;
406    unsigned int prefix_index;
407    Py_UCS4 c;
408
409    if (!PyArg_ParseTuple(args, "O!:decomposition",
410			  &PyUnicode_Type, &v))
411	return NULL;
412    c = getuchar(v);
413    if (c == (Py_UCS4)-1)
414        return NULL;
415
416    code = (int)c;
417
418    if (self) {
419        const change_record *old = get_old_record(self, c);
420        if (old->category_changed == 0)
421            return PyString_FromString(""); /* unassigned */
422    }
423
424    if (code < 0 || code >= 0x110000)
425        index = 0;
426    else {
427        index = decomp_index1[(code>>DECOMP_SHIFT)];
428        index = decomp_index2[(index<<DECOMP_SHIFT)+
429                             (code&((1<<DECOMP_SHIFT)-1))];
430    }
431
432    /* high byte is number of hex bytes (usually one or two), low byte
433       is prefix code (from*/
434    count = decomp_data[index] >> 8;
435
436    /* XXX: could allocate the PyString up front instead
437       (strlen(prefix) + 5 * count + 1 bytes) */
438
439    /* Based on how index is calculated above and decomp_data is generated
440       from Tools/unicode/makeunicodedata.py, it should not be possible
441       to overflow decomp_prefix. */
442    prefix_index = decomp_data[index] & 255;
443    assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
444
445    /* copy prefix */
446    i = strlen(decomp_prefix[prefix_index]);
447    memcpy(decomp, decomp_prefix[prefix_index], i);
448
449    while (count-- > 0) {
450        if (i)
451            decomp[i++] = ' ';
452        assert((size_t)i < sizeof(decomp));
453        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
454                      decomp_data[++index]);
455        i += strlen(decomp + i);
456    }
457
458    decomp[i] = '\0';
459
460    return PyString_FromString(decomp);
461}
462
463static void
464get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
465{
466    if (code >= 0x110000) {
467        *index = 0;
468    } else if (self && get_old_record(self, code)->category_changed==0) {
469        /* unassigned in old version */
470        *index = 0;
471    }
472    else {
473        *index = decomp_index1[(code>>DECOMP_SHIFT)];
474        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
475                               (code&((1<<DECOMP_SHIFT)-1))];
476    }
477
478    /* high byte is number of hex bytes (usually one or two), low byte
479       is prefix code (from*/
480    *count = decomp_data[*index] >> 8;
481    *prefix = decomp_data[*index] & 255;
482
483    (*index)++;
484}
485
/* Constants for arithmetic Hangul syllable (de)composition. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one before the first trailing consonant */
#define LCount  19                  /* number of leading consonants */
#define VCount  21                  /* number of vowels */
#define TCount  28                  /* trailing consonants, plus "none" */
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total number of syllables */
495
496static PyObject*
497nfd_nfkd(PyObject *self, PyObject *input, int k)
498{
499    PyObject *result;
500    Py_UNICODE *i, *end, *o;
501    /* Longest decomposition in Unicode 3.2: U+FDFA */
502    Py_UNICODE stack[20];
503    Py_ssize_t space, isize;
504    int index, prefix, count, stackptr;
505    unsigned char prev, cur;
506
507    stackptr = 0;
508    isize = PyUnicode_GET_SIZE(input);
509    /* Overallocate atmost 10 characters. */
510    space = (isize > 10 ? 10 : isize) + isize;
511    result = PyUnicode_FromUnicode(NULL, space);
512    if (!result)
513        return NULL;
514    i = PyUnicode_AS_UNICODE(input);
515    end = i + isize;
516    o = PyUnicode_AS_UNICODE(result);
517
518    while (i < end) {
519        stack[stackptr++] = *i++;
520        while(stackptr) {
521            Py_UNICODE code = stack[--stackptr];
522            /* Hangul Decomposition adds three characters in
523               a single step, so we need atleast that much room. */
524            if (space < 3) {
525                Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
526                space += 10;
527                if (PyUnicode_Resize(&result, newsize) == -1)
528                    return NULL;
529                o = PyUnicode_AS_UNICODE(result) + newsize - space;
530            }
531            /* Hangul Decomposition. */
532            if (SBase <= code && code < (SBase+SCount)) {
533                int SIndex = code - SBase;
534                int L = LBase + SIndex / NCount;
535                int V = VBase + (SIndex % NCount) / TCount;
536                int T = TBase + SIndex % TCount;
537                *o++ = L;
538                *o++ = V;
539                space -= 2;
540                if (T != TBase) {
541                    *o++ = T;
542                    space --;
543                }
544                continue;
545            }
546            /* normalization changes */
547            if (self) {
548                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
549                if (value != 0) {
550                    stack[stackptr++] = value;
551                    continue;
552                }
553            }
554
555            /* Other decompositions. */
556            get_decomp_record(self, code, &index, &prefix, &count);
557
558            /* Copy character if it is not decomposable, or has a
559               compatibility decomposition, but we do NFD. */
560            if (!count || (prefix && !k)) {
561                *o++ = code;
562                space--;
563                continue;
564            }
565            /* Copy decomposition onto the stack, in reverse
566               order.  */
567            while(count) {
568                code = decomp_data[index + (--count)];
569                stack[stackptr++] = code;
570            }
571        }
572    }
573
574    /* Drop overallocation. Cannot fail. */
575    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
576
577    /* Sort canonically. */
578    i = PyUnicode_AS_UNICODE(result);
579    prev = _getrecord_ex(*i)->combining;
580    end = i + PyUnicode_GET_SIZE(result);
581    for (i++; i < end; i++) {
582        cur = _getrecord_ex(*i)->combining;
583        if (prev == 0 || cur == 0 || prev <= cur) {
584            prev = cur;
585            continue;
586        }
587        /* Non-canonical order. Need to switch *i with previous. */
588        o = i - 1;
589        while (1) {
590            Py_UNICODE tmp = o[1];
591            o[1] = o[0];
592            o[0] = tmp;
593            o--;
594            if (o < PyUnicode_AS_UNICODE(result))
595                break;
596            prev = _getrecord_ex(*o)->combining;
597            if (prev == 0 || prev <= cur)
598                break;
599        }
600        prev = _getrecord_ex(*i)->combining;
601    }
602    return result;
603}
604
605static int
606find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
607{
608    int index;
609    for (index = 0; nfc[index].start; index++) {
610        int start = nfc[index].start;
611        if (code < start)
612            return -1;
613        if (code <= start + nfc[index].count) {
614            int delta = code - start;
615            return nfc[index].index + delta;
616        }
617    }
618    return -1;
619}
620
621static PyObject*
622nfc_nfkc(PyObject *self, PyObject *input, int k)
623{
624    PyObject *result;
625    Py_UNICODE *i, *i1, *o, *end;
626    int f,l,index,index1,comb;
627    Py_UNICODE code;
628    Py_UNICODE *skipped[20];
629    int cskipped = 0;
630
631    result = nfd_nfkd(self, input, k);
632    if (!result)
633        return NULL;
634
635    /* We are going to modify result in-place.
636       If nfd_nfkd is changed to sometimes return the input,
637       this code needs to be reviewed. */
638    assert(result != input);
639
640    i = PyUnicode_AS_UNICODE(result);
641    end = i + PyUnicode_GET_SIZE(result);
642    o = PyUnicode_AS_UNICODE(result);
643
644  again:
645    while (i < end) {
646      for (index = 0; index < cskipped; index++) {
647          if (skipped[index] == i) {
648              /* *i character is skipped.
649                 Remove from list. */
650              skipped[index] = skipped[cskipped-1];
651              cskipped--;
652              i++;
653              goto again; /* continue while */
654          }
655      }
656      /* Hangul Composition. We don't need to check for <LV,T>
657         pairs, since we always have decomposed data. */
658      if (LBase <= *i && *i < (LBase+LCount) &&
659          i + 1 < end &&
660          VBase <= i[1] && i[1] <= (VBase+VCount)) {
661          int LIndex, VIndex;
662          LIndex = i[0] - LBase;
663          VIndex = i[1] - VBase;
664          code = SBase + (LIndex*VCount+VIndex)*TCount;
665          i+=2;
666          if (i < end &&
667              TBase <= *i && *i <= (TBase+TCount)) {
668              code += *i-TBase;
669              i++;
670          }
671          *o++ = code;
672          continue;
673      }
674
675      f = find_nfc_index(self, nfc_first, *i);
676      if (f == -1) {
677          *o++ = *i++;
678          continue;
679      }
680      /* Find next unblocked character. */
681      i1 = i+1;
682      comb = 0;
683      while (i1 < end) {
684          int comb1 = _getrecord_ex(*i1)->combining;
685          if (comb1 && comb == comb1) {
686              /* Character is blocked. */
687              i1++;
688              continue;
689          }
690          l = find_nfc_index(self, nfc_last, *i1);
691          /* *i1 cannot be combined with *i. If *i1
692             is a starter, we don't need to look further.
693             Otherwise, record the combining class. */
694          if (l == -1) {
695            not_combinable:
696              if (comb1 == 0)
697                  break;
698              comb = comb1;
699              i1++;
700              continue;
701          }
702          index = f*TOTAL_LAST + l;
703          index1 = comp_index[index >> COMP_SHIFT];
704          code = comp_data[(index1<<COMP_SHIFT)+
705                           (index&((1<<COMP_SHIFT)-1))];
706          if (code == 0)
707              goto not_combinable;
708
709          /* Replace the original character. */
710          *i = code;
711          /* Mark the second character unused. */
712          skipped[cskipped++] = i1;
713          i1++;
714          f = find_nfc_index(self, nfc_first, *i);
715          if (f == -1)
716              break;
717      }
718      *o++ = *i++;
719    }
720    if (o != end)
721        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
722    return result;
723}
724
725/* Return 1 if the input is certainly normalized, 0 if it might not be. */
726static int
727is_normalized(PyObject *self, PyObject *input, int nfc, int k)
728{
729    Py_UNICODE *i, *end;
730    unsigned char prev_combining = 0, quickcheck_mask;
731
732    /* An older version of the database is requested, quickchecks must be
733       disabled. */
734    if (self != NULL)
735        return 0;
736
737    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
738       as described in http://unicode.org/reports/tr15/#Annex8. */
739    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
740
741    i = PyUnicode_AS_UNICODE(input);
742    end = i + PyUnicode_GET_SIZE(input);
743    while (i < end) {
744        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
745        unsigned char combining = record->combining;
746        unsigned char quickcheck = record->normalization_quick_check;
747
748        if (quickcheck & quickcheck_mask)
749            return 0; /* this string might need normalization */
750        if (combining && prev_combining > combining)
751            return 0; /* non-canonical sort order, not normalized */
752        prev_combining = combining;
753    }
754    return 1; /* certainly normalized */
755}
756
757PyDoc_STRVAR(unicodedata_normalize__doc__,
758"normalize(form, unistr)\n\
759\n\
760Return the normal form 'form' for the Unicode string unistr.  Valid\n\
761values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
762
763static PyObject*
764unicodedata_normalize(PyObject *self, PyObject *args)
765{
766    char *form;
767    PyObject *input;
768
769    if(!PyArg_ParseTuple(args, "sO!:normalize",
770                         &form, &PyUnicode_Type, &input))
771        return NULL;
772
773    if (PyUnicode_GetSize(input) == 0) {
774        /* Special case empty input strings, since resizing
775           them  later would cause internal errors. */
776        Py_INCREF(input);
777        return input;
778    }
779
780    if (strcmp(form, "NFC") == 0) {
781        if (is_normalized(self, input, 1, 0)) {
782            Py_INCREF(input);
783            return input;
784        }
785        return nfc_nfkc(self, input, 0);
786    }
787    if (strcmp(form, "NFKC") == 0) {
788        if (is_normalized(self, input, 1, 1)) {
789            Py_INCREF(input);
790            return input;
791        }
792        return nfc_nfkc(self, input, 1);
793    }
794    if (strcmp(form, "NFD") == 0) {
795        if (is_normalized(self, input, 0, 0)) {
796            Py_INCREF(input);
797            return input;
798        }
799        return nfd_nfkd(self, input, 0);
800    }
801    if (strcmp(form, "NFKD") == 0) {
802        if (is_normalized(self, input, 0, 1)) {
803            Py_INCREF(input);
804            return input;
805        }
806        return nfd_nfkd(self, input, 1);
807    }
808    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
809    return NULL;
810}
811
812/* -------------------------------------------------------------------- */
813/* unicode character name tables */
814
815/* data file generated by Tools/unicode/makeunicodedata.py */
816#include "unicodename_db.h"
817
818/* -------------------------------------------------------------------- */
819/* database code (cut and pasted from the unidb package) */
820
/* Case-insensitive hash of the first `len` bytes of `s`.  Each step
   multiplies the running value by `scale` and adds the upper-cased
   byte; whenever bits 24-31 become non-zero they are folded back into
   the low byte and the value is cut to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    unsigned long top;
    int pos = 0;

    while (pos < len) {
        hash = (hash * scale) + (unsigned char) toupper(Py_CHARMASK(s[pos]));
        top = hash & 0xff000000;
        if (top != 0)
            hash = (hash ^ ((top>>24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
835
/* Name fragments used to build and parse Hangul syllable names.
   Column 0: leading consonants, column 1: vowels, column 2: trailing
   consonants.  The columns have different lengths; unused cells are 0. */
static char *hangul_syllables[][3] = {
    /* leading  vowel   trailing */
    { "G",      "A",    ""   },
    { "GG",     "AE",   "G"  },
    { "N",      "YA",   "GG" },
    { "D",      "YAE",  "GS" },
    { "DD",     "EO",   "N"  },
    { "R",      "E",    "NJ" },
    { "M",      "YEO",  "NH" },
    { "B",      "YE",   "D"  },
    { "BB",     "O",    "L"  },
    { "S",      "WA",   "LG" },
    { "SS",     "WAE",  "LM" },
    { "",       "OE",   "LB" },
    { "J",      "YO",   "LS" },
    { "JJ",     "U",    "LT" },
    { "C",      "WEO",  "LP" },
    { "K",      "WE",   "LH" },
    { "T",      "WI",   "M"  },
    { "P",      "YU",   "B"  },
    { "H",      "EU",   "BS" },
    { 0,        "YI",   "S"  },
    { 0,        "I",    "SS" },
    { 0,        0,      "NG" },
    { 0,        0,      "J"  },
    { 0,        0,      "C"  },
    { 0,        0,      "K"  },
    { 0,        0,      "T"  },
    { 0,        0,      "P"  },
    { 0,        0,      "H"  }
};
866
867static int
868is_unified_ideograph(Py_UCS4 code)
869{
870    return (
871        (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
872        (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
873        (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
874}
875
876static int
877_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
878{
879    int offset;
880    int i;
881    int word;
882    unsigned char* w;
883
884    if (code >= 0x110000)
885        return 0;
886
887    if (self) {
888        const change_record *old = get_old_record(self, code);
889        if (old->category_changed == 0) {
890            /* unassigned */
891            return 0;
892        }
893    }
894
895    if (SBase <= code && code < SBase+SCount) {
896	/* Hangul syllable. */
897	int SIndex = code - SBase;
898	int L = SIndex / NCount;
899	int V = (SIndex % NCount) / TCount;
900	int T = SIndex % TCount;
901
902	if (buflen < 27)
903	    /* Worst case: HANGUL SYLLABLE <10chars>. */
904	    return 0;
905	strcpy(buffer, "HANGUL SYLLABLE ");
906	buffer += 16;
907	strcpy(buffer, hangul_syllables[L][0]);
908	buffer += strlen(hangul_syllables[L][0]);
909	strcpy(buffer, hangul_syllables[V][1]);
910	buffer += strlen(hangul_syllables[V][1]);
911	strcpy(buffer, hangul_syllables[T][2]);
912	buffer += strlen(hangul_syllables[T][2]);
913	*buffer = '\0';
914	return 1;
915    }
916
917    if (is_unified_ideograph(code)) {
918        if (buflen < 28)
919            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
920            return 0;
921        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
922        return 1;
923    }
924
925    /* get offset into phrasebook */
926    offset = phrasebook_offset1[(code>>phrasebook_shift)];
927    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
928                               (code&((1<<phrasebook_shift)-1))];
929    if (!offset)
930        return 0;
931
932    i = 0;
933
934    for (;;) {
935        /* get word index */
936        word = phrasebook[offset] - phrasebook_short;
937        if (word >= 0) {
938            word = (word << 8) + phrasebook[offset+1];
939            offset += 2;
940        } else
941            word = phrasebook[offset++];
942        if (i) {
943            if (i > buflen)
944                return 0; /* buffer overflow */
945            buffer[i++] = ' ';
946        }
947        /* copy word string from lexicon.  the last character in the
948           word has bit 7 set.  the last word in a string ends with
949           0x80 */
950        w = lexicon + lexicon_offset[word];
951        while (*w < 128) {
952            if (i >= buflen)
953                return 0; /* buffer overflow */
954            buffer[i++] = *w++;
955        }
956        if (i >= buflen)
957            return 0; /* buffer overflow */
958        buffer[i++] = *w & 127;
959        if (*w == 128)
960            break; /* end of word */
961    }
962
963    return 1;
964}
965
966static int
967_cmpname(PyObject *self, int code, const char* name, int namelen)
968{
969    /* check if code corresponds to the given name */
970    int i;
971    char buffer[NAME_MAXLEN];
972    if (!_getucname(self, code, buffer, sizeof(buffer)))
973        return 0;
974    for (i = 0; i < namelen; i++) {
975        if (toupper(Py_CHARMASK(name[i])) != buffer[i])
976            return 0;
977    }
978    return buffer[namelen] == '\0';
979}
980
981static void
982find_syllable(const char *str, int *len, int *pos, int count, int column)
983{
984    int i, len1;
985    *len = -1;
986    for (i = 0; i < count; i++) {
987	char *s = hangul_syllables[i][column];
988	len1 = strlen(s);
989	if (len1 <= *len)
990	    continue;
991	if (strncmp(str, s, len1) == 0) {
992	    *len = len1;
993	    *pos = i;
994	}
995    }
996    if (*len == -1) {
997	*len = 0;
998    }
999}
1000
1001static int
1002_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
1003{
1004    unsigned int h, v;
1005    unsigned int mask = code_size-1;
1006    unsigned int i, incr;
1007
1008    /* Check for hangul syllables. */
1009    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1010	int len, L = -1, V = -1, T = -1;
1011	const char *pos = name + 16;
1012	find_syllable(pos, &len, &L, LCount, 0);
1013	pos += len;
1014	find_syllable(pos, &len, &V, VCount, 1);
1015	pos += len;
1016	find_syllable(pos, &len, &T, TCount, 2);
1017	pos += len;
1018	if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1019	    *code = SBase + (L*VCount+V)*TCount + T;
1020	    return 1;
1021	}
1022        /* Otherwise, it's an illegal syllable name. */
1023        return 0;
1024    }
1025
1026    /* Check for unified ideographs. */
1027    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1028        /* Four or five hexdigits must follow. */
1029        v = 0;
1030        name += 22;
1031        namelen -= 22;
1032        if (namelen != 4 && namelen != 5)
1033            return 0;
1034        while (namelen--) {
1035            v *= 16;
1036            if (*name >= '0' && *name <= '9')
1037                v += *name - '0';
1038            else if (*name >= 'A' && *name <= 'F')
1039                v += *name - 'A' + 10;
1040            else
1041                return 0;
1042            name++;
1043        }
1044        if (!is_unified_ideograph(v))
1045            return 0;
1046        *code = v;
1047        return 1;
1048    }
1049
1050    /* the following is the same as python's dictionary lookup, with
1051       only minor changes.  see the makeunicodedata script for more
1052       details */
1053
1054    h = (unsigned int) _gethash(name, namelen, code_magic);
1055    i = (~h) & mask;
1056    v = code_hash[i];
1057    if (!v)
1058        return 0;
1059    if (_cmpname(self, v, name, namelen)) {
1060        *code = v;
1061        return 1;
1062    }
1063    incr = (h ^ (h >> 3)) & mask;
1064    if (!incr)
1065        incr = mask;
1066    for (;;) {
1067        i = (i + incr) & mask;
1068        v = code_hash[i];
1069        if (!v)
1070            return 0;
1071        if (_cmpname(self, v, name, namelen)) {
1072            *code = v;
1073            return 1;
1074        }
1075        incr = incr << 1;
1076        if (incr > mask)
1077            incr = incr ^ code_poly;
1078    }
1079}
1080
/* Table of the name lookup entry points.  initunicodedata() wraps the
   address of this struct in a CObject and stores it in the module as
   "ucnhash_CAPI".  The initializers are positional: the size of the
   struct, then the code-to-name function, then the name-to-code
   function. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
1087
1088/* -------------------------------------------------------------------- */
1089/* Python bindings */
1090
1091PyDoc_STRVAR(unicodedata_name__doc__,
1092"name(unichr[, default])\n\
1093Returns the name assigned to the Unicode character unichr as a\n\
1094string. If no name is defined, default is returned, or, if not\n\
1095given, ValueError is raised.");
1096
1097static PyObject *
1098unicodedata_name(PyObject* self, PyObject* args)
1099{
1100    char name[NAME_MAXLEN];
1101    Py_UCS4 c;
1102
1103    PyUnicodeObject* v;
1104    PyObject* defobj = NULL;
1105    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1106        return NULL;
1107
1108    c = getuchar(v);
1109    if (c == (Py_UCS4)-1)
1110        return NULL;
1111
1112    if (!_getucname(self, c, name, sizeof(name))) {
1113	if (defobj == NULL) {
1114	    PyErr_SetString(PyExc_ValueError, "no such name");
1115            return NULL;
1116	}
1117	else {
1118	    Py_INCREF(defobj);
1119	    return defobj;
1120	}
1121    }
1122
1123    return Py_BuildValue("s", name);
1124}
1125
1126PyDoc_STRVAR(unicodedata_lookup__doc__,
1127"lookup(name)\n\
1128\n\
1129Look up character by name.  If a character with the\n\
1130given name is found, return the corresponding Unicode\n\
1131character.  If not found, KeyError is raised.");
1132
1133static PyObject *
1134unicodedata_lookup(PyObject* self, PyObject* args)
1135{
1136    Py_UCS4 code;
1137    Py_UNICODE str[2];
1138
1139    char* name;
1140    int namelen;
1141    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1142        return NULL;
1143
1144    if (!_getcode(self, name, namelen, &code)) {
1145        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1146                     name);
1147        return NULL;
1148    }
1149
1150#ifndef Py_UNICODE_WIDE
1151    if (code >= 0x10000) {
1152        str[0] = 0xd800 + ((code - 0x10000) >> 10);
1153        str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1154        return PyUnicode_FromUnicode(str, 2);
1155    }
1156#endif
1157    str[0] = (Py_UNICODE) code;
1158    return PyUnicode_FromUnicode(str, 1);
1159}
1160
/* Method table.  It is passed to Py_InitModule3() for the module-level
   functions and is also installed as the tp_methods of UCD_Type.  Every
   entry takes METH_VARARGS and supplies a doc string. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}		/* sentinel */
};
1185
/* Type object published as unicodedata.UCD.  Its instances have the
   size of PreviousDBVersion, it shares the module's function table as
   its method table and exposes DB_members as its members.  The slots
   are filled in positionally, so their order must not be changed. */
static PyTypeObject UCD_Type = {
	/* The ob_type field must be initialized in the module init function
	 * to be portable to Windows without using C++. */
	PyVarObject_HEAD_INIT(NULL, 0)
	"unicodedata.UCD",		/*tp_name*/
	sizeof(PreviousDBVersion),	/*tp_basicsize*/
	0,			/*tp_itemsize*/
	/* methods */
	(destructor)PyObject_Del, /*tp_dealloc*/
	0,			/*tp_print*/
	0,                      /*tp_getattr*/
	0,			/*tp_setattr*/
	0,			/*tp_compare*/
	0,			/*tp_repr*/
	0,			/*tp_as_number*/
	0,			/*tp_as_sequence*/
	0,			/*tp_as_mapping*/
	0,			/*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        PyObject_GenericGetAttr,/*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        unicodedata_functions,  /*tp_methods*/
        DB_members,             /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
1231
1232PyDoc_STRVAR(unicodedata_docstring,
1233"This module provides access to the Unicode Character Database which\n\
1234defines character properties for all Unicode characters. The data in\n\
1235this database is based on the UnicodeData.txt file version\n\
12365.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
1237\n\
1238The module uses the same names and symbols as defined by the\n\
1239UnicodeData File Format 5.1.0 (see\n\
1240http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
1241
1242PyMODINIT_FUNC
1243initunicodedata(void)
1244{
1245    PyObject *m, *v;
1246
1247    Py_TYPE(&UCD_Type) = &PyType_Type;
1248
1249    m = Py_InitModule3(
1250        "unicodedata", unicodedata_functions, unicodedata_docstring);
1251    if (!m)
1252        return;
1253
1254    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1255    Py_INCREF(&UCD_Type);
1256    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1257
1258    /* Previous versions */
1259    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1260    if (v != NULL)
1261        PyModule_AddObject(m, "ucd_3_2_0", v);
1262
1263    /* Export C API */
1264    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
1265    if (v != NULL)
1266        PyModule_AddObject(m, "ucnhash_CAPI", v);
1267}
1268
1269/*
1270Local variables:
1271c-basic-offset: 4
1272indent-tabs-mode: nil
1273End:
1274*/
1275