/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 4.1 data base.

   Data was extracted from the Unicode 4.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"

/* character properties */

typedef struct {
    const unsigned char category;         /* index into
                                             _PyUnicode_CategoryNames */
    const unsigned char combining;        /* combining class value 0 - 255 */
    const unsigned char bidirectional;    /* index into
                                             _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;         /* true if mirrored in bidir mode */
    const unsigned char east_asian_width; /* index into
                                             _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;

typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
} change_record;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

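/* Look up the property record for a code point.  The generated database
   is a two-level trie: index1 is indexed by the high bits of the code
   point, and its entry, combined with the low SHIFT bits, selects an
   entry in index2, which in turn is the index of the shared property
   record.  Code points outside the Unicode range map to record 0
   (unassigned). */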
static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}

static const _PyUnicode_DatabaseRecord*
_getrecord(PyUnicodeObject* v)
{
    return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
}

/* ------------- Previous-version API ------------------------------------- */
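/* A PreviousDBVersion object exposes the same methods as the module
   itself, but reports the data of an older Unicode version by overlaying
   the change records and normalization deltas generated by
   makeunicodedata.py.  The module-level functions below take the object
   (or NULL for the current data) as their self argument.  The instance
   unicodedata.ucd_3_2_0, created in initunicodedata(), provides Unicode
   3.2.0 behaviour, which the stringprep module relies on. */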
typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;

#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))

static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;

static PyObject*
new_previous_version(const char *name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}

/* --- Module API --------------------------------------------------------- */

PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}

PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}

PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->category;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
}

PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional category assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");

static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->bidirectional;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}

PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");

static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->combining;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");

static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->mirrored;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}

PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the east asian width assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->east_asian_width;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}

PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");

static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;
    unsigned int prefix_index;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    code = (int) *PyUnicode_AS_UNICODE(v);

    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                             (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}

static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else if (self && get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}

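/* Hangul syllable constants (see the conjoining jamo section of the
   Unicode standard).  A precomposed syllable S decomposes arithmetically:

       SIndex = S - SBase
       L = LBase + SIndex / NCount             (leading consonant)
       V = VBase + (SIndex % NCount) / TCount  (vowel)
       T = TBase + SIndex % TCount             (trailing consonant;
                                                TBase itself means "none")

   For example, U+AC01 gives SIndex = 1, hence L = U+1100, V = U+1161,
   T = U+11A8. */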
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)

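/* Decompose a string to NFD (k == 0) or NFKD (k == 1).  Each input
   character is pushed on a small stack; a popped character is either
   copied to the output (if it has no applicable decomposition) or
   replaced by its decomposition, pushed back in reverse order.  Hangul
   syllables are decomposed arithmetically.  A final pass sorts combining
   marks into canonical order. */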
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    /* Overallocate by at most 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while(stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1)
                    return NULL;
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {
                    *o++ = T;
                    space--;
                }
                continue;
            }
            /* normalization changes */
            if (self) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order.  */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically. */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}

static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
{
    int index;
    for (index = 0; nfc[index].start; index++) {
        int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}

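/* Compose a string to NFC (k == 0) or NFKC (k == 1).  The input is first
   decomposed with nfd_nfkd(); starters are then combined in-place with
   following unblocked characters, using the nfc_first/nfc_last reindex
   tables to index the comp_data table.  Hangul LV/LVT syllables are
   composed arithmetically.  Characters consumed by a composition are
   remembered in `skipped' and dropped from the output. */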
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f,l,index,index1,comb;
    Py_UNICODE code;
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
      for (index = 0; index < cskipped; index++) {
          if (skipped[index] == i) {
              /* *i character is skipped.
                 Remove from list. */
              skipped[index] = skipped[cskipped-1];
              cskipped--;
              i++;
              goto again; /* continue while */
          }
      }
      /* Hangul Composition. We don't need to check for <LV,T>
         pairs, since we always have decomposed data. */
      if (LBase <= *i && *i < (LBase+LCount) &&
          i + 1 < end &&
          VBase <= i[1] && i[1] < (VBase+VCount)) {
          int LIndex, VIndex;
          LIndex = i[0] - LBase;
          VIndex = i[1] - VBase;
          code = SBase + (LIndex*VCount+VIndex)*TCount;
          i+=2;
          if (i < end &&
              TBase < *i && *i < (TBase+TCount)) {
              code += *i-TBase;
              i++;
          }
          *o++ = code;
          continue;
      }

      f = find_nfc_index(self, nfc_first, *i);
      if (f == -1) {
          *o++ = *i++;
          continue;
      }
      /* Find next unblocked character. */
      i1 = i+1;
      comb = 0;
      while (i1 < end) {
          int comb1 = _getrecord_ex(*i1)->combining;
          if (comb1 && comb == comb1) {
              /* Character is blocked. */
              i1++;
              continue;
          }
          l = find_nfc_index(self, nfc_last, *i1);
          /* *i1 cannot be combined with *i. If *i1
             is a starter, we don't need to look further.
             Otherwise, record the combining class. */
          if (l == -1) {
            not_combinable:
              if (comb1 == 0)
                  break;
              comb = comb1;
              i1++;
              continue;
          }
          index = f*TOTAL_LAST + l;
          index1 = comp_index[index >> COMP_SHIFT];
          code = comp_data[(index1<<COMP_SHIFT)+
                           (index&((1<<COMP_SHIFT)-1))];
          if (code == 0)
              goto not_combinable;

          /* Replace the original character. */
          *i = code;
          /* Mark the second character unused. */
          skipped[cskipped++] = i1;
          i1++;
          f = find_nfc_index(self, nfc_first, *i);
          if (f == -1)
              break;
      }
      *o++ = *i++;
    }
    if (o != end)
        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    return result;
}

PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr.  Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");

static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    if (!PyArg_ParseTuple(args, "sO!:normalize",
                          &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_GetSize(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0)
        return nfc_nfkc(self, input, 0);
    if (strcmp(form, "NFKC") == 0)
        return nfc_nfkc(self, input, 1);
    if (strcmp(form, "NFD") == 0)
        return nfd_nfkd(self, input, 0);
    if (strcmp(form, "NFKD") == 0)
        return nfd_nfkd(self, input, 1);
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}

/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

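/* Hash a character name, case-insensitively, for the name->code hash
   table below.  The multiplier `scale' (code_magic) and the table size
   (code_size) come from the generated unicodename_db.h. */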
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}

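/* Romanized jamo names used to build and parse "HANGUL SYLLABLE ..."
   character names: column 0 holds the leading consonants, column 1 the
   vowels, column 2 the (optional) trailing consonants. */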
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};

static int
is_unified_ideograph(Py_UCS4 code)
{
    return (
        (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
}

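/* Write the name of the given code point into buffer (of size buflen).
   Returns 0 on failure (unassigned character or buffer too small).
   Hangul syllable and CJK unified ideograph names are generated
   algorithmically; everything else is looked up in the phrasebook,
   which stores each name as a sequence of indices into a word lexicon. */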
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                               (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon.  the last character in the
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}

static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (toupper(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}

static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        char *s = hangul_syllables[i][column];
        len1 = strlen(s);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}

static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}

static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};

/* -------------------------------------------------------------------- */
/* Python bindings */

PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");

static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
                    name, sizeof(name))) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return Py_BuildValue("s", name);
}

PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name.  If a character with the\n\
given name is found, return the corresponding Unicode\n\
character.  If not found, KeyError is raised.");

static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[1];
    char errbuf[256];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code)) {
        /* XXX(nnorwitz): why are we allocating for the error msg?
           Why not always use snprintf? */
        char fmt[] = "undefined character name '%s'";
        char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
        if (buf)
            sprintf(buf, fmt, name);
        else {
            buf = errbuf;
            PyOS_snprintf(buf, sizeof(errbuf), fmt, name);
        }
        PyErr_SetString(PyExc_KeyError, buf);
        if (buf != errbuf)
            PyMem_FREE(buf);
        return NULL;
    }

    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}

/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}            /* sentinel */
};

static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyObject_HEAD_INIT(NULL)
    0,                      /*ob_size*/
    "unicodedata.UCD",      /*tp_name*/
    sizeof(PreviousDBVersion), /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del, /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    PyObject_GenericGetAttr,/*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    unicodedata_functions,  /*tp_methods*/
    DB_members,             /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};

PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
4.1.0 which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 4.1.0 (see\n\
http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
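
/* Example usage from Python:

       >>> import unicodedata
       >>> unicodedata.name(u'/')
       'SOLIDUS'
       >>> unicodedata.decimal(u'9')
       9
       >>> unicodedata.category(u'A')
       'Lu'
       >>> unicodedata.lookup('LATIN SMALL LETTER A')
       u'a'
*/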

PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    UCD_Type.ob_type = &PyType_Type;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}

/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/
1224