test_normalization.py revision 84fc66dd020931c14be8b13fcbdb9a8f295141c9
1from test.test_support import run_unittest, open_urlresource
2import unittest
3
4import sys
5import os
6from unicodedata import normalize
7
# File name of the normalization test data, built with the platform's
# extension separator.
TESTDATAFILE = os.extsep.join(("NormalizationTest", "txt"))
# URL from which the test data file is fetched.
TESTDATAURL = "http://www.unicode.org/Public/4.1.0/ucd/" + TESTDATAFILE
10
class RangeError(Exception):
    """Raised by unistr() for a code point greater than sys.maxunicode."""
13
def NFC(str):
    """Return *str* normalized to form "NFC" by unicodedata.normalize."""
    # The parameter name shadows the builtin ``str``; it is kept because it
    # is part of this function's signature.
    normalized = normalize("NFC", str)
    return normalized
16
def NFKC(str):
    """Return *str* normalized to form "NFKC" by unicodedata.normalize."""
    # The parameter name shadows the builtin ``str``; it is kept because it
    # is part of this function's signature.
    normalized = normalize("NFKC", str)
    return normalized
19
def NFD(str):
    """Return *str* normalized to form "NFD" by unicodedata.normalize."""
    # The parameter name shadows the builtin ``str``; it is kept because it
    # is part of this function's signature.
    normalized = normalize("NFD", str)
    return normalized
22
def NFKD(str):
    """Return *str* normalized to form "NFKD" by unicodedata.normalize."""
    # The parameter name shadows the builtin ``str``; it is kept because it
    # is part of this function's signature.
    normalized = normalize("NFKD", str)
    return normalized
25
def unistr(data):
    """Convert space-separated hexadecimal code points into a string.

    *data* is split on single spaces and every piece is parsed as a base-16
    integer.  Raises RangeError if any resulting code point is greater than
    sys.maxunicode; otherwise returns the characters joined in order.
    """
    code_points = [int(token, 16) for token in data.split(" ")]
    # Validate the whole sequence before building any characters.
    if any(cp > sys.maxunicode for cp in code_points):
        raise RangeError
    return "".join(map(chr, code_points))
32
class NormalizationTest(unittest.TestCase):
    """Check unicodedata.normalize against the normalization test data file."""

    def test_main(self):
        """Verify all four normalization forms against the test data.

        Each data line holds semicolon-separated columns c1..c5, each a
        sequence of space-separated hexadecimal code points.  Lines in the
        "@Part3" section are skipped, as are lines containing a code point
        above sys.maxunicode.  Afterwards, every code point whose character
        was not recorded as c1 of a "@Part1" line must be left unchanged by
        NFC, NFD, NFKC and NFKD.
        """
        # Characters that appeared as column 1 of a "@Part1" line.
        part1_data = set()
        # Current "@PartN" section.  Starts as None so that a data line
        # appearing before the first header does not raise UnboundLocalError.
        part = None

        testdata = open_urlresource(TESTDATAURL)
        try:
            for line in testdata:
                # Drop any trailing comment and surrounding whitespace.
                line = line.split('#')[0].strip()
                if not line:
                    continue
                if line.startswith("@Part"):
                    part = line.split()[0]
                    continue
                if part == "@Part3":
                    # XXX we don't support PRI #29 yet, so skip these tests for now
                    continue
                try:
                    # The line ends with ';', so the last split field is dropped.
                    c1, c2, c3, c4, c5 = [unistr(x)
                                          for x in line.split(';')[:-1]]
                except RangeError:
                    # Skip unsupported characters;
                    # try at least adding c1 if we are in part1
                    if part == "@Part1":
                        try:
                            c1 = unistr(line.split(';')[0])
                        except RangeError:
                            pass
                        else:
                            part1_data.add(c1)
                    continue

                # Perform tests
                self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
                self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
                self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
                self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
                self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                                NFKC(c3) == NFKC(c4) == NFKC(c5),
                                line)
                self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                                NFKD(c3) == NFKD(c4) == NFKD(c5),
                                line)

                # Record part 1 data
                if part == "@Part1":
                    part1_data.add(c1)
        finally:
            # Release the data file even when an assertion above fails.
            testdata.close()

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_bug_834676(self):
        """Check for bug 834676: this normalize() call must not raise."""
        normalize('NFC', '\ud55c\uae00')
88
89
def test_main():
    """Run NormalizationTest after checking that the test data can be opened."""
    # Hit the exception early: if the data file cannot be obtained, the
    # failure surfaces here rather than inside the test case.  Only the act
    # of opening matters, so the handle is closed right away instead of
    # being left open.
    open_urlresource(TESTDATAURL).close()
    run_unittest(NormalizationTest)
94
# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()
97