test_normalization.py revision 7e05e7d3b64b3874b8086e607601edd0f46666bd
1from test.support import run_unittest, open_urlresource
2import unittest
3
4import sys
5import os
6from unicodedata import normalize, unidata_version
7
TESTDATAFILE = "NormalizationTest.txt"
TESTDATAURL = "http://www.unicode.org/Public/" + unidata_version + "/ucd/" + TESTDATAFILE

# Verify we have the correct version of the test data file: the first line of
# a local copy must mention the Unicode version reported by unicodedata.
# A copy whose first line does not mention it is deleted.
TESTDATAPATH = os.path.join(os.path.dirname(__file__), "data", TESTDATAFILE)
if os.path.exists(TESTDATAPATH):
    # Only the first line is needed; the context manager closes the file
    # even if reading fails.
    with open(TESTDATAPATH, encoding='utf-8') as f:
        l = f.readline()
    if unidata_version not in l:
        # Remove the very path that was just inspected.
        os.unlink(TESTDATAPATH)
19
class RangeError(Exception):
    """Raised by unistr() for a code point greater than sys.maxunicode."""
22
def NFC(str):
    """Return *str* converted to normalization form NFC."""
    form = "NFC"
    return normalize(form, str)
25
def NFKC(str):
    """Return *str* converted to normalization form NFKC."""
    form = "NFKC"
    return normalize(form, str)
28
def NFD(str):
    """Return *str* converted to normalization form NFD."""
    form = "NFD"
    return normalize(form, str)
31
def NFKD(str):
    """Return *str* converted to normalization form NFKD."""
    form = "NFKD"
    return normalize(form, str)
34
def unistr(data):
    """Build a string from space-separated hexadecimal code points.

    Raises RangeError when any code point is greater than sys.maxunicode.
    """
    # Parse every field before range-checking, so a malformed field raises
    # ValueError ahead of any RangeError.
    codepoints = [int(field, 16) for field in data.split(" ")]
    if any(cp > sys.maxunicode for cp in codepoints):
        raise RangeError
    return "".join(map(chr, codepoints))
41
class NormalizationTest(unittest.TestCase):
    """Check unicodedata.normalize against the NormalizationTest.txt data."""

    def test_main(self):
        """Verify the normalization invariants for each line of the data file.

        Every data line holds ';'-separated columns c1..c5, each a sequence
        of space-separated hexadecimal code points.  Lines belonging to
        @Part3 are skipped.  Once the file has been processed, every code
        point that was not recorded as a @Part1 c1 value must be left
        unchanged by all four normalization forms.

        The test is skipped when the data file cannot be retrieved.
        """
        # Marker of the section currently being read.  It stays None until
        # the first "@Part" line, so a data line that appears before any
        # marker is compared against None instead of an unbound name.
        part = None
        part1_data = set()
        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8")
        except IOError:
            self.skipTest("Could not retrieve " + TESTDATAURL)
        # The resource is opened once and closed when the loop is left,
        # including when an assertion fails part-way through.
        with testdata:
            for line in testdata:
                # Drop a trailing comment, then surrounding whitespace.
                line = line.partition('#')[0].strip()
                if not line:
                    continue
                if line.startswith("@Part"):
                    part = line.split()[0]
                    continue
                if part == "@Part3":
                    # XXX we don't support PRI #29 yet, so skip these tests for now
                    continue
                try:
                    # The text after the last ';' is dropped; the five
                    # fields before it are the columns.
                    c1, c2, c3, c4, c5 = [unistr(x) for x in line.split(';')[:-1]]
                except RangeError:
                    # Skip unsupported characters;
                    # try at least adding c1 if we are in part1
                    if part == "@Part1":
                        try:
                            c1 = unistr(line.split(';')[0])
                        except RangeError:
                            pass
                        else:
                            part1_data.add(c1)
                    continue

                # Perform tests
                self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
                self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
                self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
                self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
                self.assertTrue(
                    c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5),
                    line)
                self.assertTrue(
                    c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5),
                    line)

                # Record part 1 data
                if part == "@Part1":
                    part1_data.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_bug_834676(self):
        """Check for bug 834676: NFC-normalizing this input must not raise."""
        normalize('NFC', '\ud55c\uae00')
102
103
def test_main():
    """Run the NormalizationTest case through run_unittest."""
    test_classes = (NormalizationTest,)
    run_unittest(*test_classes)


# Run the tests when this file is executed as a script.
if __name__ == "__main__":
    test_main()
109