test_normalization.py revision 419e23cbb07a624609a219919eaecd3c61d8e9b4
1from test.test_support import run_unittest, open_urlresource
2import unittest
3
4from httplib import HTTPException
5import sys
6import os
7from unicodedata import normalize, unidata_version
8
# Name of the normalization data file, and the URL of the copy published for
# the Unicode database version reported by the unicodedata module.
TESTDATAFILE = "NormalizationTest.txt"
TESTDATAURL = "".join(["http://www.unicode.org/Public/",
                       unidata_version,
                       "/ucd/",
                       TESTDATAFILE])
11
def check_version(testfile):
    """Tell whether the first line of *testfile* mentions unidata_version.

    Exactly one line is read (and therefore consumed) from *testfile*.
    Returns True when the unicodedata module's version string occurs
    anywhere in that line, False otherwise.
    """
    return unidata_version in testfile.readline()
15
class RangeError(Exception):
    """Raised by unistr() for a code point greater than sys.maxunicode."""
18
def NFC(str):
    """Return *str* converted to Unicode normalization form NFC."""
    # The parameter name shadows the builtin but is part of the signature.
    form = "NFC"
    return normalize(form, str)
21
def NFKC(str):
    """Return *str* converted to Unicode normalization form NFKC."""
    # The parameter name shadows the builtin but is part of the signature.
    form = "NFKC"
    return normalize(form, str)
24
def NFD(str):
    """Return *str* converted to Unicode normalization form NFD."""
    # The parameter name shadows the builtin but is part of the signature.
    form = "NFD"
    return normalize(form, str)
27
def NFKD(str):
    """Return *str* converted to Unicode normalization form NFKD."""
    # The parameter name shadows the builtin but is part of the signature.
    form = "NFKD"
    return normalize(form, str)
30
def unistr(data):
    """Build a unicode string from space-separated hexadecimal code points.

    Every field of *data* is parsed with int(field, 16) before anything
    else happens, so a malformed field raises ValueError first.

    Raises RangeError if any parsed value is greater than sys.maxunicode.
    """
    codepoints = [int(field, 16) for field in data.split(" ")]
    # Validate the complete list before converting a single character.
    if any(cp > sys.maxunicode for cp in codepoints):
        raise RangeError
    return u"".join(map(unichr, codepoints))
37
class NormalizationTest(unittest.TestCase):
    """Exercise unicodedata.normalize() with the data file at TESTDATAURL.

    test_main() checks all four normalization forms for every data line of
    the file, then checks that every code point not listed in its @Part1
    section is left unchanged by all of them.
    """

    def test_main(self):
        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, check_version)
        except (IOError, HTTPException):
            self.skipTest("Could not retrieve " + TESTDATAURL)
        # Release the data file even when an assertion fails part-way
        # through, instead of leaving it open until garbage collection.
        try:
            self._run_normalization_tests(testdata)
        finally:
            testdata.close()

    def _run_normalization_tests(self, testdata):
        # Helper for test_main(): check each line of the open data file
        # *testdata*, then every code point that @Part1 did not mention.
        part = None
        # First-column strings of the @Part1 lines; only membership matters.
        part1_data = set()
        for line in testdata:
            # Strip trailing comments and skip lines that are empty after it.
            if '#' in line:
                line = line.split('#')[0]
            line = line.strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            try:
                # The field after the final ';' is dropped.  The remaining
                # five are the source string followed by the values the
                # assertions below expect for NFC, NFD, NFKC and NFKD.
                c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
            except RangeError:
                # Skip unsupported characters;
                # try at least adding c1 if we are in part1
                if part == "@Part1":
                    try:
                        part1_data.add(unistr(line.split(';')[0]))
                    except RangeError:
                        # c1 itself is out of range for this build, so the
                        # final loop can never produce it: nothing to record.
                        pass
                continue

            # Perform tests
            self.assertTrue(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
            self.assertTrue(c4 ==  NFC(c4) ==  NFC(c5), line)
            self.assertTrue(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
            self.assertTrue(c5 ==  NFD(c4) ==  NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            # Record part 1 data
            if part == "@Part1":
                part1_data.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode+1):
            X = unichr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_bug_834676(self):
        # Check for bug 834676: the call only has to complete without
        # raising; its result is not inspected.
        normalize('NFC', u'\ud55c\uae00')
96
97
def test_main():
    """Run NormalizationTest through test.test_support.run_unittest()."""
    test_classes = (NormalizationTest,)
    run_unittest(*test_classes)
100
# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()
103