# test_normalization.py revision f7a17b48d748e1835bcf9df86fb7fb318bb020f8
1from test.support import run_unittest, open_urlresource
2import unittest
3
4from http.client import HTTPException
5import sys
6import os
7from unicodedata import normalize, unidata_version
8
# Name of the data file this test reads.
TESTDATAFILE = "NormalizationTest.txt"
# Download location; the path component is the unicodedata database version.
TESTDATAURL = "http://www.unicode.org/Public/%s/ucd/%s" % (unidata_version,
                                                          TESTDATAFILE)
11
def check_version(testfile):
    """Return True if the first line of *testfile* names our Unicode version.

    Exactly one line is read from *testfile*.  ``unidata_version`` must
    appear in it as a complete dotted version number: a plain substring test
    would wrongly accept, for example, a "16.0.0" header when the database
    version is "6.0.0".

    Returns a bool; an empty header yields False.
    """
    import re  # function-scope import: only this check needs it

    hdr = testfile.readline()
    # Reject matches that are glued to further digits on either side
    # ("16.0.0", "6.0.01", "6.0.0.1") while still allowing a trailing
    # non-numeric suffix such as ".txt".
    pattern = r"(?<![0-9.])" + re.escape(unidata_version) + r"(?!\.?[0-9])"
    return re.search(pattern, hdr) is not None
15
class RangeError(Exception):
    """Raised by unistr() for a code point greater than sys.maxunicode."""
18
def NFC(str):
    """Return *str* normalized to Unicode normal form NFC."""
    # The parameter name shadows the builtin; it is kept for compatibility.
    form = "NFC"
    return normalize(form, str)
21
def NFKC(str):
    """Return *str* normalized to Unicode normal form NFKC."""
    # The parameter name shadows the builtin; it is kept for compatibility.
    form = "NFKC"
    return normalize(form, str)
24
def NFD(str):
    """Return *str* normalized to Unicode normal form NFD."""
    # The parameter name shadows the builtin; it is kept for compatibility.
    form = "NFD"
    return normalize(form, str)
27
def NFKD(str):
    """Return *str* normalized to Unicode normal form NFKD."""
    # The parameter name shadows the builtin; it is kept for compatibility.
    form = "NFKD"
    return normalize(form, str)
30
def unistr(data):
    """Build a string from space-separated hexadecimal code points.

    *data* is split on single spaces and every piece is parsed as a base-16
    integer (int() raises ValueError for a piece that is not valid hex).
    Raises RangeError if any resulting code point exceeds sys.maxunicode.
    """
    # Parse everything first so a malformed field is reported before any
    # range problem.
    code_points = [int(token, 16) for token in data.split(" ")]
    if any(cp > sys.maxunicode for cp in code_points):
        raise RangeError
    return "".join(map(chr, code_points))
37
class NormalizationTest(unittest.TestCase):
    """Check unicodedata.normalize() against the NormalizationTest.txt data.

    The data file is obtained through open_urlresource(); test_main is
    skipped when it cannot be retrieved.
    """

    def test_main(self):
        """Check every data line, then every character not listed in part 1."""
        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=check_version)
        except (OSError, HTTPException):
            self.skipTest("Could not retrieve " + TESTDATAURL)
        self.addCleanup(testdata.close)

        part1_chars = self._check_data_lines(testdata)
        self._check_unlisted_chars(part1_chars)

    def _check_data_lines(self, testdata):
        """Verify each data line of *testdata*.

        Returns the set of c1 strings (first column) seen while the current
        section marker is "@Part1".
        """
        part = None
        part1_chars = set()
        for line in testdata:
            # Drop a trailing '#' comment, then surrounding whitespace.
            line = line.partition('#')[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            fields = line.split(';')
            try:
                # The last ';'-separated field is discarded; the remaining
                # fields must be exactly the five columns c1..c5.
                c1, c2, c3, c4, c5 = [unistr(x) for x in fields[:-1]]
            except RangeError:
                # Skip unsupported characters;
                # try at least adding c1 if we are in part1
                if part == "@Part1":
                    try:
                        c1 = unistr(fields[0])
                    except RangeError:
                        pass
                    else:
                        part1_chars.add(c1)
                continue

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            # Record part 1 data
            if part == "@Part1":
                part1_chars.add(c1)
        return part1_chars

    def _check_unlisted_chars(self, part1_chars):
        """Assert that every character not in *part1_chars* is unchanged by
        all four normalization forms."""
        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_chars:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_bug_834676(self):
        """Normalizing this string must complete without raising."""
        # Check for bug 834676
        normalize('NFC', '\ud55c\uae00')
98
99
def test_main():
    """Run the NormalizationTest cases through run_unittest()."""
    test_classes = (NormalizationTest,)
    run_unittest(*test_classes)


# Run the tests when this file is executed as a script.
if __name__ == "__main__":
    test_main()
105