# test_normalization.py revision 1b445d3fcfcc06e5360e83b978efdb9b1c980278
1from test.test_support import verbose, TestFailed, TestSkipped, verify
2import sys
3import os
4from unicodedata import normalize
5
# Name of the normalization test data file.  The name is relative, so it is
# looked up in the current working directory.
TESTDATAFILE = "NormalizationTest.txt"
# True when the data file is absent; test_main() raises TestSkipped in that case.
skip_expected = not os.path.exists(TESTDATAFILE)
8
class RangeError(Exception):
    """Raised by unistr() when a code point is larger than sys.maxunicode.

    Derives from Exception so that it is a proper exception class: raising
    a class that does not inherit from an exception base is deprecated in
    later Python 2 releases and is a TypeError in Python 3.
    """
11
def NFC(str):
    """Return *str* normalized to Unicode normalization form NFC.

    The parameter name shadows the builtin ``str``; it is kept because it
    is part of the function's signature.
    """
    composed = normalize("NFC", str)
    return composed
14
def NFKC(str):
    """Return *str* normalized to Unicode normalization form NFKC.

    The parameter name shadows the builtin ``str``; it is kept because it
    is part of the function's signature.
    """
    compat_composed = normalize("NFKC", str)
    return compat_composed
17
def NFD(str):
    """Return *str* normalized to Unicode normalization form NFD.

    The parameter name shadows the builtin ``str``; it is kept because it
    is part of the function's signature.
    """
    decomposed = normalize("NFD", str)
    return decomposed
20
def NFKD(str):
    """Return *str* normalized to Unicode normalization form NFKD.

    The parameter name shadows the builtin ``str``; it is kept because it
    is part of the function's signature.
    """
    compat_decomposed = normalize("NFKD", str)
    return compat_decomposed
23
def unistr(data):
    """Build a unicode string from space-separated hexadecimal code points.

    data -- text such as "0044 0307"; fields are split on single spaces
            and each field is parsed with int(field, 16).

    Raises RangeError when any parsed code point exceeds sys.maxunicode.
    """
    # Parse every field before range-checking, so a malformed field raises
    # ValueError even when another field is out of range.
    codepoints = [int(field, 16) for field in data.split(" ")]
    # split(" ") always yields at least one field, so max() never receives
    # an empty sequence here.
    if max(codepoints) > sys.maxunicode:
        raise RangeError
    return u"".join(map(unichr, codepoints))
30
def test_main():
    """Check unicodedata.normalize against the NormalizationTest.txt data.

    Each data line is split on ';' into five columns c1..c5 of hexadecimal
    code points.  The equalities between those columns and their NFC, NFD,
    NFKC and NFKD forms are checked with verify().  Afterwards every code
    point up to sys.maxunicode that did not occur as c1 in the "@Part1"
    section is checked to be left unchanged by all four forms.

    Raises TestSkipped when the data file is not present.  Failed checks
    are reported through verify().
    """
    if skip_expected:
        raise TestSkipped(TESTDATAFILE + " not found, download from " +
                    "http://www.unicode.org/Public/UNIDATA/" + TESTDATAFILE)

    # Close the file explicitly rather than relying on garbage collection;
    # try/finally keeps this working on interpreters without "with".
    datafile = open(TESTDATAFILE)
    try:
        data = datafile.readlines()
    finally:
        datafile.close()

    # No section header seen yet.  Initialising here avoids a NameError if
    # a data line appears before the first "@Part" line.
    part = None
    part1_data = {}
    for line in data:
        # Drop any trailing '#' comment and surrounding whitespace.
        line = line.split('#')[0].strip()
        if not line:
            continue
        if line.startswith("@Part"):
            part = line
            continue
        try:
            # The item after the final ';' is discarded by the [:-1] slice.
            c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
        except RangeError:
            # Skip unsupported characters
            continue

        if verbose:
            print(line)

        # Perform tests
        verify(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
        verify(c4 ==  NFC(c4) ==  NFC(c5), line)
        verify(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
        verify(c5 ==  NFD(c4) ==  NFD(c5), line)
        verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5),
               line)
        verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5),
               line)

        # Record part 1 data
        if part == "@Part1":
            part1_data[c1] = 1

    # Perform tests for all other data
    for c in range(sys.maxunicode+1):
        X = unichr(c)
        if X in part1_data:
            continue
        # Use verify() like the checks above: a bare assert would be
        # compiled away when Python runs with -O.
        verify(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
77
# Run the test directly when this file is executed as a script.
if __name__ == "__main__":
    test_main()
80