# test_normalization.py revision d2171d2ba414def2ecf27b694ea27c2e9fde0fcf
1from test.test_support import verbose, TestFailed, TestSkipped, verify
2import sys
3import os
4from unicodedata import normalize
5
# File name of the normalization test data ("NormalizationTest-3.2.0.txt"
# when os.extsep is ".").
TESTDATAFILE = os.extsep.join(["NormalizationTest-3.2.0", "txt"])

# Look for the data file in the current directory first and then in its
# parent.  This lets a build directory located just inside the source
# directory share the single copy of the test data kept in the source
# tree, instead of needing a copy per build directory.
# There might be a better way to do this.

# Stays true unless the data file is found in one of the candidates.
skip_expected = True
for path in (os.path.curdir, os.path.pardir):
    fn = os.path.join(path, TESTDATAFILE)
    if os.path.exists(fn):
        # Remember where the file was found and stop searching.
        TESTDATAFILE = fn
        skip_expected = False
        break
19
class RangeError(Exception):
    """Raised by unistr() when a code point exceeds sys.maxunicode.

    Derives from Exception so that it is a proper exception class rather
    than a bare class that merely happens to be raised.
    """
22
def NFC(str):
    """Return str normalized to form "NFC" via unicodedata.normalize."""
    form = "NFC"
    return normalize(form, str)
25
def NFKC(str):
    """Return str normalized to form "NFKC" via unicodedata.normalize."""
    form = "NFKC"
    return normalize(form, str)
28
def NFD(str):
    """Return str normalized to form "NFD" via unicodedata.normalize."""
    form = "NFD"
    return normalize(form, str)
31
def NFKD(str):
    """Return str normalized to form "NFKD" via unicodedata.normalize."""
    form = "NFKD"
    return normalize(form, str)
34
def unistr(data):
    """Build a unicode string from space-separated hexadecimal code points.

    data is split on single spaces and every piece is parsed with
    int(piece, 16).  Raises RangeError if any resulting code point is
    larger than sys.maxunicode.
    """
    # Parse every field first, so a malformed field is reported before
    # any range check takes place.
    codepoints = []
    for field in data.split(" "):
        codepoints.append(int(field, 16))
    # split() always yields at least one field, so max() has input here.
    if max(codepoints) > sys.maxunicode:
        raise RangeError
    return u"".join(map(unichr, codepoints))
41
def test_main():
    """Check unicodedata.normalize against the data in TESTDATAFILE.

    Each data line holds ';'-separated columns of space-separated
    hexadecimal code points; the first five are read as c1..c5 and the
    NFC, NFD, NFKC and NFKD results of those columns are compared with
    the expected column.  Text after a '#' is ignored, and lines starting
    with "@Part" name the part that the following data lines belong to.

    Afterwards every code point up to sys.maxunicode that did not occur
    as c1 in "@Part1" is checked to be left unchanged by all four forms.

    Raises TestSkipped if the data file was not found by the module-level
    search, and TestFailed (through verify) when a check does not hold.
    """
    if skip_expected:
        raise TestSkipped(TESTDATAFILE + " not found, download from " +
                    "http://www.unicode.org/Public/3.2-Update/" + TESTDATAFILE)

    part1_data = {}
    # No "@Part" header seen yet.  Initialised up front so that a data line
    # preceding the first header cannot cause a NameError further down.
    part = None

    datafile = open(TESTDATAFILE)
    try:
        for line in datafile:
            # Drop any trailing comment, then surrounding whitespace.
            line = line.split('#')[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line
                continue
            try:
                # The line ends with ';', so the last split item is dropped.
                c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
            except RangeError:
                # Skip unsupported characters
                continue

            if verbose:
                print(line)

            # Perform tests
            verify(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
            verify(c4 ==  NFC(c4) ==  NFC(c5), line)
            verify(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
            verify(c5 ==  NFD(c4) ==  NFD(c5), line)
            verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5),
                   line)
            verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5),
                   line)

            # Record part 1 data
            if part == "@Part1":
                part1_data[c1] = 1
    finally:
        # Release the file handle even when a check above fails.
        datafile.close()

    # Perform tests for all other data
    for c in range(sys.maxunicode+1):
        X = unichr(c)
        if X in part1_data:
            continue
        # verify() instead of a bare assert: the check must still run
        # under -O, and failures are reported like the ones above.
        verify(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), "U+%04X" % c)

    # Check for bug 834676
    normalize('NFC',u'\ud55c\uae00')
89
# Run the test when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()
92