test_normalization.py revision 677bde2dd14ac2c8f170779adcc732f991db8bd6
1from test.test_support import verbose, TestFailed, TestSkipped, verify
2import sys
3from unicodedata import normalize
# Load the conformance data file from the current directory.  The test is
# skipped (not failed) when the file is unavailable.
try:
    _testfile = open("NormalizationTest.txt", "r")
    try:
        data = _testfile.readlines()
    finally:
        # Close the handle explicitly instead of relying on garbage
        # collection to release it.
        _testfile.close()
except IOError:
    raise TestSkipped("NormalizationTest.txt not found, download from http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt")
8
class RangeError(Exception):
    """Raised by unistr() when a code point exceeds sys.maxunicode.

    Derives from Exception so that it can be raised and caught like any
    other exception class.
    """
11
def NFC(str):
    """Return `str` converted to normal form NFC by unicodedata.normalize."""
    form = "NFC"
    return normalize(form, str)
14
def NFKC(str):
    """Return `str` converted to normal form NFKC by unicodedata.normalize."""
    form = "NFKC"
    return normalize(form, str)
17
def NFD(str):
    """Return `str` converted to normal form NFD by unicodedata.normalize."""
    form = "NFD"
    return normalize(form, str)
20
def NFKD(str):
    """Return `str` converted to normal form NFKD by unicodedata.normalize."""
    form = "NFKD"
    return normalize(form, str)
23
def unistr(data):
    """Build a unicode string from space-separated hexadecimal code points.

    `data` is split on single spaces and every piece is parsed as a
    base-16 integer.  RangeError is raised if any resulting code point is
    greater than sys.maxunicode; otherwise the characters are joined, in
    order, into one unicode string.
    """
    codepoints = [int(field, 16) for field in data.split(" ")]
    # All fields are parsed before the range check, so a malformed field
    # is reported ahead of an out-of-range one.
    if max(codepoints) > sys.maxunicode:
        raise RangeError
    return u"".join(map(unichr, codepoints))
30
# Keys are the c1 strings of every tested line in the "@Part1" section;
# the values are unused.
part1_data = {}
# Most recent "@Part..." header line.  Starts as None so that a data line
# appearing before any header cannot raise NameError at the check below.
part = None
for line in data:
    # Drop any trailing '#' comment, then surrounding whitespace.
    line = line.split('#', 1)[0].strip()
    if not line:
        continue
    if line.startswith("@Part"):
        part = line
        continue
    try:
        # The slice discards whatever follows the last ';'; the remaining
        # fields must number exactly five or the unpacking fails.
        c1, c2, c3, c4, c5 = [unistr(x) for x in line.split(';')[:-1]]
    except RangeError:
        # Skip unsupported characters
        continue

    if verbose:
        print(line)

    # Perform tests; the data line is passed to verify() as the failure
    # reason.
    verify(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
    verify(c4 ==  NFC(c4) ==  NFC(c5), line)
    verify(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
    verify(c5 ==  NFD(c4) ==  NFD(c5), line)
    verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5), line)
    verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5), line)

    # Record part 1 data
    if part == "@Part1":
        part1_data[c1] = 1
61
# Perform tests for all other data: every code point not recorded in
# part1_data must be left unchanged by all four normalization forms.
# xrange avoids building a list of sys.maxunicode+1 integers up front.
for c in xrange(sys.maxunicode+1):
    X = unichr(c)
    if X in part1_data:
        continue
    # Use verify() rather than a bare assert: assert statements are removed
    # under -O, which would silently disable this check.  The code point
    # number is passed as the failure reason.
    verify(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
68
69