test_normalization.py revision 77c06fbf942bf4c532d0f8d6f254882a9e5957ec
1677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwisfrom test.test_support import verbose, TestFailed, TestSkipped, verify
2677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwisimport sys
3677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwisfrom unicodedata import normalize
# Load the conformance data file from the current directory.  If it cannot
# be opened or read, the whole test is skipped rather than failed.
try:
    testfile = open("NormalizationTest.txt","r")
    try:
        data = testfile.readlines()
    finally:
        # Close the file explicitly instead of leaving it to the garbage
        # collector.
        testfile.close()
except IOError:
    raise TestSkipped("NormalizationTest.txt not found, download from http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt")
8677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
class RangeError(Exception):
    """Raised by unistr() when a code point exceeds sys.maxunicode.

    Derives from Exception so that it can be raised and caught like any
    other exception instead of relying on classic-class exceptions.
    """
11677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFC(str):
    """Return str normalized to form NFC by unicodedata.normalize."""
    form = "NFC"
    return normalize(form, str)
14677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFKC(str):
    """Return str normalized to form NFKC by unicodedata.normalize."""
    form = "NFKC"
    return normalize(form, str)
17677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFD(str):
    """Return str normalized to form NFD by unicodedata.normalize."""
    form = "NFD"
    return normalize(form, str)
20677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFKD(str):
    """Return str normalized to form NFKD by unicodedata.normalize."""
    form = "NFKD"
    return normalize(form, str)
23677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def unistr(data):
    """Convert space-separated hexadecimal code points to a unicode string.

    data is split on single spaces and every piece is parsed as a base-16
    integer.  Raises RangeError if any of the resulting code points is
    larger than sys.maxunicode; otherwise returns the unicode string made
    of the corresponding characters, in order.
    """
    # Parse every field first, so a malformed field is reported before any
    # range check takes place.
    codepoints = [int(field, 16) for field in data.split(" ")]
    # split() always yields at least one piece, so the list is never empty
    # here and max() is safe.
    if max(codepoints) > sys.maxunicode:
        raise RangeError
    return u"".join(map(unichr, codepoints))
30677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
# Characters that appear as the first column (c1) of a "@Part1" test line.
# The dict is used as a set: only its keys matter.
part1_data = {}
# Most recent "@Part..." header seen.  Initialised to None so that a data
# line appearing before any header cannot raise a NameError further down.
part = None
for line in data:
    # Strip a trailing '#' comment, then surrounding whitespace; skip lines
    # that are empty afterwards.
    if '#' in line:
        line = line.split('#')[0]
    line = line.strip()
    if not line:
        continue
    if line.startswith("@Part"):
        part = line
        continue
    try:
        # The five columns are ';'-separated; whatever follows the final ';'
        # is discarded.
        c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
    except RangeError:
        # Skip unsupported characters
        continue

    if verbose:
        # Parenthesised form prints the same text with the print statement.
        print(line)

    # Perform tests; the line itself is passed to verify as the failure
    # message.
    verify(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
    verify(c4 ==  NFC(c4) ==  NFC(c5), line)
    verify(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
    verify(c5 ==  NFD(c4) ==  NFD(c5), line)
    verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5), line)
    verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5), line)

    # Record part 1 data
    if part == "@Part1":
        part1_data[c1] = 1
61677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
# Perform tests for all other data: every character that was not recorded
# in part1_data must be left unchanged by all four normalization forms.
for c in range(sys.maxunicode+1):
    X = unichr(c)
    if X in part1_data:
        continue
    # Use verify() rather than a bare assert: assert statements are removed
    # under -O, which would turn this check into a no-op, and verify() is
    # what the per-line checks in this file use.  The code point is passed
    # as the failure message.
    verify(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
67677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis    assert X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c
68