# test_normalization.py revision 1b445d3fcfcc06e5360e83b978efdb9b1c980278
1677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwisfrom test.test_support import verbose, TestFailed, TestSkipped, verify
2677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwisimport sys
31b445d3fcfcc06e5360e83b978efdb9b1c980278Tim Petersimport os
4677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwisfrom unicodedata import normalize
51b445d3fcfcc06e5360e83b978efdb9b1c980278Tim Peters
# Name of the conformance data file; a relative path, so it is looked up in
# the current working directory.
TESTDATAFILE = 'NormalizationTest.txt'
# True when the data file is absent, in which case the test is skipped.
skip_expected = not os.path.exists(TESTDATAFILE)
8677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
class RangeError(Exception):
    """Raised by unistr() for a code point greater than sys.maxunicode.

    Derives from Exception so that it is a proper exception class rather
    than an arbitrary class that merely happens to be raised.
    """
11677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFC(str):
    """Return the unicode string *str* normalized to form NFC."""
    form = "NFC"
    return normalize(form, str)
14677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFKC(str):
    """Return the unicode string *str* normalized to form NFKC."""
    form = "NFKC"
    return normalize(form, str)
17677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFD(str):
    """Return the unicode string *str* normalized to form NFD."""
    form = "NFD"
    return normalize(form, str)
20677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFKD(str):
    """Return the unicode string *str* normalized to form NFKD."""
    form = "NFKD"
    return normalize(form, str)
23677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def unistr(data):
    """Build a unicode string from space-separated hexadecimal code points.

    Raises RangeError if any of the code points is greater than
    sys.maxunicode.
    """
    # Parse every field first, so a malformed field is reported before any
    # range check is made.
    code_points = [int(field, 16) for field in data.split(" ")]
    # split() always yields at least one field, so the list is never empty.
    if max(code_points) > sys.maxunicode:
        raise RangeError
    return u"".join(map(unichr, code_points))
30677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def test_main():
    """Run the normalization checks driven by TESTDATAFILE.

    Every data line of the file supplies five columns (c1..c5); the
    relations between those columns under NFC, NFD, NFKC and NFKD are
    checked with verify().  Afterwards every code point up to
    sys.maxunicode that did not appear as c1 in the "@Part1" section is
    checked to be unchanged by all four normalization forms.

    Raises TestSkipped if the data file does not exist.
    """
    if skip_expected:
        raise TestSkipped(TESTDATAFILE + " not found, download from " +
                    "http://www.unicode.org/Public/UNIDATA/" + TESTDATAFILE)

    # Close the file explicitly instead of leaving the handle to the
    # garbage collector.
    datafile = open(TESTDATAFILE)
    try:
        data = datafile.readlines()
    finally:
        datafile.close()

    # Current "@Part..." header.  Initialised up front so that a data line
    # appearing before the first header cannot cause a NameError below.
    part = None
    part1_data = {}
    for line in data:
        # Drop a trailing '#' comment (if any) and surrounding whitespace.
        line = line.split('#')[0].strip()
        if not line:
            continue
        if line.startswith("@Part"):
            part = line
            continue
        try:
            # Whatever follows the last ';' is discarded; the five fields
            # before it are the columns.
            c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
        except RangeError:
            # Skip unsupported characters
            continue

        if verbose:
            # Parenthesised form prints the same text as "print line".
            print(line)

        # Perform tests
        verify(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
        verify(c4 ==  NFC(c4) ==  NFC(c5), line)
        verify(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
        verify(c5 ==  NFD(c4) ==  NFD(c5), line)
        verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5),
               line)
        verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5),
               line)

        # Record part 1 data
        if part == "@Part1":
            part1_data[c1] = 1

    # Perform tests for all other data
    for c in range(sys.maxunicode+1):
        X = unichr(c)
        if X in part1_data:
            continue
        # verify() rather than assert: the check must still run under -O,
        # and it is reported the same way as the per-line checks above.
        verify(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
77677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
# Run the test directly when this file is executed as a script rather than
# imported.
if __name__ == '__main__':
    test_main()
80