# test_normalization.py revision 1b445d3fcfcc06e5360e83b978efdb9b1c980278
"""Check unicodedata.normalize() against the NormalizationTest.txt data file.

The data file is not bundled with this module; when it is missing,
test_main() raises TestSkipped with a message saying where to download it.
"""

from test.test_support import verbose, TestFailed, TestSkipped, verify
import sys
import os
from unicodedata import normalize

TESTDATAFILE = "NormalizationTest.txt"
# Evaluated once at import time, relative to the current working directory.
skip_expected = not os.path.exists(TESTDATAFILE)


class RangeError(Exception):
    """Raised by unistr() for a code point above sys.maxunicode."""


def NFC(str):
    """Return the NFC normalization of the unicode string `str`."""
    return normalize("NFC", str)


def NFKC(str):
    """Return the NFKC normalization of the unicode string `str`."""
    return normalize("NFKC", str)


def NFD(str):
    """Return the NFD normalization of the unicode string `str`."""
    return normalize("NFD", str)


def NFKD(str):
    """Return the NFKD normalization of the unicode string `str`."""
    return normalize("NFKD", str)


def unistr(data):
    """Convert space-separated hexadecimal code points to a unicode string.

    `data` is a string such as "0044 0307".  Raises RangeError when any
    code point is larger than sys.maxunicode, i.e. cannot be represented
    by unichr() on this build.
    """
    data = [int(x, 16) for x in data.split(" ")]
    for x in data:
        if x > sys.maxunicode:
            raise RangeError
    return u"".join([unichr(x) for x in data])


def test_main():
    """Run every conformance check described by TESTDATAFILE.

    Raises TestSkipped when the data file is absent and TestFailed (via
    verify()) when any normalization invariant does not hold.
    """
    if skip_expected:
        raise TestSkipped(TESTDATAFILE + " not found, download from " +
                    "http://www.unicode.org/Public/UNIDATA/" + TESTDATAFILE)

    # Close the file explicitly instead of relying on garbage collection.
    testdata = open(TESTDATAFILE)
    try:
        data = testdata.readlines()
    finally:
        testdata.close()

    part1_data = {}
    # Bound up front so that a data row appearing before any "@Part" header
    # cannot trigger an UnboundLocalError in the comparison further down.
    part = None
    for line in data:
        # Everything after '#' is a comment.
        if '#' in line:
            line = line.split('#')[0]
        line = line.strip()
        if not line:
            continue
        if line.startswith("@Part"):
            part = line
            continue
        try:
            # Each row ends with ';', so the last split element is dropped.
            c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
        except RangeError:
            # Skip unsupported characters
            continue

        if verbose:
            # Single-argument form: same output as the print statement.
            print(line)

        # Perform tests
        verify(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
        verify(c4 == NFC(c4) == NFC(c5), line)
        verify(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
        verify(c5 == NFD(c4) == NFD(c5), line)
        verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5),
               line)
        verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5),
               line)

        # Record part 1 data
        if part == "@Part1":
            part1_data[c1] = 1

    # Perform tests for all other data: a character that is not a Part 1
    # source string must be left unchanged by all four normalization forms.
    for c in range(sys.maxunicode+1):
        X = unichr(c)
        if X in part1_data:
            continue
        # verify() rather than assert, so the check still runs under -O and
        # fails the same way as the checks above.
        verify(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)


if __name__ == "__main__":
    test_main()