# test_normalization.py revision 1b445d3fcfcc06e5360e83b978efdb9b1c980278
1from test.test_support import verbose, TestFailed, TestSkipped, verify 2import sys 3import os 4from unicodedata import normalize 5 6TESTDATAFILE = "NormalizationTest.txt" 7skip_expected = not os.path.exists(TESTDATAFILE) 8 9class RangeError: 10 pass 11 12def NFC(str): 13 return normalize("NFC", str) 14 15def NFKC(str): 16 return normalize("NFKC", str) 17 18def NFD(str): 19 return normalize("NFD", str) 20 21def NFKD(str): 22 return normalize("NFKD", str) 23 24def unistr(data): 25 data = [int(x, 16) for x in data.split(" ")] 26 for x in data: 27 if x > sys.maxunicode: 28 raise RangeError 29 return u"".join([unichr(x) for x in data]) 30 31def test_main(): 32 if skip_expected: 33 raise TestSkipped(TESTDATAFILE + " not found, download from " + 34 "http://www.unicode.org/Public/UNIDATA/" + TESTDATAFILE) 35 36 data = open(TESTDATAFILE).readlines() 37 38 part1_data = {} 39 for line in data: 40 if '#' in line: 41 line = line.split('#')[0] 42 line = line.strip() 43 if not line: 44 continue 45 if line.startswith("@Part"): 46 part = line 47 continue 48 try: 49 c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]] 50 except RangeError: 51 # Skip unsupported characters 52 continue 53 54 if verbose: 55 print line 56 57 # Perform tests 58 verify(c2 == NFC(c1) == NFC(c2) == NFC(c3), line) 59 verify(c4 == NFC(c4) == NFC(c5), line) 60 verify(c3 == NFD(c1) == NFD(c2) == NFD(c3), line) 61 verify(c5 == NFD(c4) == NFD(c5), line) 62 verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5), 63 line) 64 verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5), 65 line) 66 67 # Record part 1 data 68 if part == "@Part1": 69 part1_data[c1] = 1 70 71 # Perform tests for all other data 72 for c in range(sys.maxunicode+1): 73 X = unichr(c) 74 if X in part1_data: 75 continue 76 assert X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c 77 78if __name__ == "__main__": 79 test_main() 80