test_normalization.py revision 677bde2dd14ac2c8f170779adcc732f991db8bd6
# Regression test for unicodedata.normalize(), driven by the
# NormalizationTest.txt data file.  Every data line supplies five fields
# (c1..c5); the checks below verify the relationships between those fields
# under the NFC, NFD, NFKC and NFKD normalization forms.
from test.test_support import verbose, TestFailed, TestSkipped, verify
import sys
from unicodedata import normalize

try:
    f = open("NormalizationTest.txt", "r")
    # Close the handle explicitly instead of relying on garbage collection.
    try:
        data = f.readlines()
    finally:
        f.close()
except IOError:
    raise TestSkipped("NormalizationTest.txt not found, download from "
                      "http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt")


class RangeError(Exception):
    """Raised by unistr() for a code point above sys.maxunicode."""


# NOTE: the parameter name ``str`` shadows the builtin; it is kept so that
# the helpers' signatures stay unchanged.
def NFC(str):
    """Return ``str`` normalized to form NFC."""
    return normalize("NFC", str)


def NFKC(str):
    """Return ``str`` normalized to form NFKC."""
    return normalize("NFKC", str)


def NFD(str):
    """Return ``str`` normalized to form NFD."""
    return normalize("NFD", str)


def NFKD(str):
    """Return ``str`` normalized to form NFKD."""
    return normalize("NFKD", str)


def unistr(data):
    """Convert space-separated hexadecimal code points to a unicode string.

    Raises RangeError if any code point is larger than sys.maxunicode,
    i.e. cannot be represented by this interpreter build.
    """
    data = [int(x, 16) for x in data.split(" ")]
    for x in data:
        if x > sys.maxunicode:
            raise RangeError
    return u"".join([unichr(x) for x in data])


# Source strings (c1) seen while in "@Part1"; used as a set so the exhaustive
# loop at the bottom can skip them.
part1_data = {}
# Most recent "@Part..." header line.  Initialised so that a data line that
# appears before any header does not raise NameError.
part = None

for line in data:
    # Drop trailing comments and surrounding whitespace.
    if '#' in line:
        line = line.split('#')[0]
    line = line.strip()
    if not line:
        continue
    if line.startswith("@Part"):
        part = line
        continue
    try:
        # Fields are ';'-separated; the segment after the last ';' is
        # discarded, leaving exactly five columns.
        c1, c2, c3, c4, c5 = [unistr(x) for x in line.split(';')[:-1]]
    except RangeError:
        # Skip unsupported characters
        continue

    if verbose:
        print(line)

    # Perform tests
    verify(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
    verify(c4 == NFC(c4) == NFC(c5), line)
    verify(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
    verify(c5 == NFD(c4) == NFD(c5), line)
    verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5),
           line)
    verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5),
           line)

    # Record part 1 data
    if part == "@Part1":
        part1_data[c1] = 1

# Perform tests for all other data: every code point that was not recorded
# as a Part1 source string must be left unchanged by all four forms.
for c in range(sys.maxunicode + 1):
    X = unichr(c)
    if X in part1_data:
        continue
    # verify() rather than assert, so the check is not stripped under -O and
    # failures are reported as TestFailed like the checks above.
    verify(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), "U+%04X" % c)