test_normalization.py revision 84fc66dd020931c14be8b13fcbdb9a8f295141c9
"""Check unicodedata.normalize() against the Unicode NormalizationTest data.

The data file named by TESTDATAURL is obtained through
test.test_support.open_urlresource().
"""

from test.test_support import run_unittest, open_urlresource
import unittest

import sys
import os
from unicodedata import normalize

TESTDATAFILE = "NormalizationTest" + os.extsep + "txt"
TESTDATAURL = "http://www.unicode.org/Public/4.1.0/ucd/" + TESTDATAFILE


class RangeError(Exception):
    """Raised by unistr() for a code point greater than sys.maxunicode."""


# NOTE: the parameter of the four helpers below is named ``str`` and shadows
# the builtin inside each body; the name is kept so the helpers' signatures
# stay unchanged.

def NFC(str):
    """Return ``str`` normalized to form NFC."""
    return normalize("NFC", str)


def NFKC(str):
    """Return ``str`` normalized to form NFKC."""
    return normalize("NFKC", str)


def NFD(str):
    """Return ``str`` normalized to form NFD."""
    return normalize("NFD", str)


def NFKD(str):
    """Return ``str`` normalized to form NFKD."""
    return normalize("NFKD", str)


def unistr(data):
    """Build a string from space-separated hexadecimal code points.

    ``data`` is split on single spaces and every piece is parsed as a
    base-16 integer.  RangeError is raised if any value is greater than
    sys.maxunicode; otherwise the characters are joined in order.
    """
    data = [int(x, 16) for x in data.split(" ")]
    for x in data:
        if x > sys.maxunicode:
            raise RangeError
    return "".join([chr(x) for x in data])


class NormalizationTest(unittest.TestCase):
    def test_main(self):
        # Comments are used instead of docstrings in the test methods so
        # that unittest's reporting of the test names is unaffected.
        #
        # ``part`` is the most recent "@Part..." header seen.  It starts as
        # None so that a data line appearing before any header is simply
        # tested (and not recorded as Part 1 data) instead of failing on an
        # unbound local variable.
        part = None
        # First-column strings (c1) of all Part 1 lines; the final loop
        # skips these characters.
        part1_data = set()

        testdata = open_urlresource(TESTDATAURL)
        try:
            for line in testdata:
                # Drop everything from the first '#' on, then surrounding
                # whitespace; skip lines that end up empty.
                if '#' in line:
                    line = line.split('#')[0]
                line = line.strip()
                if not line:
                    continue
                if line.startswith("@Part"):
                    part = line.split()[0]
                    continue
                if part == "@Part3":
                    # XXX we don't support PRI #29 yet, so skip these tests
                    # for now
                    continue
                try:
                    # Whatever follows the last ';' is dropped; exactly
                    # five fields must remain.
                    c1, c2, c3, c4, c5 = [unistr(x)
                                          for x in line.split(';')[:-1]]
                except RangeError:
                    # Skip unsupported characters;
                    # try at least adding c1 if we are in part1
                    if part == "@Part1":
                        try:
                            c1 = unistr(line.split(';')[0])
                        except RangeError:
                            pass
                        else:
                            part1_data.add(c1)
                    continue

                # Perform tests
                self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
                self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
                self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
                self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
                self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                                NFKC(c3) == NFKC(c4) == NFKC(c5),
                                line)
                self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                                NFKD(c3) == NFKD(c4) == NFKD(c5),
                                line)

                # Record part 1 data
                if part == "@Part1":
                    part1_data.add(c1)
        finally:
            # Release the data file even when an assertion above fails.
            testdata.close()

        # Perform tests for all other data: every character not recorded
        # from Part 1 must be unchanged by all four normalization forms.
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_bug_834676(self):
        # Check for bug 834676: the call itself is the test, its result is
        # not inspected.
        normalize('NFC', '\ud55c\uae00')


def test_main():
    """Run NormalizationTest after first opening the data file once."""
    # Hit the exception early: open the data file before running the test
    # case so that a failure to obtain it surfaces here.  The handle is
    # closed right away because only the attempt to open it matters.
    open_urlresource(TESTDATAURL).close()
    run_unittest(NormalizationTest)


if __name__ == "__main__":
    test_main()