test_normalization.py revision 7e05e7d3b64b3874b8086e607601edd0f46666bd
"""Conformance test for unicodedata.normalize().

The test is driven by the Unicode consortium's NormalizationTest.txt file
for the Unicode version that the unicodedata module was built against.
"""
from test.support import run_unittest, open_urlresource
import unittest

import sys
import os
from unicodedata import normalize, unidata_version

TESTDATAFILE = "NormalizationTest.txt"
TESTDATAURL = "http://www.unicode.org/Public/" + unidata_version + "/ucd/" + TESTDATAFILE

# Verify we have the correct version of the test data file.
# A local copy whose first line does not mention the Unicode version of the
# unicodedata module is deleted (presumably so that a matching copy gets
# fetched again by open_urlresource -- confirm against test.support).
TESTDATAPATH = os.path.join(os.path.dirname(__file__), "data", TESTDATAFILE)
if os.path.exists(TESTDATAPATH):
    with open(TESTDATAPATH, encoding='utf-8') as f:
        first_line = f.readline()
    if unidata_version not in first_line:
        os.unlink(TESTDATAPATH)


class RangeError(Exception):
    """Raised by unistr() for a code point above sys.maxunicode."""


def NFC(str):
    """Return *str* normalized to form NFC."""
    return normalize("NFC", str)


def NFKC(str):
    """Return *str* normalized to form NFKC."""
    return normalize("NFKC", str)


def NFD(str):
    """Return *str* normalized to form NFD."""
    return normalize("NFD", str)


def NFKD(str):
    """Return *str* normalized to form NFKD."""
    return normalize("NFKD", str)


def unistr(data):
    """Convert space-separated hexadecimal code points into a string.

    Raises RangeError if any code point is greater than sys.maxunicode;
    the whole input is validated before any character is built.
    """
    codepoints = [int(x, 16) for x in data.split(" ")]
    if any(cp > sys.maxunicode for cp in codepoints):
        raise RangeError
    return "".join([chr(cp) for cp in codepoints])


class NormalizationTest(unittest.TestCase):
    def test_main(self):
        """Check normalize() against every usable line of the test data."""
        # First column (c1) of every @Part1 line; these characters are
        # excluded from the "normalizes to itself" sweep at the end.
        part1_data = set()
        # Name of the current "@PartN" section; None until the first header
        # so that a data line preceding any header cannot hit an unbound name.
        part = None

        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8")
        except IOError:
            self.skipTest("Could not retrieve " + TESTDATAURL)

        # Reuse the handle opened above and make sure it gets closed.
        with testdata:
            for line in testdata:
                if '#' in line:
                    line = line.split('#')[0]
                line = line.strip()
                if not line:
                    continue
                if line.startswith("@Part"):
                    part = line.split()[0]
                    continue
                if part == "@Part3":
                    # XXX we don't support PRI #29 yet, so skip these tests for now
                    continue
                try:
                    # The field after the last ';' is dropped by the [:-1].
                    c1, c2, c3, c4, c5 = [unistr(x) for x in line.split(';')[:-1]]
                except RangeError:
                    # Skip unsupported characters;
                    # try at least adding c1 if we are in part1
                    if part == "@Part1":
                        try:
                            c1 = unistr(line.split(';')[0])
                        except RangeError:
                            pass
                        else:
                            part1_data.add(c1)
                    continue

                # Perform tests
                self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
                self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
                self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
                self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
                self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                                NFKC(c3) == NFKC(c4) == NFKC(c5),
                                line)
                self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                                NFKD(c3) == NFKD(c4) == NFKD(c5),
                                line)

                # Record part 1 data
                if part == "@Part1":
                    part1_data.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_bug_834676(self):
        # Check for bug 834676
        normalize('NFC', '\ud55c\uae00')


def test_main():
    """Run the normalization tests through test.support.run_unittest."""
    run_unittest(NormalizationTest)


if __name__ == "__main__":
    test_main()