# test_normalization.py revision 1a305fd141ebe4295d554d8b127ab982858a079e
from test.support import run_unittest, open_urlresource
import unittest

from http.client import HTTPException
import sys
import os
from unicodedata import normalize, unidata_version

TESTDATAFILE = "NormalizationTest.txt"
TESTDATAURL = "http://www.unicode.org/Public/" + unidata_version + "/ucd/" + TESTDATAFILE

# Verify we have the correct version of the test data file: if a local copy
# exists and its first line does not mention the Unicode version that
# unicodedata was built with, delete it.
TESTDATAPATH = os.path.join(os.path.dirname(__file__), "data", TESTDATAFILE)
if os.path.exists(TESTDATAPATH):
    with open(TESTDATAPATH, encoding='utf-8') as f:
        first_line = f.readline()
    if unidata_version not in first_line:
        os.unlink(TESTDATAPATH)


class RangeError(Exception):
    """Raised by unistr() when a code point exceeds sys.maxunicode."""


def NFC(str):
    """Return *str* normalized to form NFC."""
    return normalize("NFC", str)


def NFKC(str):
    """Return *str* normalized to form NFKC."""
    return normalize("NFKC", str)


def NFD(str):
    """Return *str* normalized to form NFD."""
    return normalize("NFD", str)


def NFKD(str):
    """Return *str* normalized to form NFKD."""
    return normalize("NFKD", str)


def unistr(data):
    """Convert space-separated hexadecimal code points into a string.

    Raises RangeError if any code point is greater than sys.maxunicode.
    """
    codepoints = [int(x, 16) for x in data.split(" ")]
    if any(cp > sys.maxunicode for cp in codepoints):
        raise RangeError
    return "".join(map(chr, codepoints))


class NormalizationTest(unittest.TestCase):
    def test_main(self):
        # Hit the exception early: fetch the data once and skip the test
        # when it cannot be retrieved.
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8")
        except (IOError, HTTPException):
            self.skipTest("Could not retrieve " + TESTDATAURL)
        # Close the handle even when an assertion fails part-way through.
        try:
            part1_data = self._check_data_lines(testdata)
        finally:
            testdata.close()
        self._check_remaining(part1_data)

    def _check_data_lines(self, lines):
        """Check every data line in *lines* and return the @Part1 characters.

        *lines* is an iterable of text lines.  Each data line holds five
        ';'-separated fields (c1..c5) of space-separated hex code points,
        followed by a trailing ';'.  Text after '#' is ignored, as are
        blank lines.  Lines starting with "@Part" select the current part.

        Returns the set of c1 strings seen while the current part is
        "@Part1".
        """
        part1_data = set()
        # No part is selected until the first "@Part" header is seen.
        part = None
        for line in lines:
            line = line.split('#')[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            if part == "@Part3":
                # XXX we don't support PRI #29 yet, so skip these tests for now
                continue
            try:
                c1, c2, c3, c4, c5 = [unistr(x) for x in line.split(';')[:-1]]
            except RangeError:
                # Skip unsupported characters;
                # try at least adding c1 if we are in part1
                if part == "@Part1":
                    try:
                        c1 = unistr(line.split(';')[0])
                    except RangeError:
                        pass
                    else:
                        part1_data.add(c1)
                continue

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            # Record part 1 data
            if part == "@Part1":
                part1_data.add(c1)
        return part1_data

    def _check_remaining(self, part1_data):
        """Check that every character not in *part1_data* is unchanged
        by all four normalization forms."""
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_bug_834676(self):
        # Check for bug 834676; this only verifies that the call completes
        # without raising.
        normalize('NFC', '\ud55c\uae00')


def test_main():
    run_unittest(NormalizationTest)


if __name__ == "__main__":
    test_main()