# test_normalization.py revision a7e7497d885b5616fbe9bd4b16778f9e0e50b07d
"""Conformance tests for unicodedata.normalize().

Most of the checking is driven by the NormalizationTest.txt data file that
matches the interpreter's ``unidata_version``; the file is obtained through
``test.support.open_urlresource`` from TESTDATAURL.
"""

from test.support import run_unittest, open_urlresource
import unittest

from http.client import HTTPException
import sys
import os
from unicodedata import normalize, unidata_version

TESTDATAFILE = "NormalizationTest.txt"
TESTDATAURL = "http://www.unicode.org/Public/" + unidata_version + "/ucd/" + TESTDATAFILE


def check_version(testfile):
    """Return True if the first line of *testfile* contains unidata_version.

    Used as the ``check`` callback handed to open_urlresource() so that a
    data file belonging to a different Unicode version is not accepted.
    Consumes one line from *testfile*.
    """
    hdr = testfile.readline()
    return unidata_version in hdr


class RangeError(Exception):
    """Raised by unistr() for a code point greater than sys.maxunicode."""


# NOTE: the parameter of the four helpers below is named ``str`` (shadowing
# the builtin inside each one-line body); the name is kept so that the
# helpers' signatures stay unchanged.

def NFC(str):
    """Return *str* normalized to form NFC."""
    return normalize("NFC", str)


def NFKC(str):
    """Return *str* normalized to form NFKC."""
    return normalize("NFKC", str)


def NFD(str):
    """Return *str* normalized to form NFD."""
    return normalize("NFD", str)


def NFKD(str):
    """Return *str* normalized to form NFKD."""
    return normalize("NFKD", str)


def unistr(data):
    """Convert space-separated hexadecimal code points into a string.

    *data* is a string such as ``"0041 0301"``.  Every field is parsed with
    ``int(field, 16)`` (so a malformed field raises ValueError) before any
    range check happens.

    Raises RangeError if any code point is greater than sys.maxunicode.
    """
    code_points = [int(x, 16) for x in data.split(" ")]
    if any(cp > sys.maxunicode for cp in code_points):
        raise RangeError
    return "".join([chr(cp) for cp in code_points])


class NormalizationTest(unittest.TestCase):
    """Tests for unicodedata.normalize()."""

    def test_main(self):
        """Check normalize() against every record of the test data file.

        The test is skipped when the data file cannot be retrieved.  After
        the file has been processed, every code point whose single-character
        string was not seen as column 1 of an ``@Part1`` record is checked to
        be unchanged by all four normalization forms.
        """
        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=check_version)
        except (OSError, HTTPException):
            self.skipTest("Could not retrieve " + TESTDATAURL)
        self.addCleanup(testdata.close)

        part = None
        # Column-1 strings of all @Part1 records; only membership matters.
        part1_data = set()
        for line in testdata:
            # Drop a trailing '#' comment and surrounding whitespace.
            line = line.split('#', 1)[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            try:
                # Records end with ';', so the last split field is dropped.
                c1, c2, c3, c4, c5 = [unistr(x) for x in line.split(';')[:-1]]
            except RangeError:
                # Skip unsupported characters;
                # try at least adding c1 if we are in part1
                if part == "@Part1":
                    try:
                        c1 = unistr(line.split(';')[0])
                    except RangeError:
                        pass
                    else:
                        part1_data.add(c1)
                continue

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(
                (c4 == NFKC(c1) == NFKC(c2) ==
                 NFKC(c3) == NFKC(c4) == NFKC(c5)),
                line)
            self.assertTrue(
                (c5 == NFKD(c1) == NFKD(c2) ==
                 NFKD(c3) == NFKD(c4) == NFKD(c5)),
                line)

            # Record part 1 data
            if part == "@Part1":
                part1_data.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_bug_834676(self):
        """Regression check: this normalize() call must complete without error."""
        # Check for bug 834676
        normalize('NFC', '\ud55c\uae00')


def test_main():
    """Run NormalizationTest through test.support.run_unittest()."""
    run_unittest(NormalizationTest)


if __name__ == "__main__":
    test_main()