# test_normalization.py revision 9405609c1725ea86c3cdc9a9ac665649d80d62c6
"""Check unicodedata.normalize() against the Unicode NormalizationTest data.

The data file is fetched from TESTDATAURL through
test.test_support.open_urlresource.
"""
from test.test_support import run_unittest, open_urlresource
import unittest

import sys
import os
from unicodedata import normalize

TESTDATAFILE = "NormalizationTest" + os.extsep + "txt"
TESTDATAURL = "http://www.unicode.org/Public/4.1.0/ucd/" + TESTDATAFILE


class RangeError(Exception):
    """Raised by unistr() for a code point greater than sys.maxunicode."""
    pass


# NOTE: the parameter name ``str`` shadows the builtin; it is kept unchanged
# so that the call signature of these helpers stays exactly as it was.
def NFC(str):
    """Return ``str`` normalized to form NFC."""
    return normalize("NFC", str)


def NFKC(str):
    """Return ``str`` normalized to form NFKC."""
    return normalize("NFKC", str)


def NFD(str):
    """Return ``str`` normalized to form NFD."""
    return normalize("NFD", str)


def NFKD(str):
    """Return ``str`` normalized to form NFKD."""
    return normalize("NFKD", str)


def unistr(data):
    """Build a unicode string from space-separated hexadecimal code points.

    Raises RangeError if any code point is greater than sys.maxunicode,
    i.e. cannot be passed to unichr() on this interpreter.
    """
    codepoints = [int(x, 16) for x in data.split(" ")]
    for codepoint in codepoints:
        if codepoint > sys.maxunicode:
            raise RangeError
    return u"".join([unichr(codepoint) for codepoint in codepoints])


class NormalizationTest(unittest.TestCase):
    """Tests of unicodedata.normalize() driven by the downloaded data file."""

    def test_main(self):
        """Check every data line, then every character not listed in Part1."""
        testdata = open_urlresource(TESTDATAURL)
        try:
            part1_data = self._check_data_lines(testdata)
        finally:
            # Release the handle even when an assertion above fails.
            testdata.close()

        # Perform tests for all other data: a character that does not
        # appear as c1 in Part1 must be unchanged by every normalization.
        for c in range(sys.maxunicode+1):
            X = unichr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def _check_data_lines(self, lines):
        """Check each data line in ``lines``; return the set of Part1 c1 values.

        ``lines`` is an iterable of text lines.  Everything after a '#' is
        ignored, and lines starting with "@Part" select the current part.
        The failing (stripped) line is used as the assertion message.
        """
        # Start with no part selected so that a data line appearing before
        # the first "@Part" header cannot hit an unbound local.
        part = None
        part1_data = set()
        for line in lines:
            if '#' in line:
                line = line.split('#')[0]
            line = line.strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            if part == "@Part3":
                # XXX we don't support PRI #29 yet, so skip these tests for now
                continue
            try:
                # The text after the final ';' is not a column; drop it.
                c1, c2, c3, c4, c5 = [unistr(x)
                                      for x in line.split(';')[:-1]]
            except RangeError:
                # Skip unsupported characters;
                # try at least adding c1 if we are in part1
                if part == "@Part1":
                    try:
                        c1 = unistr(line.split(';')[0])
                    except RangeError:
                        pass
                    else:
                        part1_data.add(c1)
                continue

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            # Record part 1 data
            if part == "@Part1":
                part1_data.add(c1)
        return part1_data

    def test_bug_834676(self):
        """Call normalize() on the input from bug 834676; it must not raise."""
        # Check for bug 834676
        normalize('NFC', u'\ud55c\uae00')


def test_main():
    """Run NormalizationTest through test_support.run_unittest."""
    run_unittest(NormalizationTest)


if __name__ == "__main__":
    test_main()