# test_normalization.py revision 846d72a7d7536ea6ad9b530b1a96c354fb623115
from test.test_support import (verbose, TestFailed, TestSkipped, verify,
                               open_urlresource)
import sys
import os
from unicodedata import normalize

# Name and download location of the normalization test data file.
TESTDATAFILE = "NormalizationTest" + os.extsep + "txt"
TESTDATAURL = "http://www.unicode.org/Public/4.1.0/ucd/" + TESTDATAFILE


class RangeError(Exception):
    """Raised by unistr() when a code point exceeds sys.maxunicode."""


def NFC(str):
    """Return *str* normalized to form NFC."""
    return normalize("NFC", str)


def NFKC(str):
    """Return *str* normalized to form NFKC."""
    return normalize("NFKC", str)


def NFD(str):
    """Return *str* normalized to form NFD."""
    return normalize("NFD", str)


def NFKD(str):
    """Return *str* normalized to form NFKD."""
    return normalize("NFKD", str)


def unistr(data):
    """Convert space-separated hexadecimal code points to a unicode string.

    Raises RangeError if any code point is larger than sys.maxunicode,
    i.e. cannot be represented by unichr() on this build.
    """
    data = [int(x, 16) for x in data.split(" ")]
    for x in data:
        if x > sys.maxunicode:
            raise RangeError
    return u"".join([unichr(x) for x in data])


def test_main():
    """Check unicodedata.normalize() against the rows of the test data file.

    Every data row supplies five columns c1..c5; the relations between the
    columns and their NFC/NFD/NFKC/NFKD forms are checked with verify().
    The first column of every row seen in @Part1 is recorded, and afterwards
    every other code point up to sys.maxunicode must be left unchanged by
    all four normalization forms.

    Raises TestFailed on the first violated check.
    """
    # Start with no section selected, so that a data row appearing before
    # the first "@Part" header does not hit an unbound local.
    part = None
    part1_data = {}

    datafile = open_urlresource(TESTDATAURL)
    try:
        for line in datafile:
            # Strip trailing comments and surrounding whitespace.
            if '#' in line:
                line = line.split('#')[0]
            line = line.strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            if part == "@Part3":
                # XXX we don't support PRI #29 yet, so skip these tests
                # for now
                continue
            try:
                # The piece after the final ';' is not a column; drop it.
                c1, c2, c3, c4, c5 = [unistr(x)
                                      for x in line.split(';')[:-1]]
            except RangeError:
                # Skip unsupported characters;
                # try at least adding c1 if we are in part1
                if part == "@Part1":
                    try:
                        c1 = unistr(line.split(';')[0])
                    except RangeError:
                        # c1 itself is not representable; nothing to record.
                        pass
                    else:
                        part1_data[c1] = 1
                continue

            if verbose:
                print(line)

            # Perform tests
            verify(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            verify(c4 == NFC(c4) == NFC(c5), line)
            verify(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            verify(c5 == NFD(c4) == NFD(c5), line)
            verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4)
                   == NFKC(c5),
                   line)
            verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4)
                   == NFKD(c5),
                   line)

            # Record part 1 data
            if part == "@Part1":
                part1_data[c1] = 1
    finally:
        # NOTE(review): assumes open_urlresource() returns a file-like
        # object with close() -- confirm against test_support.
        datafile.close()

    # Perform tests for all other data
    for c in xrange(sys.maxunicode + 1):
        X = unichr(c)
        if X in part1_data:
            continue
        # Raise explicitly rather than using ``assert``, which is removed
        # when Python runs with -O and would silently skip this check.
        if not (X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)):
            raise TestFailed("U+%04X changed under normalization" % c)

    # Check for bug 834676
    normalize('NFC', u'\ud55c\uae00')


if __name__ == "__main__":
    test_main()