test_normalization.py revision 846d72a7d7536ea6ad9b530b1a96c354fb623115
1aaa2f1dea706daf2a5f431d97a3e3120dba652d2Hye-Shik Changfrom test.test_support import (verbose, TestFailed, TestSkipped, verify,
2aaa2f1dea706daf2a5f431d97a3e3120dba652d2Hye-Shik Chang                               open_urlresource)
3677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwisimport sys
41b445d3fcfcc06e5360e83b978efdb9b1c980278Tim Petersimport os
5677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwisfrom unicodedata import normalize
61b445d3fcfcc06e5360e83b978efdb9b1c980278Tim Peters
# File name of the normalization conformance data, assembled with the
# platform's extension separator.
TESTDATAFILE = os.extsep.join(("NormalizationTest", "txt"))
# Full URL the data file is fetched from.
TESTDATAURL = "".join(("http://www.unicode.org/Public/4.1.0/ucd/", TESTDATAFILE))
9677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
class RangeError(Exception):
    """Signals a code point larger than sys.maxunicode (raised by unistr)."""
12677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFC(str):
    """Return *str* normalized to form "NFC" by unicodedata.normalize."""
    normalized = normalize("NFC", str)
    return normalized
15677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFKC(str):
    """Return *str* normalized to form "NFKC" by unicodedata.normalize."""
    normalized = normalize("NFKC", str)
    return normalized
18677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFD(str):
    """Return *str* normalized to form "NFD" by unicodedata.normalize."""
    normalized = normalize("NFD", str)
    return normalized
21677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFKD(str):
    """Return *str* normalized to form "NFKD" by unicodedata.normalize."""
    normalized = normalize("NFKD", str)
    return normalized
24677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def unistr(data):
    """Build a unicode string from space-separated hexadecimal code points.

    *data* is split on single spaces and each field is parsed as a base-16
    integer.  RangeError is raised if any resulting code point is larger
    than sys.maxunicode; otherwise the characters are joined in order.
    """
    codepoints = [int(field, 16) for field in data.split(" ")]
    # split(" ") always yields at least one field and int() rejects an empty
    # one, so the list is non-empty here and max() is safe.
    if max(codepoints) > sys.maxunicode:
        raise RangeError
    return u"".join(map(unichr, codepoints))
31677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def test_main():
    """Check unicodedata.normalize against the Unicode conformance data.

    Each data line obtained from TESTDATAURL holds semicolon-separated
    columns c1..c5 of hexadecimal code points, optionally followed by a
    '#' comment.  Lines starting with "@Part" switch the current part.
    For every usable line the relations between the columns and their
    NFC/NFD/NFKC/NFKD forms are verified.  Afterwards every code point up
    to sys.maxunicode that did not appear as c1 in part 1 is checked to be
    unchanged by all four normalization forms.

    Failures are reported through verify().
    """
    # No "@Part" header seen yet; initialised so that a data line appearing
    # before the first header cannot trigger an UnboundLocalError.
    part = None
    part1_data = set()

    testdata = open_urlresource(TESTDATAURL)
    try:
        for line in testdata:
            # Drop any trailing comment and surrounding whitespace.
            line = line.split('#', 1)[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            if part == "@Part3":
                # XXX we don't support PRI #29 yet, so skip these tests for now
                continue

            fields = line.split(';')
            try:
                # The last field is what follows the final ';' and is not
                # a data column.
                c1, c2, c3, c4, c5 = [unistr(x) for x in fields[:-1]]
            except RangeError:
                # Skip unsupported characters;
                # try at least adding c1 if we are in part1
                if part == "@Part1":
                    try:
                        c1 = unistr(fields[0])
                    except RangeError:
                        pass
                    else:
                        part1_data.add(c1)
                continue

            if verbose:
                print(line)

            # Perform tests
            verify(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            verify(c4 == NFC(c4) == NFC(c5), line)
            verify(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            verify(c5 == NFD(c4) == NFD(c5), line)
            verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4)
                   == NFKC(c5), line)
            verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4)
                   == NFKD(c5), line)

            # Record part 1 data
            if part == "@Part1":
                part1_data.add(c1)
    finally:
        # Release the data file even when a check fails part-way through.
        testdata.close()

    # Perform tests for all other data.  xrange avoids materialising a list
    # of sys.maxunicode + 1 integers; verify() is used instead of a bare
    # assert so the check still runs under -O.
    for c in xrange(sys.maxunicode + 1):
        X = unichr(c)
        if X in part1_data:
            continue
        verify(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), "U+%04X" % c)

    # Check for bug 834676: this call only has to complete without error.
    normalize('NFC', u'\ud55c\uae00')
86d2171d2ba414def2ecf27b694ea27c2e9fde0fcfMartin v. Löwis
# Run the checks when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()
89