# test_normalization.py revision 9405609c1725ea86c3cdc9a9ac665649d80d62c6
19405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winterfrom test.test_support import run_unittest, open_urlresource
29405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winterimport unittest
39405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter
4677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwisimport sys
51b445d3fcfcc06e5360e83b978efdb9b1c980278Tim Petersimport os
6677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwisfrom unicodedata import normalize
71b445d3fcfcc06e5360e83b978efdb9b1c980278Tim Peters
# Name of the conformance data file and the URL it is fetched from.
# The URL pins the Unicode 4.1.0 edition of the file.
TESTDATAFILE = "NormalizationTest" + os.extsep + "txt"
TESTDATAURL = "http://www.unicode.org/Public/4.1.0/ucd/" + TESTDATAFILE
10677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
class RangeError(Exception):
    """Raised by unistr() for a code point above sys.maxunicode."""
13677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFC(str):
    """Return the NFC normal form of the unicode string *str*."""
    return normalize("NFC", str)
16677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFKC(str):
    """Return the NFKC normal form of the unicode string *str*."""
    return normalize("NFKC", str)
19677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFD(str):
    """Return the NFD normal form of the unicode string *str*."""
    return normalize("NFD", str)
22677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFKD(str):
    """Return the NFKD normal form of the unicode string *str*."""
    return normalize("NFKD", str)
25677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def unistr(data):
    """Build a unicode string from space-separated hexadecimal code points.

    *data* is text such as "0041 030A": each field is parsed as a base-16
    integer and converted to the corresponding character.

    Raises RangeError if any code point is greater than sys.maxunicode,
    so that callers can skip characters this build cannot represent.
    Fields that are not valid hexadecimal make int() raise ValueError.
    """
    code_points = [int(field, 16) for field in data.split(" ")]
    # Validate every code point first so nothing is converted when one of
    # them is out of range.
    for code_point in code_points:
        if code_point > sys.maxunicode:
            raise RangeError(code_point)
    return u"".join([unichr(code_point) for code_point in code_points])
329405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter
class NormalizationTest(unittest.TestCase):
    """Check unicodedata.normalize() against the NormalizationTest data file."""

    def test_main(self):
        """Run the conformance checks for every usable line of the data file.

        Each data line holds five ';'-separated columns (c1..c5) of
        hexadecimal code points followed by a trailing field that is
        ignored.  Lines in "@Part3" are skipped, and so are lines containing
        code points above sys.maxunicode.  Every c1 seen in "@Part1" is
        remembered; afterwards every code point that was *not* recorded is
        required to be unchanged by all four normalization forms.
        """
        # No "@Part" header seen yet; keeps the comparisons below well
        # defined even if a data line precedes the first header.
        part = None
        part1_data = set()

        testdata = open_urlresource(TESTDATAURL)
        try:
            for line in testdata:
                # Strip the trailing comment (if any) and surrounding blanks.
                line = line.split('#')[0].strip()
                if not line:
                    continue
                if line.startswith("@Part"):
                    part = line.split()[0]
                    continue
                if part == "@Part3":
                    # XXX we don't support PRI #29 yet, so skip these tests
                    # for now
                    continue

                fields = line.split(';')
                try:
                    c1, c2, c3, c4, c5 = [unistr(x) for x in fields[:-1]]
                except RangeError:
                    # Skip unsupported characters;
                    # try at least adding c1 if we are in part1
                    if part == "@Part1":
                        try:
                            c1 = unistr(fields[0])
                        except RangeError:
                            pass
                        else:
                            part1_data.add(c1)
                    continue

                # Perform tests
                self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
                self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
                self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
                self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
                self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                                NFKC(c3) == NFKC(c4) == NFKC(c5),
                                line)
                self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                                NFKD(c3) == NFKD(c4) == NFKD(c5),
                                line)

                # Record part 1 data
                if part == "@Part1":
                    part1_data.add(c1)
        finally:
            # Release the data file even when an assertion above fails.
            testdata.close()

        # Perform tests for all other data
        for c in range(sys.maxunicode+1):
            X = unichr(c)
            if X in part1_data:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_bug_834676(self):
        """Check for bug 834676.

        Only verifies that the call completes without raising; the
        normalized result is not inspected.
        """
        normalize('NFC', u'\ud55c\uae00')
89677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def test_main():
    """Run NormalizationTest through test_support.run_unittest."""
    run_unittest(NormalizationTest)
92d2171d2ba414def2ecf27b694ea27c2e9fde0fcfMartin v. Löwis
# Allow running this test module directly as a script.
if __name__ == "__main__":
    test_main()
95