test_normalization.py revision 8844153d7f3c5ff00e844f674cd639af4ab8addc
19405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winterfrom test.test_support import run_unittest, open_urlresource
29405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winterimport unittest
39405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter
4677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwisimport sys
51b445d3fcfcc06e5360e83b978efdb9b1c980278Tim Petersimport os
64dd3a50ca480eef7bd898cfbfef8377231e18ae9Martin v. Löwisfrom unicodedata import normalize, unidata_version
71b445d3fcfcc06e5360e83b978efdb9b1c980278Tim Peters
# Name of the conformance data file and the URL of the copy that belongs to
# the Unicode database version reported by unicodedata.unidata_version.
TESTDATAFILE = "NormalizationTest" + os.extsep + "txt"
TESTDATAURL = "http://www.unicode.org/Public/" + unidata_version + "/ucd/" + TESTDATAFILE

# Remove a local copy of the data file whose first line does not mention the
# current Unicode database version (presumably so that a matching copy gets
# fetched later on -- the retrieval itself is done by open_urlresource).
if os.path.exists(TESTDATAFILE):
    # The context manager closes the file even when readline() raises, and
    # guarantees the handle is released before the file is unlinked.
    with open(TESTDATAFILE) as f:
        l = f.readline()
    if unidata_version not in l:
        os.unlink(TESTDATAFILE)
17677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
class RangeError(Exception):
    """Raised by unistr() for a code point greater than sys.maxunicode."""
20677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFC(str):
    """Return ``str`` normalized to Unicode normalization form NFC."""
    form = "NFC"
    return normalize(form, str)
23677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFKC(str):
    """Return ``str`` normalized to Unicode normalization form NFKC."""
    form = "NFKC"
    return normalize(form, str)
26677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFD(str):
    """Return ``str`` normalized to Unicode normalization form NFD."""
    form = "NFD"
    return normalize(form, str)
29677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def NFKD(str):
    """Return ``str`` normalized to Unicode normalization form NFKD."""
    form = "NFKD"
    return normalize(form, str)
32677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def unistr(data):
    """Build a unicode string from space-separated hexadecimal code points.

    Every field of ``data`` is parsed as a base-16 integer first; after
    that, RangeError is raised if any of the resulting code points is
    greater than sys.maxunicode.
    """
    codepoints = [int(field, 16) for field in data.split(" ")]
    if any(cp > sys.maxunicode for cp in codepoints):
        raise RangeError
    return u"".join(map(unichr, codepoints))
390d4c06e06e5ee1f3bb1fa8068114bd700d74864aNeal Norwitz
409405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winterclass NormalizationTest(unittest.TestCase):
419405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter    def test_main(self):
429405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter        part1_data = {}
438844153d7f3c5ff00e844f674cd639af4ab8addcAntoine Pitrou        # Hit the exception early
448844153d7f3c5ff00e844f674cd639af4ab8addcAntoine Pitrou        try:
458844153d7f3c5ff00e844f674cd639af4ab8addcAntoine Pitrou            open_urlresource(TESTDATAURL)
468844153d7f3c5ff00e844f674cd639af4ab8addcAntoine Pitrou        except IOError:
478844153d7f3c5ff00e844f674cd639af4ab8addcAntoine Pitrou            self.skipTest("Could not retrieve " + TESTDATAURL)
489405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter        for line in open_urlresource(TESTDATAURL):
499405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter            if '#' in line:
509405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                line = line.split('#')[0]
519405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter            line = line.strip()
529405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter            if not line:
539405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                continue
549405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter            if line.startswith("@Part"):
559405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                part = line.split()[0]
569405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                continue
579405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter            if part == "@Part3":
589405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                # XXX we don't support PRI #29 yet, so skip these tests for now
599405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                continue
609405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter            try:
619405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
629405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter            except RangeError:
639405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                # Skip unsupported characters;
649405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                # try atleast adding c1 if we are in part1
659405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                if part == "@Part1":
669405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                    try:
679405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                        c1 = unistr(line.split(';')[0])
689405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                    except RangeError:
699405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                        pass
709405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                    else:
719405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                        part1_data[c1] = 1
729405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                continue
73677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
749405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter            # Perform tests
755c8da86f3a515ce1a6d5f27fd15e3c5f4d8e931eBenjamin Peterson            self.assertTrue(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
765c8da86f3a515ce1a6d5f27fd15e3c5f4d8e931eBenjamin Peterson            self.assertTrue(c4 ==  NFC(c4) ==  NFC(c5), line)
775c8da86f3a515ce1a6d5f27fd15e3c5f4d8e931eBenjamin Peterson            self.assertTrue(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
785c8da86f3a515ce1a6d5f27fd15e3c5f4d8e931eBenjamin Peterson            self.assertTrue(c5 ==  NFD(c4) ==  NFD(c5), line)
795c8da86f3a515ce1a6d5f27fd15e3c5f4d8e931eBenjamin Peterson            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \
809405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                            NFKC(c3) == NFKC(c4) == NFKC(c5),
819405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                            line)
825c8da86f3a515ce1a6d5f27fd15e3c5f4d8e931eBenjamin Peterson            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \
839405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                            NFKD(c3) == NFKD(c4) == NFKD(c5),
849405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                            line)
851b445d3fcfcc06e5360e83b978efdb9b1c980278Tim Peters
869405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter            # Record part 1 data
879405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter            if part == "@Part1":
889405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                part1_data[c1] = 1
89677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
909405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter        # Perform tests for all other data
919405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter        for c in range(sys.maxunicode+1):
929405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter            X = unichr(c)
939405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter            if X in part1_data:
949405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter                continue
955c8da86f3a515ce1a6d5f27fd15e3c5f4d8e931eBenjamin Peterson            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
9677c06fbf942bf4c532d0f8d6f254882a9e5957ecTim Peters
979405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter    def test_bug_834676(self):
989405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter        # Check for bug 834676
999405609c1725ea86c3cdc9a9ac665649d80d62c6Collin Winter        normalize('NFC', u'\ud55c\uae00')
100677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
101677bde2dd14ac2c8f170779adcc732f991db8bd6Martin v. Löwis
def test_main():
    """Run the NormalizationTest cases through run_unittest."""
    test_classes = (NormalizationTest,)
    run_unittest(*test_classes)

if "__main__" == __name__:
    test_main()
107