# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Parser for Unicode data files (as distributed by unicode.org)."""

import os
import re

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved urlopen(); alias it so the call sites stay unchanged.
    import urllib.request as urllib2

# Directory or URL where Unicode tables reside.
_UNICODE_DIR = "http://www.unicode.org/Public/6.0.0/ucd"

# Largest valid Unicode code value.
_RUNE_MAX = 0x10FFFF


class Error(Exception):
    """Unicode error base class."""


class InputError(Error):
    """Unicode input error class.  Raised on invalid input."""


def _UInt(s):
    """Converts string to Unicode code point ('263A' => 0x263a).

    Args:
      s: string to convert

    Returns:
      Unicode code point

    Raises:
      InputError: the string is not a valid Unicode value.
    """
    # Require 4-6 bare hex digits.  int(s, 16) on its own would also accept
    # forms such as '0x41' or '+041', which are not valid table entries.
    if re.match(r"[0-9A-Fa-f]{4,6}\Z", s) is None:
        raise InputError("invalid Unicode value %s" % (s,))
    v = int(s, 16)
    if v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (s,))
    return v


def _URange(s):
    """Converts string to Unicode range.

      '0001..0003' => [1, 2, 3].
      '0001' => [1].

    Args:
      s: string to convert

    Returns:
      Unicode range, as a list of code points

    Raises:
      InputError: the string is not a valid Unicode range.
    """
    a = s.split("..")
    if len(a) == 1:
        return [_UInt(a[0])]
    if len(a) == 2:
        lo = _UInt(a[0])
        hi = _UInt(a[1])
        if lo < hi:
            # list() keeps the documented list result on Python 3 as well.
            return list(range(lo, hi + 1))
    raise InputError("invalid Unicode range %s" % (s,))


def _UStr(v):
    """Converts Unicode code point to hex string.

      0x263a => '0x263A'.

    Args:
      v: code point to convert

    Returns:
      Unicode string

    Raises:
      InputError: the argument is not a valid Unicode value.
    """
    if v < 0 or v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (v,))
    return "0x%04X" % (v,)


def _ParseContinue(s):
    """Parses a Unicode continuation field.

    These are of the form '<Name, First>' or '<Name, Last>'.
    Instead of giving an explicit range in a single table entry,
    some Unicode tables use two entries, one for the first
    code value in the range and one for the last.
    The first entry's description is '<Name, First>' instead of 'Name'
    and the second is '<Name, Last>'.

      '<Name, First>' => ('Name', 'First')
      '<Name, Last>' => ('Name', 'Last')
      'Anything else' => ('Anything else', None)

    Args:
      s: continuation field string

    Returns:
      pair: name and ('First', 'Last', or None)
    """
    match = re.match(r"<(.*), (First|Last)>", s)
    if match is not None:
        return match.groups()
    return (s, None)


def ReadUnicodeTable(filename, nfields, doline):
    """Generic Unicode table text file reader.

    The reader takes care of stripping out comments and also
    parsing the two different ways that the Unicode tables specify
    code ranges (using the .. notation and splitting the range across
    multiple lines).

    Each non-comment line in the table is expected to have the given
    number of fields.  The first field is known to be the Unicode value
    and the second field its description.

    The reader calls doline(codes, fields) for each entry in the table.
    If parsing a line or calling doline raises an exception, the reader
    prints that exception, prefixed with the file name and line number,
    and re-raises it immediately; the rest of the file is not processed.

    Arguments:
      filename: the Unicode data file to read (a path or an http(s) URL),
        or a file-like object yielding lines.
      nfields: the number of expected fields per line in that file.
      doline: the function to call for each table entry.

    Raises:
      InputError: nfields is invalid (must be >= 2), a line is malformed,
        or the file ends in the middle of a First/Last range.
    """
    if nfields < 2:
        raise InputError("invalid number of fields %d" % (nfields,))

    # Only close what this function opened; a caller-supplied file-like
    # object remains the caller's responsibility.
    opened_here = isinstance(filename, str)
    if opened_here:
        if filename.startswith(("http://", "https://")):
            fil = urllib2.urlopen(filename)
        else:
            fil = open(filename, "r")
    else:
        fil = filename

    first = None        # first code in multiline range
    expect_last = None  # tag expected for "Last" line in multiline range
    lineno = 0          # current line number
    try:
        for line in fil:
            lineno += 1
            try:
                # On Python 3, urlopen() and binary streams yield bytes.
                # (On Python 2 bytes is str, so this branch never fires.)
                if isinstance(line, bytes) and not isinstance(line, str):
                    line = line.decode("utf-8")

                # Chop # comments and white space; ignore empty lines.
                sharp = line.find("#")
                if sharp >= 0:
                    line = line[:sharp]
                line = line.strip()
                if not line:
                    continue

                # Split fields on ";", chop more white space.
                # Must have the expected number of fields.
                fields = [s.strip() for s in line.split(";")]
                if len(fields) != nfields:
                    raise InputError("wrong number of fields %d %d - %s" %
                                     (len(fields), nfields, line))

                # The Unicode text files have two different ways
                # to list a Unicode range.  Either the first field is
                # itself a range (0000..FFFF), or the range is split
                # across two lines, with the second field noting
                # the continuation.
                codes = _URange(fields[0])
                (name, cont) = _ParseContinue(fields[1])

                if expect_last is not None:
                    # If the last line gave the First code in a range,
                    # this one had better give the Last one.
                    if (len(codes) != 1 or codes[0] <= first or
                            cont != "Last" or name != expect_last):
                        raise InputError("expected Last line for %s" %
                                         (expect_last,))
                    codes = list(range(first, codes[0] + 1))
                    first = None
                    expect_last = None
                    fields[0] = "%04X..%04X" % (codes[0], codes[-1])
                    fields[1] = name
                elif cont == "First":
                    # Otherwise, if this is the First code in a range,
                    # remember it and go to the next line.
                    if len(codes) != 1:
                        raise InputError("bad First line: range given")
                    expect_last = name
                    first = codes[0]
                    continue

                doline(codes, fields)

            except Exception as e:
                # Report where the failure happened, then propagate it.
                print("%s:%d: %s" % (filename, lineno, e))
                raise
    finally:
        if opened_here:
            fil.close()

    if expect_last is not None:
        raise InputError("expected Last line for %s; got EOF" %
                         (expect_last,))


def CaseGroups(unicode_dir=_UNICODE_DIR):
    """Returns the Unicode code groups equivalent under case folding.

    Each group is a sorted list of code points,
    and the list of groups is sorted by first code point
    in the group.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      pair (togroup, groups): togroup is a dict mapping each folded
      (lowercase) code point to its group; groups is the sorted list
      of those same groups.
    """

    # Dict mapping lowercase code point to fold-equivalent group.
    togroup = {}

    def DoLine(codes, fields):
        """Process single CaseFolding.txt line, updating togroup."""
        (_, foldtype, lower, _) = fields
        # Only common (C) and simple (S) foldings map one code point to one.
        if foldtype not in ("C", "S"):
            return
        lower = _UInt(lower)
        togroup.setdefault(lower, [lower]).extend(codes)

    ReadUnicodeTable(unicode_dir + "/CaseFolding.txt", 4, DoLine)

    # Sort each group in place so the lists held by togroup are sorted too,
    # then order the groups themselves.  sorted() is used because dict
    # views have no sort() method on Python 3.
    for g in togroup.values():
        g.sort()
    groups = sorted(togroup.values())
    return togroup, groups
def Scripts(unicode_dir=_UNICODE_DIR):
    """Builds the script-name -> code-point-list table from Scripts.txt.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping script names to code lists
    """
    by_script = {}

    def _Collect(codes, fields):
        """Adds the codes of one Scripts.txt entry under its script name."""
        _, script = fields
        if script not in by_script:
            by_script[script] = []
        by_script[script].extend(codes)

    ReadUnicodeTable(unicode_dir + "/Scripts.txt", 2, _Collect)
    return by_script


def Categories(unicode_dir=_UNICODE_DIR):
    """Builds the category-name -> code-point-list table from UnicodeData.txt.

    Codes are filed under their full category (e.g. 'Lu') and, when that
    name is longer than one character, also under its first letter ('L').

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping category names to code lists
    """
    by_category = {}

    def _Collect(codes, fields):
        """Adds the codes of one UnicodeData.txt entry to by_category."""
        full_name = fields[2]
        targets = [full_name]
        # Add codes from Lu into L, etc.
        if len(full_name) > 1:
            targets.append(full_name[0])
        for key in targets:
            by_category.setdefault(key, []).extend(codes)

    ReadUnicodeTable(unicode_dir + "/UnicodeData.txt", 15, _Collect)
    return by_category