# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Parser for Unicode data files (as distributed by unicode.org)."""

import os
import re

try:
  import urllib2
except ImportError:  # Python 3 moved urlopen to urllib.request.
  import urllib.request as urllib2

# Directory or URL where Unicode tables reside.
_UNICODE_DIR = "http://www.unicode.org/Public/6.0.0/ucd"

# Largest valid Unicode code value.
_RUNE_MAX = 0x10FFFF


class Error(Exception):
  """Unicode error base class.

  Root of the exception hierarchy defined in this module.
  """


class InputError(Error):
  """Unicode input error class.  Raised on invalid input."""


def _UInt(s):
  """Converts hex string to Unicode code point ('263A' => 0x263a).

  The string must consist of 4 to 6 hexadecimal digits and nothing
  else (no sign, no '0x' prefix, no white space), and its value must
  not exceed _RUNE_MAX.

  Args:
    s: string to convert

  Returns:
    Unicode code point

  Raises:
    InputError: the string is not a valid Unicode value.
  """

  # int(s, 16) on its own is too lenient: it also accepts '0x1F',
  # '+1F3' and surrounding white space, so check the digits explicitly.
  if re.match(r"[0-9A-Fa-f]{4,6}\Z", s) is None:
    raise InputError("invalid Unicode value %s" % (s,))
  v = int(s, 16)
  if v > _RUNE_MAX:
    raise InputError("invalid Unicode value %s" % (s,))
  return v


def _URange(s):
  """Converts string to Unicode range.

    '0001..0003' => [1, 2, 3].
    '0001' => [1].

  A two-ended range must be strictly increasing; 'X..X' and reversed
  ranges are rejected.

  Args:
    s: string to convert

  Returns:
    Unicode range, as a list of code points

  Raises:
    InputError: the string is not a valid Unicode range.
  """
  a = s.split("..")
  if len(a) == 1:
    return [_UInt(a[0])]
  if len(a) == 2:
    lo = _UInt(a[0])
    hi = _UInt(a[1])
    if lo < hi:
      # list() keeps the result a real list under Python 3 as well,
      # where range() returns a lazy range object.
      return list(range(lo, hi + 1))
  # Falls through for a non-increasing range or more than one '..'.
  raise InputError("invalid Unicode range %s" % (s,))


def _UStr(v):
  """Converts Unicode code point to hex string.

    0x263a => '0x263A'.

  Args:
    v: code point to convert

  Returns:
    '0x'-prefixed upper-case hex string, zero-padded to at least
    four digits

  Raises:
    InputError: the argument is not a valid Unicode value.
  """
  if v < 0 or v > _RUNE_MAX:
    raise InputError("invalid Unicode value %s" % (v,))
  return "0x%04X" % (v,)


def _ParseContinue(s):
  """Parses a Unicode continuation field.

  These are of the form '<Name, First>' or '<Name, Last>'.
  Instead of giving an explicit range in a single table entry,
  some Unicode tables use two entries, one for the first
  code value in the range and one for the last.
  The first entry's description is '<Name, First>' instead of 'Name'
  and the second is '<Name, Last>'.

    '<Name, First>' => ('Name', 'First')
    '<Name, Last>' => ('Name', 'Last')
    'Anything else' => ('Anything else', None)

  Args:
    s: continuation field string

  Returns:
    pair: name and ('First', 'Last', or None)
  """

  # Anchored at the end so that text trailing the closing '>' is not
  # mistaken for a continuation marker.
  match = re.match(r"<(.*), (First|Last)>$", s)
  if match is not None:
    return match.groups()
  return (s, None)


def ReadUnicodeTable(filename, nfields, doline):
  """Generic Unicode table text file reader.

  The reader takes care of stripping out comments and also
  parsing the two different ways that the Unicode tables specify
  code ranges (using the .. notation and splitting the range across
  multiple lines).

  Each non-comment line in the table is expected to have the given
  number of fields.  The first field is known to be the Unicode value
  and the second field its description.

  The reader calls doline(codes, fields) for each entry in the table.
  If parsing a line or calling doline raises an exception, the reader
  prints that exception, prefixed with the file name and line number,
  and re-raises it; the rest of the file is not processed.

  Arguments:
    filename: the Unicode data file to read (a local path or an
      http:// or https:// URL), or a file-like object.
    nfields: the number of expected fields per line in that file.
    doline: the function to call for each table entry.

  Raises:
    InputError: nfields is invalid (must be >= 2), a line is
      malformed, or the file ends in the middle of a First/Last range.
  """

  if nfields < 2:
    raise InputError("invalid number of fields %d" % (nfields,))

  # Only close the file if it was opened here; a caller-supplied
  # file-like object stays under the caller's control.
  opened_here = isinstance(filename, str)
  if opened_here:
    if filename.startswith(("http://", "https://")):
      fil = urllib2.urlopen(filename)
    else:
      fil = open(filename, "r")
  else:
    fil = filename

  first = None        # first code in multiline range
  expect_last = None  # tag expected for "Last" line in multiline range
  lineno = 0          # current line number
  try:
    for line in fil:
      lineno += 1
      try:
        # URL responses yield bytes under Python 3; the parsing below
        # works on text.  (Under Python 2 bytes is str, so this is a no-op.)
        if isinstance(line, bytes) and not isinstance(line, str):
          line = line.decode("utf-8")

        # Chop # comments and white space; ignore empty lines.
        sharp = line.find("#")
        if sharp >= 0:
          line = line[:sharp]
        line = line.strip()
        if not line:
          continue

        # Split fields on ";", chop more white space.
        # Must have the expected number of fields.
        fields = [s.strip() for s in line.split(";")]
        if len(fields) != nfields:
          raise InputError("wrong number of fields %d %d - %s" %
                           (len(fields), nfields, line))

        # The Unicode text files have two different ways
        # to list a Unicode range.  Either the first field is
        # itself a range (0000..FFFF), or the range is split
        # across two lines, with the second field noting
        # the continuation.
        codes = _URange(fields[0])
        (name, cont) = _ParseContinue(fields[1])

        if expect_last is not None:
          # If the last line gave the First code in a range,
          # this one had better give the Last one.
          if (len(codes) != 1 or codes[0] <= first or
              cont != "Last" or name != expect_last):
            raise InputError("expected Last line for %s" %
                             (expect_last,))
          codes = range(first, codes[0] + 1)
          first = None
          expect_last = None
          # Present the pair of lines to doline as a single range entry.
          fields[0] = "%04X..%04X" % (codes[0], codes[-1])
          fields[1] = name
        elif cont == "First":
          # Otherwise, if this is the First code in a range,
          # remember it and go to the next line.
          if len(codes) != 1:
            raise InputError("bad First line: range given")
          expect_last = name
          first = codes[0]
          continue

        doline(codes, fields)

      except Exception as e:
        print("%s:%d: %s" % (filename, lineno, e))
        raise

    if expect_last is not None:
      raise InputError("expected Last line for %s; got EOF" %
                       (expect_last,))
  finally:
    if opened_here:
      fil.close()


def CaseGroups(unicode_dir=_UNICODE_DIR):
  """Returns the Unicode code groups equivalent under case folding.

  Each group is a sorted list of code points,
  and the list of groups is sorted by first code point
  in the group.

  Only the common ("C") and simple ("S") foldings listed in
  CaseFolding.txt are used.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    pair (togroup, groups): togroup is a dict mapping each folded
    (lowercase) code point to its group, and groups is the sorted
    list of all groups.
  """

  # Dict mapping lowercase code point to fold-equivalent group.
  togroup = {}

  def DoLine(codes, fields):
    """Process single CaseFolding.txt line, updating togroup."""
    (_, foldtype, lower, _) = fields
    if foldtype not in ("C", "S"):
      return
    lower = _UInt(lower)
    # The folded code point itself seeds its own group.
    togroup.setdefault(lower, [lower]).extend(codes)

  ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)

  # Sort each group in place (so the lists held by togroup are sorted
  # too), then order the groups.  sorted() is used because a Python 3
  # dict view has no sort() method.
  for g in togroup.values():
    g.sort()
  groups = sorted(togroup.values())
  return togroup, groups


def Scripts(unicode_dir=_UNICODE_DIR):
  """Returns dict mapping script names to code lists.

  Reads Scripts.txt from the given directory; each script's list
  holds its code points in the order the file lists them.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping script names to code lists
  """

  scripts = {}

  def DoLine(codes, fields):
    """Process single Scripts.txt line, updating scripts."""
    (_, name) = fields
    scripts.setdefault(name, []).extend(codes)

  ReadUnicodeTable(unicode_dir+"/Scripts.txt", 2, DoLine)
  return scripts


def Categories(unicode_dir=_UNICODE_DIR):
  """Returns dict mapping category names to code lists.

  Reads UnicodeData.txt from the given directory.  Besides each
  category as listed in the file (e.g. 'Lu'), the result also has an
  entry for its first letter (e.g. 'L') collecting the codes of every
  category that starts with that letter.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping category names to code lists
  """

  categories = {}

  def DoLine(codes, fields):
    """Process single UnicodeData.txt line, updating categories."""
    category = fields[2]
    categories.setdefault(category, []).extend(codes)
    # Add codes from Lu into L, etc.
    if len(category) > 1:
      short = category[0]
      categories.setdefault(short, []).extend(codes)

  ReadUnicodeTable(unicode_dir+"/UnicodeData.txt", 15, DoLine)
  return categories