genprops/misc/ucdcopy.py

#!/usr/bin/python2.4
# Copyright (c) 2009 International Business Machines
# Corporation and others. All Rights Reserved.
#
#   file name:  ucdcopy.py
#   encoding:   US-ASCII
#   tab size:   8 (not used)
#   indentation:4
#
#   created on: 2009aug04
#   created by: Markus W. Scherer
#
# Copy Unicode Character Database (ucd) files from a tree
# of files downloaded from ftp://www.unicode.org/Public/5.2.0/
# to a folder like ICU's source/data/unidata/
# and modify some of the files to make them more compact.
#
# Invoke with two command-line parameters, for the source
# and destination folders.

import os
import os.path
import re
import shutil
import sys

# Matches a data line (one that starts with a hex code point) which carries a
# trailing "#" comment. Group 1 is the line without the comment and without
# the spaces directly in front of the "#".
_strip_re = re.compile(r"^([0-9a-fA-F]+.+?) *#.*")
# Matches a single leading code point up to and including its ";" delimiter.
# Group 1 is the hex code point. Lines that already use the range syntax
# ("XXXX..YYYY;") do not match.
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")


def _WriteRange(out_file, first, last, data):
  """Writes one data line for the code points [first..last] sharing data."""
  if first == last:
    out_file.write("%04X%s\n" % (first, data))
  else:
    out_file.write("%04X..%04X%s\n" % (first, last, data))


def _StripAndCopyLines(in_file, out_file, do_merge):
  """Copies lines from in_file to out_file, stripping and optionally merging."""
  first = -1  # First code point with first_data.
  last = -1  # Last code point with first_data; negative: no pending range.
  first_data = ""  # Common data for code points [first..last].
  for line in in_file:
    match = _strip_re.match(line)
    if match:
      line = match.group(1)
    else:
      line = line.rstrip()
    if not do_merge:
      # Only strip, don't merge: just output the stripped line.
      out_file.write(line + "\n")
      continue
    match = _code_point_re.match(line)
    if match:
      c = int(match.group(1), 16)
      # Keep everything from the ";" on, so that equal data compares equal
      # regardless of the code point in front of it.
      data = line[match.end() - 1:]
    else:
      c = -1
      data = ""
    if last >= 0 and (c != last + 1 or data != first_data):
      # This line does not continue the pending range: output the range.
      _WriteRange(out_file, first, last, first_data)
      last = -1
    if c < 0:
      # No single-code point data on this line, output as is.
      out_file.write(line + "\n")
    elif last < 0:
      # Start a new possible range with this line.
      first = c
      last = c
      first_data = data
    else:
      # Here c == last + 1 and data == first_data because of the check above:
      # extend the pending range.
      last = c
  if last >= 0:
    # Output the last range in the file (only ever pending when merging).
    _WriteRange(out_file, first, last, first_data)


def CopyAndStripWithOptionalMerge(s, t, do_merge):
  """Copies a file, strips comments behind data lines, and optionally merges.

  Data lines are lines that start with a hexadecimal code point. A "#" comment
  behind the data is removed from them; other lines only lose trailing
  whitespace. Every output line ends with "\n".

  With do_merge, consecutive lines for single code points c, c+1, ... that
  carry identical data are combined into one "first..last" range line.
  Code points of merged or re-written single lines are output as at least
  four uppercase hex digits.

  Args:
    s: Path of the source file.
    t: Path of the destination file; it is overwritten.
    do_merge: Whether to merge adjacent code points with identical data.
  """
  # try/finally rather than "with": the latter is not available in the
  # Python version named in this script's shebang line.
  in_file = open(s, "r")
  try:
    out_file = open(t, "w")
    try:
      _StripAndCopyLines(in_file, out_file, do_merge)
    finally:
      out_file.close()
  finally:
    in_file.close()


def CopyAndStrip(s, t):
  """Copies file s to t, removing comments that follow data on a line.

  Lines without data in front of a comment keep their comments.
  No lines are merged.
  """
  CopyAndStripWithOptionalMerge(s, t, do_merge=False)


def CopyAndStripAndMerge(s, t):
  """Copies file s to t, stripping comments and compacting ranges.

  Comments behind data are removed. In addition, consecutive lines for
  adjacent code points whose per-code point data is identical are
  collapsed into a single line that uses range syntax.
  """
  CopyAndStripWithOptionalMerge(s, t, do_merge=True)


# Maps the unversioned name of each UCD file that gets copied to the
# function that copies it, called as function(source_file, dest_file).
_unidata_files = {}

# Simply copy these files.
_unidata_files.update(dict.fromkeys((
    "BidiMirroring.txt",
    "BidiTest.txt",
    "Blocks.txt",
    "CaseFolding.txt",
    "DerivedAge.txt",
    "DerivedBidiClass.txt",
    "DerivedJoiningGroup.txt",
    "DerivedJoiningType.txt",
    "DerivedNumericValues.txt",
    "NameAliases.txt",
    "NormalizationCorrections.txt",
    "PropertyAliases.txt",
    "PropertyValueAliases.txt",
    "SpecialCasing.txt",
    "UnicodeData.txt",
), shutil.copy))

# Copy these files and remove comments behind data lines but not in others.
_unidata_files.update(dict.fromkeys((
    "DerivedCoreProperties.txt",
    "DerivedNormalizationProps.txt",
    "GraphemeBreakProperty.txt",
    "NormalizationTest.txt",
    "PropList.txt",
    "Scripts.txt",
    "SentenceBreakProperty.txt",
    "WordBreakProperty.txt",
), CopyAndStrip))

# Also merge lines with adjacent code point ranges.
_unidata_files.update(dict.fromkeys((
    "EastAsianWidth.txt",
    "LineBreak.txt",
), CopyAndStripAndMerge))

_file_version_re = re.compile("^([a-zA-Z0-9]+)" +
                              "-[0-9](?:\\.[0-9])*(?:d[0-9]+)?" +
                              "(\\.[a-z]+)$")

def main():
  """Copies the known UCD files from a source tree into a flat folder.

  Expects two command-line arguments: the root of the source folder tree and
  the destination folder. Every file below the source root whose name, after
  removing an embedded version number, is a key of _unidata_files gets
  copied to the destination folder under its unversioned name, using the
  function registered for it. All other files are ignored.

  Prints the unversioned name of each versioned file that is found.
  Exits with status 1 if arguments are missing or if two source files map to
  the same destination name.
  """
  if len(sys.argv) < 3:
    sys.stderr.write("Usage: %s source_folder dest_folder\n" % sys.argv[0])
    sys.exit(1)
  source_root = sys.argv[1]
  dest_root = sys.argv[2]
  # Collect all source files before copying anything, so that files written
  # during copying cannot show up in the walk.
  source_files = []
  for root, dirs, files in os.walk(source_root):
    for name in files:
      source_files.append(os.path.join(root, name))
  files_processed = set()
  for source_file in source_files:
    basename = os.path.basename(source_file)
    match = _file_version_re.match(basename)
    if match:
      # Remove the version number from the file name.
      basename = match.group(1) + match.group(2)
      # Parenthesized single-argument print works as a statement and as a
      # function, so this runs under Python 2 and Python 3 alike.
      print(basename)
    if basename not in _unidata_files:
      continue
    if basename in files_processed:
      print("duplicate file basename %s!" % basename)
      sys.exit(1)
    files_processed.add(basename)
    dest_file = os.path.join(dest_root, basename)
    _unidata_files[basename](source_file, dest_file)


# Run the copy only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
  main()