#!/usr/bin/python2.4
# Copyright (c) 2009 International Business Machines
# Corporation and others. All Rights Reserved.
#
#   file name:  ucdcopy.py
#   encoding:   US-ASCII
#   tab size:   8 (not used)
#   indentation:4
#
#   created on: 2009aug04
#   created by: Markus W. Scherer
#
# Copy Unicode Character Database (ucd) files from a tree
# of files downloaded from ftp://www.unicode.org/Public/5.2.0/
# to a folder like ICU's source/data/unidata/
# and modify some of the files to make them more compact.
#
# Invoke with two command-line parameters, for the source
# and destination folders.
20ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
21ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehoimport os
22ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehoimport os.path
23ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehoimport re
24ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehoimport shutil
25ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehoimport sys
26ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
# Matches a line that starts with hex digits (a data line) and carries a
# trailing "#" comment. Group 1 is the line without the comment and without
# the spaces immediately in front of the "#".
_strip_re = re.compile(r"^([0-9a-fA-F]+.+?) *#.*")
# Matches the leading "<hex code point> ;" of a single-code-point data line.
# Lines that already use "first..last" range syntax do not match.
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")


def _FormatRange(first, last, data):
  """Returns the output line for code points [first..last] sharing data."""
  if first == last:
    return "%04X%s\n" % (first, data)
  return "%04X..%04X%s\n" % (first, last, data)


def CopyAndStripWithOptionalMerge(s, t, do_merge):
  """Copies file s to file t, stripping comments and optionally merging lines.

  Trailing "#" comments are removed from lines that start with hex digits;
  all other lines only lose their trailing whitespace. Every output line is
  terminated with a single "\n".

  If do_merge is true, consecutive single-code-point lines whose code points
  are adjacent (c, c+1, ...) and whose remaining text (from the ";" on) is
  identical are collapsed into one "first..last;data" line. Any line that
  does not start with "<hex code point> ;" is written as is and ends the
  current range.

  Args:
    s: Path of the file to read.
    t: Path of the file to write; it is created or truncated.
    do_merge: Whether to merge adjacent code points with identical data.
  """
  in_file = open(s, "r")
  try:
    out_file = open(t, "w")
    try:
      first = -1  # First code point with first_data.
      last = -1  # Last code point with first_data; -1 means no open range.
      first_data = ""  # Common data for code points [first..last].
      for line in in_file:
        match = _strip_re.match(line)
        if match:
          line = match.group(1)
        else:
          line = line.rstrip()
        if not do_merge:
          # Only strip, don't merge: just output the stripped line.
          out_file.write(line + "\n")
          continue
        match = _code_point_re.match(line)
        if match:
          c = int(match.group(1), 16)
          # Keep everything from the ";" on, so that the data can be
          # appended directly to the reformatted code point or range.
          data = line[match.end() - 1:]
        else:
          c = -1
          data = ""
        if last >= 0 and (c != last + 1 or data != first_data):
          # This line does not continue the current range: output the range.
          out_file.write(_FormatRange(first, last, first_data))
          last = -1
        if c < 0:
          # No single-code-point data on this line, output as is.
          out_file.write(line + "\n")
        elif last < 0:
          # Set as the first line in a possible range.
          first = last = c
          first_data = data
        else:
          # Here c == last + 1 and data == first_data because the range
          # would have been flushed above otherwise: extend the range.
          last = c
      if last >= 0:
        # Output the last range in the file (only possible when merging).
        out_file.write(_FormatRange(first, last, first_data))
    finally:
      # close() also flushes; try/finally releases the handle on errors too.
      out_file.close()
  finally:
    in_file.close()
91ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
92ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
def CopyAndStrip(s, t):
  """Copies file s to file t, dropping the comments that follow data lines.

  Comments on lines that are not data lines are kept. No lines are merged.
  """
  CopyAndStripWithOptionalMerge(s, t, do_merge=False)
96ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
97ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
def CopyAndStripAndMerge(s, t):
  """Copies file s to file t, stripping comments and merging data lines.

  In addition to removing comments, lines for adjacent code points that
  carry identical per-code point data are combined into a single line
  that uses range syntax.
  """
  CopyAndStripWithOptionalMerge(s, t, do_merge=True)
106ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
107ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
# Maps the (version-less) name of each UCD file to be processed to the
# function that is called with (source path, destination path) to write it.
_unidata_files = {}

# Simply copy these files.
_unidata_files.update(dict.fromkeys([
  "BidiMirroring.txt",
  "BidiTest.txt",
  "Blocks.txt",
  "CaseFolding.txt",
  "DerivedAge.txt",
  "DerivedBidiClass.txt",
  "DerivedJoiningGroup.txt",
  "DerivedJoiningType.txt",
  "DerivedNumericValues.txt",
  "NameAliases.txt",
  "NormalizationCorrections.txt",
  "PropertyAliases.txt",
  "PropertyValueAliases.txt",
  "SpecialCasing.txt",
  "UnicodeData.txt",
], shutil.copy))

# Copy these files and remove comments behind data lines but not in others.
_unidata_files.update(dict.fromkeys([
  "DerivedCoreProperties.txt",
  "DerivedNormalizationProps.txt",
  "GraphemeBreakProperty.txt",
  "NormalizationTest.txt",
  "PropList.txt",
  "Scripts.txt",
  "SentenceBreakProperty.txt",
  "WordBreakProperty.txt",
], CopyAndStrip))

# Also merge lines with adjacent code point ranges.
_unidata_files.update(dict.fromkeys([
  "EastAsianWidth.txt",
  "LineBreak.txt",
], CopyAndStripAndMerge))
140ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
# Matches a file name with an embedded version, such as "LineBreak-5.2.0.txt"
# or "BidiTest-5.2.0d12.txt". Group 1 is the name in front of the "-" and
# group 2 is the extension including the ".", so group 1 + group 2 is the
# version-less file name. Version components may have more than one digit
# (for example "-10.0.0" or "-5.12.0").
_file_version_re = re.compile(r"^([a-zA-Z0-9]+)" +
                              r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?" +
                              r"(\.[a-z]+)$")
144ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
def main():
  """Copies the known UCD files from the source tree to the destination.

  Reads the source and destination folders from sys.argv[1] and sys.argv[2].
  Every file found under the source folder is looked up in _unidata_files,
  after removing a version suffix from its name if there is one, and the
  associated function writes it into the destination folder. Files with
  other names are ignored.

  Exits with status 2 if fewer than two folders are given, and with status 1
  if two source files map to the same destination file name.
  """
  if len(sys.argv) < 3:
    sys.stderr.write("usage: %s source_root dest_root\n" % sys.argv[0])
    sys.exit(2)
  source_root = sys.argv[1]
  dest_root = sys.argv[2]
  # Collect all paths before copying anything, so that files written during
  # processing are never picked up by the walk.
  source_files = []
  for root, unused_dirs, files in os.walk(source_root):
    for name in files:
      source_files.append(os.path.join(root, name))
  files_processed = set()
  for source_file in source_files:
    basename = os.path.basename(source_file)
    match = _file_version_re.match(basename)
    if match:
      # Drop the version, e.g. "LineBreak-5.2.0.txt" -> "LineBreak.txt".
      basename = match.group(1) + match.group(2)
      # Single-argument print(...) behaves the same in Python 2 and 3.
      print(basename)
    if basename in _unidata_files:
      if basename in files_processed:
        print("duplicate file basename %s!" % basename)
        sys.exit(1)
      files_processed.add(basename)
      dest_file = os.path.join(dest_root, basename)
      _unidata_files[basename](source_file, dest_file)


if __name__ == "__main__":
  main()
170