1#!/usr/bin/python2.4
2# Copyright (c) 2009 International Business Machines
3# Corporation and others. All Rights Reserved.
4#
5#   file name:  ucdcopy.py
6#   encoding:   US-ASCII
7#   tab size:   8 (not used)
8#   indentation:4
9#
10#   created on: 2009aug04
11#   created by: Markus W. Scherer
12#
13# Copy Unicode Character Database (ucd) files from a tree
14# of files downloaded from ftp://www.unicode.org/Public/5.2.0/
15# to a folder like ICU's source/data/unidata/
16# and modify some of the files to make them more compact.
17#
18# Invoke with two command-line parameters, for the source
19# and destination folders.
20
21import os
22import os.path
23import re
24import shutil
25import sys
26
# Matches a data line (one starting with hex digits) that carries a trailing
# "# comment". Group 1 is the data without the comment and without the
# spaces directly in front of the '#'.
_strip_re = re.compile(r"^([0-9a-fA-F]+.+?) *#.*")
# Matches a single leading code point up to and including its ';' separator.
# Lines that already use "start..end" range syntax do not match.
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")


def _WriteRange(out_file, first, last, data):
  """Writes one line for the code points [first..last] that share data."""
  if first == last:
    out_file.write("%04X%s\n" % (first, data))
  else:
    out_file.write("%04X..%04X%s\n" % (first, last, data))


def CopyAndStripWithOptionalMerge(s, t, do_merge):
  """Copies text file s to t, stripping comments and optionally merging lines.

  On a data line (a line that starts with hex digits) a trailing "# comment"
  and the spaces in front of it are removed. Every other line only loses its
  trailing whitespace. Each output line ends with a single "\\n".

  If do_merge is true, consecutive lines that each start with a single code
  point are merged when their code points are adjacent and the rest of the
  line (from the ';' on) is identical. Such a run is written as one line
  "first..last;data", or "first;data" for a run of one. Code points are
  written as uppercase hex with at least four digits. A line without a single
  leading code point ends the current run and is written as is.

  Args:
    s: Path of the source file.
    t: Path of the destination file; it is overwritten.
    do_merge: Whether to merge adjacent single-code point lines into ranges.
  """
  in_file = open(s, "r")
  # try/finally instead of "with" so that both files get closed on errors
  # while staying compatible with the old Python versions this script targets.
  try:
    out_file = open(t, "w")
    try:
      first = -1  # First code point with first_data.
      last = -1  # Last code point with first_data.
      first_data = ""  # Common data for code points [first..last].
      for line in in_file:
        match = _strip_re.match(line)
        if match:
          line = match.group(1)
        else:
          line = line.rstrip()
        if not do_merge:
          # Only strip, don't merge: just output the stripped line.
          out_file.write(line)
          out_file.write("\n")
        else:
          match = _code_point_re.match(line)
          if match:
            c = int(match.group(1), 16)
            # Keep the ';' so that the data can be appended to a range as is.
            data = line[match.end() - 1:]
          else:
            c = -1
            data = ""
          if last >= 0 and (c != (last + 1) or data != first_data):
            # This line does not continue the pending range: output it.
            _WriteRange(out_file, first, last, first_data)
            first = -1
            last = -1
            first_data = ""
          if c < 0:
            # No single code point on this line, output as is.
            out_file.write(line)
            out_file.write("\n")
          elif last < 0:
            # Set as the first line in a possible range.
            first = c
            last = c
            first_data = data
          else:
            # Must be c == (last + 1) and data == first_data because of the
            # previous conditions: continue with the current range.
            last = c
      if do_merge and last >= 0:
        # Output the last range in the file.
        _WriteRange(out_file, first, last, first_data)
    finally:
      out_file.close()
  finally:
    in_file.close()
91
92
def CopyAndStrip(s, t):
  """Copies file s to t, dropping comments that follow data on a line.

  Comments on lines without data are kept. No lines are merged.
  """
  CopyAndStripWithOptionalMerge(s, t, do_merge=False)
96
97
def CopyAndStripAndMerge(s, t):
  """Copies file s to t, stripping comments and merging adjacent lines.

  In addition to removing comments, consecutive lines for adjacent code
  points that carry identical per-code point data are combined into a
  single line that uses range syntax.
  """
  CopyAndStripWithOptionalMerge(s, t, do_merge=True)
106
107
# Maps the name of each file to be copied to the function that copies it.
# Each function is called with the source and destination file paths.
_unidata_files = {}

# Simply copy these files.
_unidata_files.update(dict.fromkeys([
  "BidiMirroring.txt",
  "BidiTest.txt",
  "Blocks.txt",
  "CaseFolding.txt",
  "DerivedAge.txt",
  "DerivedBidiClass.txt",
  "DerivedJoiningGroup.txt",
  "DerivedJoiningType.txt",
  "DerivedNumericValues.txt",
  "NameAliases.txt",
  "NormalizationCorrections.txt",
  "PropertyAliases.txt",
  "PropertyValueAliases.txt",
  "SpecialCasing.txt",
  "UnicodeData.txt",
], shutil.copy))

# Copy these files and remove comments behind data lines but not in others.
_unidata_files.update(dict.fromkeys([
  "DerivedCoreProperties.txt",
  "DerivedNormalizationProps.txt",
  "GraphemeBreakProperty.txt",
  "NormalizationTest.txt",
  "PropList.txt",
  "Scripts.txt",
  "SentenceBreakProperty.txt",
  "WordBreakProperty.txt",
], CopyAndStrip))

# Also merge lines with adjacent code point ranges.
_unidata_files.update(dict.fromkeys([
  "EastAsianWidth.txt",
  "LineBreak.txt",
], CopyAndStripAndMerge))
140
# Matches a file name that carries a version suffix, for example
# "LineBreak-5.2.0.txt" or "Scripts-5.2.0d3.txt".
# Group 1 is the base name and group 2 is the extension including its dot.
# Each version field may have several digits (as in "10.0.0"), and the
# version may end with a "d" plus digits.
_file_version_re = re.compile(r"^([a-zA-Z0-9]+)"
                              r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?"
                              r"(\.[a-z]+)$")
144
def main():
  """Copies the known files from the source tree into the destination folder.

  Takes the source root folder from sys.argv[1] and the destination folder
  from sys.argv[2]. Walks the whole source tree. If a file name matches
  _file_version_re, the version suffix is removed and the resulting name is
  printed. Each file whose resulting name is a key of _unidata_files is
  written into the destination folder, under that name, by the function
  stored for it. All other files are ignored.

  Exits with status 2 after printing a usage message if fewer than two
  command-line parameters are given, and with status 1 if two source files
  map to the same destination file name.
  """
  if len(sys.argv) < 3:
    # Fail with a readable message rather than an IndexError traceback.
    sys.stderr.write(
        "usage: ucdcopy.py <source folder> <destination folder>\n")
    sys.exit(2)
  source_root = sys.argv[1]
  dest_root = sys.argv[2]
  source_files = []
  for root, _dirs, names in os.walk(source_root):
    for name in names:
      source_files.append(os.path.join(root, name))
  files_processed = set()
  for source_file in source_files:
    basename = os.path.basename(source_file)
    match = _file_version_re.match(basename)
    if match:
      # Drop the version suffix: keep only the base name and the extension.
      basename = match.group(1) + match.group(2)
      print(basename)
    if basename in _unidata_files:
      if basename in files_processed:
        # Two source files would overwrite each other in the destination.
        print("duplicate file basename %s!" % basename)
        sys.exit(1)
      files_processed.add(basename)
      dest_file = os.path.join(dest_root, basename)
      _unidata_files[basename](source_file, dest_file)
166
167
168if __name__ == "__main__":
169  main()
170