#!/usr/bin/python2.4
# Copyright (c) 2009 International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: ucdcopy.py
# encoding: US-ASCII
# tab size: 8 (not used)
# indentation:4
#
# created on: 2009aug04
# created by: Markus W. Scherer
#
# Copy Unicode Character Database (ucd) files from a tree
# of files downloaded from ftp://www.unicode.org/Public/5.2.0/
# to a folder like ICU's source/data/unidata/
# and modify some of the files to make them more compact.
#
# Invoke with two command-line parameters, for the source
# and destination folders.

import os
import os.path
import re
import shutil
import sys

# Matches a line that starts with hex digits and has a trailing "#" comment.
# Group 1 is the text before the comment, without the spaces in front of "#".
_strip_re = re.compile("^([0-9a-fA-F]+.+?) *#.*")
# Matches a leading single code point field up to and including its ";".
# A range like "0000..001F;" does not match, so such lines are never merged.
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")


def _WriteRange(out_file, first, last, data):
    """Writes one line for the code points [first..last] that share data.

    A single code point is written as "XXXX<data>", a longer range as
    "XXXX..YYYY<data>". Code points are formatted as upper-case hex with
    at least four digits.
    """
    if first == last:
        out_file.write("%04X%s\n" % (first, data))
    else:
        out_file.write("%04X..%04X%s\n" % (first, last, data))


def CopyAndStripWithOptionalMerge(s, t, do_merge):
    """Copies file s to file t, stripping comments and optionally merging.

    Trailing "#" comments are removed from lines that start with hex digits;
    all other lines only lose their trailing whitespace.

    Args:
        s: Path of the file to read.
        t: Path of the file to write; it is created or overwritten.
        do_merge: If true, consecutive lines for single code points c, c+1, ...
            with identical remaining data are combined into one line that
            uses the "first..last" range syntax.

    Both files are closed even if reading or writing raises.
    """
    in_file = open(s, "r")
    # Nested try/finally rather than "with" so that the script keeps running
    # on the Python 2.4 interpreter named in the shebang line.
    try:
        out_file = open(t, "w")
        try:
            first = -1  # First code point with first_data.
            last = -1  # Last code point with first_data.
            first_data = ""  # Common data for code points [first..last].
            for line in in_file:
                match = _strip_re.match(line)
                if match:
                    line = match.group(1)
                else:
                    line = line.rstrip()
                if do_merge:
                    match = _code_point_re.match(line)
                    if match:
                        c = int(match.group(1), 16)
                        # Start one before the end of the match so that the
                        # data keeps its leading ";".
                        data = line[match.end() - 1:]
                    else:
                        c = -1
                        data = ""
                    if last >= 0 and (c != (last + 1) or data != first_data):
                        # This line does not extend the pending range:
                        # output the range before handling the line itself.
                        _WriteRange(out_file, first, last, first_data)
                        first = -1
                        last = -1
                        first_data = ""
                    if c < 0:
                        # No data on this line, output as is.
                        out_file.write(line)
                        out_file.write("\n")
                    elif last < 0:
                        # Set as the first line in a possible range.
                        first = c
                        last = c
                        first_data = data
                    else:
                        # Must be c == (last + 1) and data == first_data
                        # because of the previous conditions:
                        # continue with the current range.
                        last = c
                else:
                    # Only strip, don't merge: output the stripped line.
                    out_file.write(line)
                    out_file.write("\n")
            if do_merge and last >= 0:
                # Output the last range in the file.
                _WriteRange(out_file, first, last, first_data)
        finally:
            out_file.close()
    finally:
        in_file.close()


def CopyAndStrip(s, t):
    """Copies a file and removes comments behind data lines but not in others.

    Args:
        s: Path of the file to read.
        t: Path of the file to write.
    """
    CopyAndStripWithOptionalMerge(s, t, False)


def CopyAndStripAndMerge(s, t):
    """Copies and strips a file and merges lines.

    Copies a file, removes comments, and
    merges lines with adjacent code point ranges and identical per-code point
    data lines into one line with range syntax.

    Args:
        s: Path of the file to read.
        t: Path of the file to write.
    """
    CopyAndStripWithOptionalMerge(s, t, True)


# Maps each known file name (without version suffix) to the function
# that is called as function(source_path, destination_path) to copy it.
_unidata_files = {
    # Simply copy these files.
    "BidiMirroring.txt": shutil.copy,
    "BidiTest.txt": shutil.copy,
    "Blocks.txt": shutil.copy,
    "CaseFolding.txt": shutil.copy,
    "DerivedAge.txt": shutil.copy,
    "DerivedBidiClass.txt": shutil.copy,
    "DerivedJoiningGroup.txt": shutil.copy,
    "DerivedJoiningType.txt": shutil.copy,
    "DerivedNumericValues.txt": shutil.copy,
    "NameAliases.txt": shutil.copy,
    "NormalizationCorrections.txt": shutil.copy,
    "PropertyAliases.txt": shutil.copy,
    "PropertyValueAliases.txt": shutil.copy,
    "SpecialCasing.txt": shutil.copy,
    "UnicodeData.txt": shutil.copy,

    # Copy these files and remove comments behind data lines but not in others.
    "DerivedCoreProperties.txt": CopyAndStrip,
    "DerivedNormalizationProps.txt": CopyAndStrip,
    "GraphemeBreakProperty.txt": CopyAndStrip,
    "NormalizationTest.txt": CopyAndStrip,
    "PropList.txt": CopyAndStrip,
    "Scripts.txt": CopyAndStrip,
    "SentenceBreakProperty.txt": CopyAndStrip,
    "WordBreakProperty.txt": CopyAndStrip,

    # Also merge lines with adjacent code point ranges.
    "EastAsianWidth.txt": CopyAndStripAndMerge,
    "LineBreak.txt": CopyAndStripAndMerge,
}

# Matches a file name with a version suffix such as "Name-5.2.0.txt",
# "Name-10.0.0.txt" or "Name-5.2.0d12.txt". Group 1 is the name and group 2
# the extension. Each version component may have more than one digit.
_file_version_re = re.compile(r"^([a-zA-Z0-9]+)"
                              r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?"
                              r"(\.[a-z]+)$")


def main():
    """Copies the known files from the source tree to the destination folder.

    Reads the source and destination folders from sys.argv[1] and
    sys.argv[2]. Walks the whole source tree, removes a version suffix from
    each file name, and processes every file whose resulting name is a key of
    _unidata_files with the function registered there. Other files are
    ignored.

    Exits with status 1 if fewer than two folders are given or if two source
    files map to the same destination name.
    """
    if len(sys.argv) < 3:
        sys.stderr.write("usage: ucdcopy.py source_root dest_root\n")
        sys.exit(1)
    source_root = sys.argv[1]
    dest_root = sys.argv[2]
    source_files = []
    for root, dirs, files in os.walk(source_root):
        for name in files:
            source_files.append(os.path.join(root, name))
    files_processed = set()
    for source_file in source_files:
        basename = os.path.basename(source_file)
        match = _file_version_re.match(basename)
        if match:
            basename = match.group(1) + match.group(2)
            # NOTE(review): the indentation of this print was lost in the
            # copy this was restored from; it is assumed to belong to the
            # "if match" branch -- confirm against the upstream file.
            print(basename)
        if basename in _unidata_files:
            if basename in files_processed:
                print("duplicate file basename %s!" % basename)
                sys.exit(1)
            files_processed.add(basename)
            dest_file = os.path.join(dest_root, basename)
            _unidata_files[basename](source_file, dest_file)


if __name__ == "__main__":
    main()