extract_unicode_properties.py revision 66ea8400383d5737b996a136f3aead0965f7be3d
#!/usr/bin/env python3

"""Extracts the XID_Start and XID_Continue Derived core properties from the ICU data files
and emits a std::array<> for binary searching.
"""

import re
import sys

# Maps each property bit to the C++ enumerator emitted for it in the generated table.
CharacterPropertyEnumMap = {
    1: "CharacterProperties::kXidStart",
    2: "CharacterProperties::kXidContinue"
}

# One data line: "XXXX ; Prop" or "XXXX..YYYY ; Prop". Code points are 4-6 hex
# digits so that ranges above U+FFFF are accepted as well as 4-digit ones.
_LINE_PATTERN = re.compile(
    r"^(?P<first>[0-9A-Fa-f]{4,6})"
    r"(?:\.\.(?P<last>[0-9A-Fa-f]{4,6}))?"
    r"\s*;\s*(?P<prop>\w+)$")


class CharacterProperty:
    """An inclusive range of code points that share one bitmask of properties.

    Attributes:
        first_char: first code point of the range (int).
        last_char: last code point of the range, inclusive (int).
        prop_type: bitwise OR of the keys of CharacterPropertyEnumMap.
    """

    def __init__(self, first_char, last_char, prop_type):
        self.first_char = first_char
        self.last_char = last_char
        self.prop_type = prop_type

    def key(self):
        """Return the sort key of this range: its first code point."""
        return self.first_char

    def can_merge(self, other):
        """Return True if `other` starts right after this range with the same properties."""
        return (self.last_char + 1 == other.first_char
                and self.prop_type == other.prop_type)

    def merge(self, other):
        """Extend this range to also cover `other`.

        Raises:
            KeyError: if `other` is not directly adjacent to this range or
                carries a different property bitmask.
        """
        if not self.can_merge(other):
            raise KeyError()
        self.last_char = other.last_char

    def __repr__(self):
        """Format the range as a C++ aggregate initializer: {first, last, flags}."""
        types = [enum_str
                 for enum_int, enum_str in CharacterPropertyEnumMap.items()
                 if enum_int & self.prop_type]
        return "{{0x{:04x}, 0x{:04x}, {}}}".format(
            self.first_char, self.last_char, ' | '.join(types))


def extract_unicode_properties(f, props):
    """Parse property data lines and return the merged, sorted ranges.

    Args:
        f: iterable of text lines (e.g. an open DerivedCoreProperties file).
        props: dict mapping the property names to extract to their bit value.

    Returns:
        A list of CharacterProperty sorted by first code point, in which
        adjacent code points with an identical bitmask are coalesced into a
        single range.

    Lines that are not data lines, or that name a property not in `props`,
    are skipped. Anything after a '#' is treated as a comment and ignored,
    and a trailing newline is not required.
    """
    masks = {}  # code point -> OR of the bits of every requested property it has
    for line in f:
        data = line.split("#", 1)[0].strip()
        match = _LINE_PATTERN.match(data)
        if not match:
            continue
        prop_name = match.group('prop')
        if prop_name not in props:
            continue
        prop_type = props[prop_name]
        first_char = int(match.group('first'), 16)
        last_char_str = match.group('last')
        last_char = int(last_char_str, 16) if last_char_str else first_char
        for char in range(first_char, last_char + 1):
            masks[char] = masks.get(char, 0) | prop_type

    result = []
    for char in sorted(masks):
        char_prop = CharacterProperty(char, char, masks[char])
        # Test adjacency directly instead of raising once per range boundary.
        if result and result[-1].can_merge(char_prop):
            result[-1].merge(char_prop)
        else:
            result.append(char_prop)
    return result


license = """/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
"""

if __name__ == "__main__":
    if len(sys.argv) != 2:
        # The usage error goes to stderr (exit status 1) so it can never end
        # up inside a generated file when stdout is redirected.
        sys.exit("must specify path to icu DerivedCoreProperties file (e.g: "
                 "external/icu/icu4c/source/data/unidata/DerivedCoreProperties.txt)")

    with open(sys.argv[1], encoding="utf-8") as f:
        props = {"XID_Start": 1, "XID_Continue": 2}
        char_props = extract_unicode_properties(f, props)
    print("{}\nconst static std::array<CharacterProperties, {}> sCharacterProperties = {}"
          .format(license, len(char_props), "{{"))
    for prop in char_props:
        print(" {},".format(prop))
    print("}};")