extract_unicode_properties.py revision e967d3f6ac2e1e1f612f99b9c76abcb9e13bb7a2
#!/usr/bin/env python3

"""Extracts the XID_Start and XID_Continue Derived core properties from the ICU data files
and emits a std::array<> for binary searching.
"""

import re
import sys

# Maps each property bit to the C++ enumerator text emitted for it.
CharacterPropertyEnumMap = {
    1: "CharacterProperties::kXidStart",
    2: "CharacterProperties::kXidContinue"
}

# Matches data lines of the form "XXXX ; Prop" or "XXXX..YYYY ; Prop".
# The range separator is a literal ".." (escaped), so arbitrary characters
# between two four-character fields are not mistaken for a range.
# Each code point field must be exactly four word characters; lines whose
# code points have more digits (above U+FFFF) do not match and are skipped.
# NOTE(review): presumably the BMP-only restriction is intentional for the
# generated table -- confirm before widening it.
_PROPERTY_LINE_RE = re.compile(
    r"^(?P<first>\w{4})(\.\.(?P<last>\w{4}))?\W+;\W+(?P<prop>\w+)")


class CharacterProperty:
    """An inclusive range of code points sharing one property bitmask."""

    def __init__(self, first_char, last_char, prop_type):
        self.first_char = first_char  # first code point of the range (inclusive)
        self.last_char = last_char    # last code point of the range (inclusive)
        self.prop_type = prop_type    # OR of CharacterPropertyEnumMap keys

    def key(self):
        """Return the sort key for this range: its first code point."""
        return self.first_char

    def merge(self, other):
        """Extend this range in place to also cover `other`.

        The merge only succeeds when `other` starts immediately after this
        range ends and carries exactly the same property bitmask.

        Raises:
            KeyError: if the ranges are not adjacent or their bitmasks differ.
        """
        adjacent = self.last_char + 1 == other.first_char
        if not (adjacent and self.prop_type == other.prop_type):
            raise KeyError()
        self.last_char = other.last_char

    def __repr__(self):
        """Format the range as a C++ aggregate initializer for one table row."""
        types = [enum_str
                 for enum_int, enum_str in CharacterPropertyEnumMap.items()
                 if enum_int & self.prop_type]
        return "{}0x{:04x}, 0x{:04x}, {}{}".format(
            "{", self.first_char, self.last_char, ' | '.join(types), "}")


def extract_unicode_properties(f, props, chars_out):
    """Collect per-code-point property bits from property data lines.

    Args:
        f: iterable of text lines (e.g. an open file).
        props: dict mapping a property name to the bit to set for it; lines
            naming any other property are ignored.
        chars_out: dict mapping code point -> CharacterProperty. It is
            updated in place, so several files can accumulate into it.

    Returns:
        The same `chars_out` dict that was passed in.

    Raises:
        ValueError: if a matched line for a requested property has a code
            point field that is not valid hexadecimal.
    """
    for line in f:
        match = _PROPERTY_LINE_RE.match(line)
        if match is None:
            # Comments, blank lines and anything else that is not a data line.
            continue

        prop_name = match.group('prop')
        if prop_name not in props:
            continue

        first_str = match.group('first')
        last_str = match.group('last')
        start_char = int(first_str, 16)
        # A line without "..YYYY" describes a single code point.
        end_char = int(last_str, 16) if last_str else start_char
        prop_bit = props[prop_name]

        # The data file's ranges are inclusive, hence the +1.
        for char in range(start_char, end_char + 1):
            entry = chars_out.get(char)
            if entry is None:
                entry = chars_out[char] = CharacterProperty(char, char, 0)
            entry.prop_type |= prop_bit
    return chars_out


def flatten_unicode_properties(chars):
    """Coalesce per-code-point entries into sorted, maximal ranges.

    Adjacent code points with identical property bitmasks are merged into a
    single CharacterProperty. The first entry of each run is extended in
    place, so objects held in `chars` are mutated.

    Args:
        chars: dict mapping code point -> CharacterProperty.

    Returns:
        A list of CharacterProperty ranges ordered by first code point.
    """
    result = []
    for char_prop in sorted(chars.values(), key=CharacterProperty.key):
        if result:
            try:
                result[-1].merge(char_prop)
                continue
            except KeyError:
                # Not adjacent or different properties: start a new range.
                pass
        result.append(char_prop)
    return result

# License header emitted at the top of the generated C++ output.
# NOTE: the name shadows the interactive `license` builtin; it is kept
# unchanged so existing references to this module-level name still work.
license = """/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
"""


def main(argv):
    """Generate the C++ character property table on stdout.

    Args:
        argv: command-line arguments; argv[0] is the program name and every
            following element is a path to a properties data file.

    Returns:
        The process exit status: 1 when no input path was given, else 0.
    """
    if len(argv) < 2:
        # Report usage errors on stderr so they never end up inside
        # redirected generated output.
        print("must specify path to icu DerivedCoreProperties file (e.g: "
              "external/icu/icu4c/source/data/unidata/DerivedCoreProperties.txt)",
              file=sys.stderr)
        return 1

    props = {"XID_Start": 1, "XID_Continue": 2}
    char_props = {}
    for file_path in argv[1:]:
        # Explicit encoding keeps parsing independent of the host locale.
        with open(file_path, encoding="utf-8") as f:
            extract_unicode_properties(f, props, char_props)
    result = flatten_unicode_properties(char_props)

    # "{{" and "}}" are passed/printed literally: std::array aggregate
    # initialization uses a double pair of braces.
    print("{}\nconst static std::array<CharacterProperties, {}> sCharacterProperties = {}"
          .format(license, len(result), "{{"))
    for prop in result:
        print(" {},".format(prop))
    print("}};")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
