extract_unicode_properties.py revision 66ea8400383d5737b996a136f3aead0965f7be3d
1#!/bin/env python3
2
3"""Extracts the XID_Start and XID_Continue Derived core properties from the ICU data files
4and emits a std::array<> for binary searching.
5"""
6
7import re
8import sys
9
# Bit flag -> C++ enumerator name printed for that flag in the generated table.
CharacterPropertyEnumMap = {
    1: "CharacterProperties::kXidStart",
    2: "CharacterProperties::kXidContinue",
}
14
class CharacterProperty:
    """An inclusive range of code points sharing one set of property flags.

    Attributes:
        first_char: First code point of the range (int).
        last_char: Last code point of the range, inclusive (int).
        prop_type: Bitwise OR of the flag values used as keys of
            CharacterPropertyEnumMap.
    """

    # The parser creates one instance per code point, so avoid paying for a
    # per-instance __dict__.
    __slots__ = ("first_char", "last_char", "prop_type")

    def __init__(self, first_char, last_char, prop_type):
        self.first_char = first_char
        self.last_char = last_char
        self.prop_type = prop_type

    def key(self):
        """Return the sort key of this range: its first code point."""
        return self.first_char

    def merge(self, other):
        """Extend this range in place so that it also covers `other`.

        Raises:
            KeyError: if `other` does not start directly after this range or
                carries different flags. This range is left unchanged.
        """
        if self.last_char + 1 != other.first_char or self.prop_type != other.prop_type:
            raise KeyError("ranges are not adjacent or differ in properties")
        self.last_char = other.last_char

    def __repr__(self):
        """Format the range as a C++ aggregate initializer.

        The result has the form `{0x0041, 0x005a, <flags>}` where `<flags>` is
        the ` | `-joined list of enumerator names whose bit is set.
        """
        names = [name for bit, name in CharacterPropertyEnumMap.items()
                 if bit & self.prop_type]
        # With no flag set the joined list would be empty and leave a dangling
        # comma in the initializer; spell "no flags" as 0 instead.
        flags = " | ".join(names) if names else "0"
        return "{{0x{:04x}, 0x{:04x}, {}}}".format(self.first_char, self.last_char, flags)
37
def extract_unicode_properties(f, props):
    """Parse property lines from `f` and return merged code point ranges.

    Recognized lines look like `0041..005A ; XID_Start` or `00AA ; XID_Start`,
    optionally followed by a `#` comment. Every other line is skipped.

    Args:
        f: Iterable of text lines, e.g. an open DerivedCoreProperties.txt.
        props: Mapping from property name to its integer bit flag. Lines that
            name a property missing from the mapping are skipped.

    Returns:
        A list of CharacterProperty ranges sorted by first code point, where
        neighbouring code points with identical combined flags have been
        merged into a single range.
    """
    # NOTE(review): only 4-digit code points are matched, so entries above
    # U+FFFF are skipped. This looks deliberate; confirm before widening.
    line_re = re.compile(
        r"^(?P<first>[0-9A-Fa-f]{4})(\.\.(?P<last>[0-9A-Fa-f]{4}))?"
        r"\W+;\W+(?P<prop>\w+)\s*(?:#.*)?$")

    # The same code point can appear under several properties, so collect the
    # flags per code point first and build ranges afterwards.
    chars = {}
    for line in f:
        match = line_re.match(line)
        if not match or match.group('prop') not in props:
            continue
        prop_type = props[match.group('prop')]
        first_char = int(match.group('first'), 16)
        # A line without "..last" describes a single code point.
        last_char = int(match.group('last') or match.group('first'), 16)
        for char in range(first_char, last_char + 1):
            if char not in chars:
                chars[char] = CharacterProperty(char, char, 0)
            chars[char].prop_type |= prop_type

    merged = []
    # Each dict key equals the first_char of its entry, so sorting the keys
    # visits the entries in code point order.
    for char in sorted(chars):
        char_prop = chars[char]
        if not merged:
            merged.append(char_prop)
            continue
        try:
            merged[-1].merge(char_prop)
        except KeyError:
            # Not adjacent, or the flags differ: start a new range.
            merged.append(char_prop)
    return merged
66
# Apache 2.0 license header printed at the top of the generated output.
# NOTE: this name shadows the `license` builtin installed by the `site`
# module; it is kept because the __main__ section refers to it by this name.
license = """/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
"""
83
if __name__ == "__main__":
    # Script entry point: reads the properties file named by the single
    # command-line argument and prints the generated C++ table to stdout.
    if len(sys.argv) != 2:
        # Diagnostics go to stderr so they never end up in redirected output.
        print("must specify path to icu DerivedCoreProperties file (e.g: "
              "external/icu/icu4c/source/data/unidata/DerivedCoreProperties.txt)",
              file=sys.stderr)
        sys.exit(1)

    props = {"XID_Start": 1, "XID_Continue": 2}
    # Decode as UTF-8 explicitly instead of depending on the locale's default
    # encoding; the file is only needed while parsing.
    with open(sys.argv[1], encoding="utf-8") as f:
        char_props = extract_unicode_properties(f, props)

    # "{{" is passed as an argument (not as a format escape), so the output
    # opens with two braces, matching the two closing braces printed below.
    print("{}\nconst static std::array<CharacterProperties, {}> sCharacterProperties = {}"
          .format(license, len(char_props), "{{"))
    for prop in char_props:
        print("    {},".format(prop))
    print("}};")
98
99