extract_unicode_properties.py revision e967d3f6ac2e1e1f612f99b9c76abcb9e13bb7a2
1#!/bin/env python3
2
3"""Extracts the XID_Start and XID_Continue Derived core properties from the ICU data files
4and emits a std::array<> for binary searching.
5"""
6
7import re
8import sys
9
# Maps each property bit flag to the C++ enumerator name that __repr__ of
# CharacterProperty writes into the generated table.
CharacterPropertyEnumMap = {
    1 << 0: "CharacterProperties::kXidStart",
    1 << 1: "CharacterProperties::kXidContinue",
}
14
class CharacterProperty:
    """An inclusive code point range tagged with a bitmask of property flags.

    Attributes are plain ints: ``first_char`` and ``last_char`` bound the
    range, ``prop_type`` is an OR of the keys of CharacterPropertyEnumMap.
    """

    def __init__(self, first_char, last_char, prop_type):
        self.prop_type = prop_type
        self.first_char = first_char
        self.last_char = last_char

    def key(self):
        """Return the value ranges are ordered by: the first code point."""
        return self.first_char

    def merge(self, other):
        """Grow this range to absorb ``other``.

        ``other`` must begin immediately after this range ends and carry
        the same flags; otherwise KeyError is raised and nothing changes.
        """
        is_adjacent = other.first_char == self.last_char + 1
        if not (is_adjacent and other.prop_type == self.prop_type):
            raise KeyError()
        self.last_char = other.last_char

    def __repr__(self):
        """Render as a C++ aggregate initializer: {first, last, flags}."""
        flag_names = [
            name
            for bit, name in CharacterPropertyEnumMap.items()
            if bit & self.prop_type
        ]
        # The braces are passed as arguments so str.format does not treat
        # them as replacement fields.
        return "{}0x{:04x}, 0x{:04x}, {}{}".format(
                "{", self.first_char, self.last_char, ' | '.join(flag_names), "}")
37
def extract_unicode_properties(f, props, chars_out):
    """Collect per-code-point property flags from UCD-format property lines.

    A data line looks like ``XXXX ; Prop`` or ``XXXX..YYYY ; Prop`` (an
    inclusive range), optionally followed by a ``#`` comment. Lines that do
    not have this shape, or whose property name is not a key of ``props``,
    are skipped.

    Args:
        f: iterable of text lines (e.g. an open file).
        props: dict mapping a property name to its integer bit flag.
        chars_out: dict mapping a code point (int) to its CharacterProperty.
            Updated in place: missing code points are added, and the flag is
            OR-ed into ``prop_type`` of each affected entry.

    Returns:
        The same ``chars_out`` dict that was passed in.
    """
    # Code points are 4 to 6 hex digits, so supplementary-plane entries such
    # as "10000..1000B" must match too. The ".." separator is escaped so it
    # only matches literal dots.
    line_re = re.compile(
        r"^(?P<first>[0-9A-Fa-f]{4,6})"
        r"(?:\.\.(?P<last>[0-9A-Fa-f]{4,6}))?"
        r"\s*;\s*(?P<prop>\w+)")
    for line in f:
        match = line_re.match(line)
        if not match:
            continue
        prop_name = match.group('prop')
        if prop_name not in props:
            continue
        prop_type = props[prop_name]
        first_char = int(match.group('first'), 16)
        last_str = match.group('last')
        # A line without "..YYYY" describes a single code point.
        last_char = int(last_str, 16) if last_str else first_char
        for char in range(first_char, last_char + 1):
            if char not in chars_out:
                chars_out[char] = CharacterProperty(char, char, 0)
            chars_out[char].prop_type |= prop_type
    return chars_out
55
def flatten_unicode_properties(chars):
    """Collapse per-code-point entries into a sorted list of ranges.

    Entries are visited in ascending ``key()`` order. Each one is merged
    into the range currently being built when ``merge`` accepts it;
    ``merge`` signals a non-mergeable entry by raising KeyError, in which
    case a new range is started.

    Args:
        chars: dict whose values provide ``key()`` and ``merge(other)``
            (CharacterProperty objects). The dict and its values are left
            unmodified, so calling this function again gives the same result.

    Returns:
        A list of range objects ordered by first code point. Empty when
        ``chars`` is empty.
    """
    import copy  # function-scope import keeps this block self-contained

    result = []
    for char_prop in sorted(chars.values(), key=lambda prop: prop.key()):
        if result:
            try:
                result[-1].merge(char_prop)
            except KeyError:
                pass  # not adjacent, or different flags: start a new range
            else:
                continue
        # Ranges grow by mutation, so each one starts from a copy rather
        # than from the caller's own object.
        result.append(copy.copy(char_prop))
    return result
67
# Apache 2.0 license header, written verbatim at the top of the generated
# C++ output by the __main__ block below.
license = """/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
"""
84
def main(argv):
    """Generate the C++ character property table and print it to stdout.

    Args:
        argv: command-line arguments; ``argv[1:]`` are paths to one or more
            DerivedCoreProperties-format files.

    Returns:
        Process exit status: 0 on success, 1 when no input path was given.
    """
    if len(argv) < 2:
        # stdout carries the generated C++ source, so diagnostics go to
        # stderr and cannot end up inside a redirected output file.
        print("must specify path to icu DerivedCoreProperties file (e.g: "
              "external/icu/icu4c/source/data/unidata/DerivedCoreProperties.txt)",
              file=sys.stderr)
        return 1

    # Flag values match the keys of CharacterPropertyEnumMap.
    props = {"XID_Start": 1, "XID_Continue": 2}
    char_props = {}
    for file_path in argv[1:]:
        # Read as UTF-8 explicitly rather than relying on the locale default.
        with open(file_path, encoding="utf-8") as f:
            extract_unicode_properties(f, props, char_props)
    result = flatten_unicode_properties(char_props)

    # "{{" is passed as a format argument, so it is emitted literally and
    # pairs with the closing "}};" printed after the rows.
    print("{}\nconst static std::array<CharacterProperties, {}> sCharacterProperties = {}"
          .format(license, len(result), "{{"))
    for prop in result:
        print("    {},".format(prop))
    print("}};")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
102
103