1f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien#!/usr/bin/env python
2f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
3f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# Copyright (C) 2015 The Android Open Source Project
4f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien#
5f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# Licensed under the Apache License, Version 2.0 (the 'License');
6f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# you may not use this file except in compliance with the License.
7f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# You may obtain a copy of the License at
8f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien#
9f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien#      http://www.apache.org/licenses/LICENSE-2.0
10f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien#
11f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# Unless required by applicable law or agreed to in writing, software
12f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# distributed under the License is distributed on an 'AS IS' BASIS,
13f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# See the License for the specific language governing permissions and
15f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# limitations under the License.
16f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
17f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien"""
18f0be43de02a1e07308d3d95408349c3c7f973430Raph LevienConvert hyphen files in standard TeX format (a trio of pat, chr, and hyp)
19f0be43de02a1e07308d3d95408349c3c7f973430Raph Levieninto binary format. See doc/hyb_file_format.md for more information.
20f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
21f0be43de02a1e07308d3d95408349c3c7f973430Raph LevienUsage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb
22f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
23f0be43de02a1e07308d3d95408349c3c7f973430Raph LevienOptional -v parameter turns on verbose debugging.
24f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
25f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien"""
26f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
27f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienfrom __future__ import print_function
28f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
29f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienimport io
30f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienimport sys
31f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienimport struct
32f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienimport math
33f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienimport getopt
34f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
35f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
36f0be43de02a1e07308d3d95408349c3c7f973430Raph LevienVERBOSE = False
37f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
38f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
39f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienif sys.version_info[0] >= 3:
40f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def unichr(x):
41f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        return chr(x)
42f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
43f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
44f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# number of bits required to represent numbers up to n inclusive
45f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef num_bits(n):
46f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    return 1 + int(math.log(n, 2)) if n > 0 else 0
47f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
48f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
49f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienclass Node:
50f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
51f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def __init__(self):
52f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.succ = {}
53f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.res = None
54f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.fsm_pat = None
55f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.fail = None
56f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
57f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
58f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# List of free slots, implemented as doubly linked list
59f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienclass Freelist:
60f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
61f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def __init__(self):
62f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.first = None
63f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.last = None
64f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.pred = []
65f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.succ = []
66f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
67f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def grow(self):
68f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        this = len(self.pred)
69f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.pred.append(self.last)
70f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.succ.append(None)
71f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if self.last is None:
72f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            self.first = this
73f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        else:
74f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            self.succ[self.last] = this
75f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.last = this
76f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
77f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def next(self, cursor):
78f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if cursor == 0:
79f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            cursor = self.first
80f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if cursor is None:
81f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            self.grow()
82f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            result = self.last
83f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        else:
84f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            result = cursor
85f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        return result, self.succ[result]
86f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
87f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def is_free(self, ix):
88f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        while ix >= len(self.pred):
89f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            self.grow()
90f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        return self.pred[ix] != -1
91f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
92f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def use(self, ix):
93f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if self.pred[ix] is None:
94f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            self.first = self.succ[ix]
95f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        else:
96f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            self.succ[self.pred[ix]] = self.succ[ix]
97f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if self.succ[ix] is None:
98f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            self.last = self.pred[ix]
99f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        else:
100f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            self.pred[self.succ[ix]] = self.pred[ix]
101f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if self.pred[ix] == -1:
102f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            assert self.pred[ix] != -1, 'double free!'
103f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.pred[ix] = -1
104f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
105f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
106f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef combine(a, b):
107f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    if a is None: return b
108f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    if b is None: return a
109f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    if len(b) < len(a): a, b = b, a
110f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    res = b[:len(b) - len(a)]
111f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    for i in range(len(a)):
112f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        res.append(max(a[i], b[i + len(b) - len(a)]))
113f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    return res
114f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
115f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
116f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef trim(pattern):
117f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    for ix in range(len(pattern)):
118f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if pattern[ix] != 0:
119f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            return pattern[ix:]
120f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
121f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
122f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef pat_to_binary(pattern):
123f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    return b''.join(struct.pack('B', x) for x in pattern)
124f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
125f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
126f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienclass Hyph:
127f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
128f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def __init__(self):
129f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.root = Node()
130f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.root.str = '<root>'
131f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.node_list = [self.root]
132f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
133f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    # Add a pattern (word fragment with numeric codes, such as ".ad4der")
134f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def add_pat(self, pat):
135f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        lastWasLetter = False
136f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        haveSeenNumber = False
137f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        result = []
138f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        word = ''
139f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for c in pat:
140f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if c.isdigit():
141f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                result.append(int(c))
142f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                lastWasLetter = False
143f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                haveSeenNumber = True
144f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            else:
145f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                word += c
146f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                if lastWasLetter and haveSeenNumber:
147f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                    result.append(0)
148f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                lastWasLetter = True
149f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if lastWasLetter:
150f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            result.append(0)
151f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
152f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.add_word_res(word, result)
153f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
154f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    # Add an exception (word with hyphens, such as "ta-ble")
155f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def add_exception(self, hyph_word):
156f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        res = []
157f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        word = ['.']
158f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        need_10 = False
159f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for c in hyph_word:
160f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if c == '-':
161f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                res.append(11)
162f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                need_10 = False
163f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            else:
164f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                if need_10:
165f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                    res.append(10)
166f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                word.append(c)
167f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                need_10 = True
168f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        word.append('.')
169f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        res.append(0)
170f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        res.append(0)
171f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if VERBOSE:
172f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            print(word, res)
173f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.add_word_res(''.join(word), res)
174f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
175f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def add_word_res(self, word, result):
176f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if VERBOSE:
177f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            print(word, result)
178f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
179f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        t = self.root
180f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        s = ''
181f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for c in word:
182f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            s += c
183f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if c not in t.succ:
184f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                new_node = Node()
185f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                new_node.str = s
186f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                self.node_list.append(new_node)
187f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                t.succ[c] = new_node
188f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            t = t.succ[c]
189f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        t.res = result
190f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
191f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def pack(self, node_list, ch_map, use_node=False):
192f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        size = 0
193f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.node_map = {}
194f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        nodes = Freelist()
195f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        edges = Freelist()
196f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        edge_start = 1 if use_node else 0
197f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for node in node_list:
198f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()])
199f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if len(succ):
200f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                cursor = 0
201f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                while True:
202f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                    edge_ix, cursor = edges.next(cursor)
203f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                    ix = edge_ix - succ[0]
204f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                    if (ix >= 0 and nodes.is_free(ix) and
205f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                            all(edges.is_free(ix + s) for s in succ) and
206f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                            ((not use_node) or edges.is_free(ix))):
207f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                        break
208f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            elif use_node:
209f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                ix, _ = edges.next(0)
210f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                nodes.is_free(ix)  # actually don't need nodes at all when use_node,
211f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                # but keep it happy
212f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            else:
213f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                ix, _ = nodes.next(0)
214f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            node.ix = ix
215f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            self.node_map[ix] = node
216f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            nodes.use(ix)
217f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            size = max(size, ix)
218f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if use_node:
219f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                edges.use(ix)
220f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            for s in succ:
221f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                edges.use(ix + s)
222f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        size += max(ch_map.values()) + 1
223f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        return size
224f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
225f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    # return list of nodes in bfs order
226f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def bfs(self, ch_map):
227f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        result = [self.root]
228f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        ix = 0
229f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        while ix < len(result):
230f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            node = result[ix]
231f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            node.bfs_ix = ix
232f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            mapped = {}
233f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            for c, next in node.succ.items():
234f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c))
235f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                mapped[ch_map[c]] = next
236f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            for i in sorted(mapped.keys()):
237f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                result.append(mapped[i])
238f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            ix += 1
239f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        self.bfs_order = result
240f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        return result
241f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
242f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    # suffix compression - convert the trie into an acyclic digraph, merging nodes when
243f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    # the subtries are identical
244f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    def dedup(self):
245f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        uniques = []
246f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        dupmap = {}
247f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        dedup_ix = [0] * len(self.bfs_order)
248f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for ix in reversed(range(len(self.bfs_order))):
249f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            # construct string representation of node
250f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            node = self.bfs_order[ix]
251f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if node.res is None:
252f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                s = ''
253f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            else:
254f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                s = ''.join(str(c) for c in node.res)
255f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            for c in sorted(node.succ.keys()):
256f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                succ = node.succ[c]
257f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                s += ' ' + c + str(dedup_ix[succ.bfs_ix])
258f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if s in dupmap:
259f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                dedup_ix[ix] = dupmap[s]
260f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            else:
261f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                uniques.append(node)
262f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                dedup_ix[ix] = ix
263f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            dupmap[s] = dedup_ix[ix]
264f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        uniques.reverse()
265f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total')
266f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        return dedup_ix, uniques
267f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
268f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
269f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# load the ".pat" file, which contains patterns such as a1b2c3
270f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef load(fn):
271f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    hyph = Hyph()
272f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    with io.open(fn, encoding='UTF-8') as f:
273f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for l in f:
274f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            pat = l.strip()
275f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            hyph.add_pat(pat)
276f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    return hyph
277f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
278f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
279f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# load the ".chr" file, which contains the alphabet and case pairs, eg "aA", "bB" etc.
280f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef load_chr(fn):
281f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    ch_map = {'.': 0}
282f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    with io.open(fn, encoding='UTF-8') as f:
283f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for i, l in enumerate(f):
284f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            l = l.strip()
285f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if len(l) > 2:
286f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                # lowercase maps to multi-character uppercase sequence, ignore uppercase for now
287f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                l = l[:1]
288f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            else:
289f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                assert len(l) == 2, 'expected 2 chars in chr'
290f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            for c in l:
291f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                ch_map[c] = i + 1
292f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    return ch_map
293f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
294f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
295f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# load exceptions with explicit hyphens
296f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef load_hyp(hyph, fn):
297f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    with io.open(fn, encoding='UTF-8') as f:
298f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for l in f:
299f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            hyph.add_exception(l.strip())
300f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
301f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
302f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef generate_header(alphabet, trie, pattern):
303f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    alphabet_off = 6 * 4
304f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    trie_off = alphabet_off + len(alphabet)
305f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    pattern_off = trie_off + len(trie)
306f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    file_size = pattern_off + len(pattern)
307f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    data = [0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size]
308f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    return struct.pack('<6I', *data)
309f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
310f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
311f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef generate_alphabet(ch_map):
312f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    ch_map = ch_map.copy()
313f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    del ch_map['.']
314f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    min_ch = ord(min(ch_map))
315f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    max_ch = ord(max(ch_map))
316f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    if max_ch - min_ch < 1024 and max(ch_map.values()) < 256:
317f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        # generate format 0
318f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        data = [0] * (max_ch - min_ch + 1)
319f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for c, val in ch_map.items():
320f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            data[ord(c) - min_ch] = val
321f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        result = [struct.pack('<3I', 0, min_ch, max_ch + 1)]
322f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for b in data:
323f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            result.append(struct.pack('<B', b))
324f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    else:
325f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        # generate format 1
326f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        assert max(ch_map.values()) < 2048, 'max number of unique characters exceeded'
327f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        result = [struct.pack('<2I', 1, len(ch_map))]
328f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for c, val in sorted(ch_map.items()):
329f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            data = (ord(c) << 11) | val
330f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            result.append(struct.pack('<I', data))
331f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    binary = b''.join(result)
332f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    if len(binary) % 4 != 0:
333f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        binary += b'\x00' * (4 - len(binary) % 4)
334f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    return binary
335f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
336f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
337f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# assumes hyph structure has been packed, ie node.ix values have been set
338f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap):
339f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    ch_array = [0] * n_trie
340f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    link_array = [0] * n_trie
341f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    pat_array = [0] * n_trie
342f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    link_shift = num_bits(max(ch_map.values()))
343f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    char_mask = (1 << link_shift) - 1
344f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    pattern_shift = link_shift + num_bits(n_trie - 1)
345f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    link_mask = (1 << pattern_shift) - (1 << link_shift)
346f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    result = [struct.pack('<6I', 0, char_mask, link_shift, link_mask, pattern_shift, n_trie)]
347f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
348f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    for node in dedup_nodes:
349f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        ix = node.ix
350f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if node.res is not None:
351f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            pat_array[ix] = patmap[pat_to_binary(node.res)]
352f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for c, next in node.succ.items():
353f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            c_num = ch_map[c]
354f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            link_ix = ix + c_num
355f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            ch_array[link_ix] = c_num
356f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if dedup_ix is None:
357f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                dedup_next = next
358f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            else:
359f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                dedup_next = hyph.bfs_order[dedup_ix[next.bfs_ix]]
360f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            link_array[link_ix] = dedup_next.ix
361f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
362f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    for i in range(n_trie):
363f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        #print((pat_array[i], link_array[i], ch_array[i]))
364f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        packed = (pat_array[i] << pattern_shift) | (link_array[i] << link_shift) | ch_array[i]
365f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        result.append(struct.pack('<I', packed))
366f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    return b''.join(result)
367f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
368f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
369f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef generate_pattern(pats):
370f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    pat_array = [0]
371f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    patmap = {b'': 0}
372f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
373f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    raw_pat_array = []
374f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    raw_pat_size = 0
375f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    raw_patmap = {}
376f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
377f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    for pat in pats:
378f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if pat is None:
379f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            continue
380f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        pat_str = pat_to_binary(pat)
381f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if pat_str not in patmap:
382f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            shift = 0
383f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            while shift < len(pat) and pat[len(pat) - shift - 1] == 0:
384f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                shift += 1
385f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            rawpat = pat_str[:len(pat) - shift]
386f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if rawpat not in raw_patmap:
387f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                raw_patmap[rawpat] = raw_pat_size
388f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                raw_pat_array.append(rawpat)
389f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                raw_pat_size += len(rawpat)
390f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            data = (len(rawpat) << 26) | (shift << 20) | raw_patmap[rawpat]
391f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            patmap[pat_str] = len(pat_array)
392f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            pat_array.append(data)
393f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    data = [0, len(pat_array), 16 + 4 * len(pat_array), raw_pat_size]
394f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    result = [struct.pack('<4I', *data)]
395f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    for x in pat_array:
396f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        result.append(struct.pack('<I', x))
397f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    result.extend(raw_pat_array)
398f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    return patmap, b''.join(result)
399f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
400f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
401f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef generate_hyb_file(hyph, ch_map, hyb_fn):
402f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    bfs = hyph.bfs(ch_map)
403f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    dedup_ix, dedup_nodes = hyph.dedup()
404f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    n_trie = hyph.pack(dedup_nodes, ch_map)
405f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    alphabet = generate_alphabet(ch_map)
406f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    patmap, pattern = generate_pattern([n.res for n in hyph.node_list])
407f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    trie = generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap)
408f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    header = generate_header(alphabet, trie, pattern)
409f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
410f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    with open(hyb_fn, 'wb') as f:
411f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        f.write(header)
412f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        f.write(alphabet)
413f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        f.write(trie)
414f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        f.write(pattern)
415f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
416f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
417f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# Verify that the file contains the same lines as the lines argument, in arbitrary order
418f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef verify_file_sorted(lines, fn):
419138b93f094584212dd6978a1822d078f93574022Raph Levien    file_lines = [l.strip() for l in io.open(fn, encoding='UTF-8')]
420f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    line_set = set(lines)
421f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    file_set = set(file_lines)
422f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    if line_set == file_set:
423f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        return True
424f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    for line in line_set - file_set:
425f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        print(repr(line) + ' in reconstruction, not in file')
426f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    for line in file_set - line_set:
427f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        print(repr(line) + ' in file, not in reconstruction')
428f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    return False
429f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
430f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
431f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef map_to_chr(alphabet_map):
432f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    result = []
433f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    ch_map = {}
434f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    for val in alphabet_map.values():
435f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        chs = [ch for ch in alphabet_map if alphabet_map[ch] == val]
436f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        # non-cased characters (like Ethopic) are in both, matching chr file
437f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        lowercase = [ch for ch in chs if not ch.isupper()]
438f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        uppercase = [ch for ch in chs if not ch.islower()]
439f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        # print(val, `lowercase`, `uppercase`)
440f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        assert len(lowercase) == 1, 'expected 1 lowercase character'
441f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        assert 0 <= len(uppercase) <= 1, 'expected 0 or 1 uppercase character'
442f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        ch_map[val] = lowercase[0]
443f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        result.append(''.join(lowercase + uppercase))
444f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    ch_map[0] = '.'
445f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    return (ch_map, result)
446f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
447f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
448f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef get_pattern(pattern_data, ix):
449f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    pattern_offset = struct.unpack('<I', pattern_data[8:12])[0]
450f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    entry = struct.unpack('<I', pattern_data[16 + ix * 4: 16 + ix * 4 + 4])[0]
451f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    pat_len = entry >> 26
452f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    pat_shift = (entry >> 20) & 0x1f
453f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    offset = pattern_offset + (entry & 0xfffff)
454f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    return pattern_data[offset: offset + pat_len] + b'\0' * pat_shift
455f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
456f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
457f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions):
458f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    (char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20])
459f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    node_entry = struct.unpack('<I', trie_data[24 + ix * 4: 24 + ix * 4 + 4])[0]
460f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    pattern = node_entry >> pattern_shift
461f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    if pattern:
462f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        result = []
463f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        is_exception = False
464f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        pat = get_pattern(pattern_data, pattern)
465f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for i in range(len(s) + 1):
466f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            pat_off = i - 1 + len(pat) - len(s)
467f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if pat_off < 0:
468f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                code = 0
469f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            else:
470f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                code = struct.unpack('B', pat[pat_off : pat_off + 1])[0]
471f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if 1 <= code <= 9:
472f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                result.append('%d' % code)
473f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            elif code == 10:
474f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                is_exception = True
475f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            elif code == 11:
476f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                result.append('-')
477f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                is_exception = True
478f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            else:
479f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                assert code == 0, 'unexpected code'
480f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if i < len(s):
481f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                result.append(s[i])
482f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        pat_str = ''.join(result)
483f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        #print(`pat_str`, `pat`)
484f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if is_exception:
485f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            assert pat_str[0] == '.', "expected leading '.'"
486f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            assert pat_str[-1] == '.', "expected trailing '.'"
487f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            exceptions.append(pat_str[1:-1])  # strip leading and trailing '.'
488f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        else:
489f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            patterns.append(pat_str)
490f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    for ch in ch_map:
491f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        edge_entry = struct.unpack('<I', trie_data[24 + (ix + ch) * 4: 24 + (ix + ch) * 4 + 4])[0]
492f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        link = (edge_entry & link_mask) >> link_shift
493f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if link != 0 and ch == (edge_entry & char_mask):
494f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            sch = s + ch_map[ch]
495f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions)
496f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
497f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
498f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# Verify the generated binary file by reconstructing the textual representations
499f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# from the binary hyb file, then checking that they're identical (mod the order of
500f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# lines within the file, which is irrelevant). This function makes assumptions that
501f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# are stronger than absolutely necessary (in particular, that the patterns are in
502f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# lowercase as defined by python islower).
503f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn):
504f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    with open(hyb_fn, 'rb') as f:
505f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        hyb_data = f.read()
506f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    header = hyb_data[0: 6 * 4]
507f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    (magic, version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header)
508f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    alphabet_data = hyb_data[alphabet_off:trie_off]
509f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    trie_data = hyb_data[trie_off:pattern_off]
510f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    pattern_data = hyb_data[pattern_off:file_size]
511f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
512f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    # reconstruct alphabet table
513f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    alphabet_version = struct.unpack('<I', alphabet_data[:4])[0]
514f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    alphabet_map = {}
515f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    if alphabet_version == 0:
516f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        (min_ch, max_ch) = struct.unpack('<2I', alphabet_data[4:12])
517f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for ch in range(min_ch, max_ch):
518f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            offset = 12 + ch - min_ch
519f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            b = struct.unpack('B', alphabet_data[offset : offset + 1])[0]
520f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if b != 0:
521f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                alphabet_map[unichr(ch)] = b
522f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    else:
523f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        assert alphabet_version == 1
524f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        n_entries = struct.unpack('<I', alphabet_data[4:8])[0]
525f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for i in range(n_entries):
526f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            entry = struct.unpack('<I', alphabet_data[8 + 4 * i: 8 + 4 * i + 4])[0]
527f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            alphabet_map[unichr(entry >> 11)] = entry & 0x7ff
528f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
529f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    ch_map, reconstructed_chr = map_to_chr(alphabet_map)
530f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
531f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    # EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587
532f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    if u'\u0587' in reconstructed_chr:
533f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        reconstructed_chr.remove(u'\u0587')
534f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        reconstructed_chr.append(u'\u0587\u0535\u0552')
535f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
536f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    assert verify_file_sorted(reconstructed_chr, chr_fn), 'alphabet table not verified'
537f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
538f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    # reconstruct trie
539f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    patterns = []
540f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    exceptions = []
541f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions)
542f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    assert verify_file_sorted(patterns, pat_fn), 'pattern table not verified'
543f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    assert verify_file_sorted(exceptions, hyp_fn), 'exception table not verified'
544f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
545f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
546f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef main():
547f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    global VERBOSE
548f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    try:
549f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        opts, args = getopt.getopt(sys.argv[1:], 'v')
550f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    except getopt.GetoptError as err:
551f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        print(str(err))
552f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        sys.exit(1)
553f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    for o, _ in opts:
554f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        if o == '-v':
555f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            VERBOSE = True
556f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    pat_fn, out_fn = args
557f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    hyph = load(pat_fn)
558f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    if pat_fn.endswith('.pat.txt'):
559f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        chr_fn = pat_fn[:-8] + '.chr.txt'
560f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        ch_map = load_chr(chr_fn)
561f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        hyp_fn = pat_fn[:-8] + '.hyp.txt'
562f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        load_hyp(hyph, hyp_fn)
563f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        generate_hyb_file(hyph, ch_map, out_fn)
564f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn)
565f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
566f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienif __name__ == '__main__':
567f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    main()
568