1f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien#!/usr/bin/env python 2f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 3f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# Copyright (C) 2015 The Android Open Source Project 4f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# 5f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# Licensed under the Apache License, Version 2.0 (the 'License'); 6f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# you may not use this file except in compliance with the License. 7f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# You may obtain a copy of the License at 8f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# 9f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# http://www.apache.org/licenses/LICENSE-2.0 10f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# 11f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# Unless required by applicable law or agreed to in writing, software 12f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# distributed under the License is distributed on an 'AS IS' BASIS, 13f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# See the License for the specific language governing permissions and 15f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# limitations under the License. 16f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 17f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien""" 18f0be43de02a1e07308d3d95408349c3c7f973430Raph LevienConvert hyphen files in standard TeX format (a trio of pat, chr, and hyp) 19f0be43de02a1e07308d3d95408349c3c7f973430Raph Levieninto binary format. See doc/hyb_file_format.md for more information. 20f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 21f0be43de02a1e07308d3d95408349c3c7f973430Raph LevienUsage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb 22f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 23f0be43de02a1e07308d3d95408349c3c7f973430Raph LevienOptional -v parameter turns on verbose debugging. 24f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 25f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien""" 26f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 27f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienfrom __future__ import print_function 28f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 29f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienimport io 30f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienimport sys 31f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienimport struct 32f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienimport math 33f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienimport getopt 34f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 35f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 36f0be43de02a1e07308d3d95408349c3c7f973430Raph LevienVERBOSE = False 37f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 38f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 39f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienif sys.version_info[0] >= 3: 40f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def unichr(x): 41f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return chr(x) 42f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 43f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 44f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# number of bits required to represent numbers up to n inclusive 45f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef num_bits(n): 46f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return 1 + int(math.log(n, 2)) if n > 0 else 0 47f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 48f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 49f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienclass Node: 50f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 51f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def __init__(self): 52f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.succ = {} 53f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.res = None 54f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.fsm_pat = None 55f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.fail = None 56f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 57f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 58f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# List of free slots, implemented as doubly linked list 59f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienclass Freelist: 60f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 61f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def __init__(self): 62f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.first = None 63f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.last = None 64f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.pred = [] 65f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.succ = [] 66f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 67f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def grow(self): 68f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien this = len(self.pred) 69f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.pred.append(self.last) 70f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.succ.append(None) 71f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if self.last is None: 72f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.first = this 73f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 74f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.succ[self.last] = this 75f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.last = this 76f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 77f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def next(self, cursor): 78f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if cursor == 0: 79f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien cursor = self.first 80f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if cursor is None: 81f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.grow() 82f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result = self.last 83f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 84f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result = cursor 85f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return result, self.succ[result] 86f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 87f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def is_free(self, ix): 88f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien while ix >= len(self.pred): 89f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.grow() 90f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return self.pred[ix] != -1 91f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 92f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def use(self, ix): 93f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if self.pred[ix] is None: 94f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.first = self.succ[ix] 95f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 96f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.succ[self.pred[ix]] = self.succ[ix] 97f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if self.succ[ix] is None: 98f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.last = self.pred[ix] 99f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 100f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.pred[self.succ[ix]] = self.pred[ix] 101f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if self.pred[ix] == -1: 102f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien assert self.pred[ix] != -1, 'double free!' 103f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.pred[ix] = -1 104f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 105f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 106f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef combine(a, b): 107f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if a is None: return b 108f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if b is None: return a 109f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if len(b) < len(a): a, b = b, a 110f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien res = b[:len(b) - len(a)] 111f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for i in range(len(a)): 112f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien res.append(max(a[i], b[i + len(b) - len(a)])) 113f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return res 114f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 115f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 116f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef trim(pattern): 117f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for ix in range(len(pattern)): 118f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if pattern[ix] != 0: 119f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return pattern[ix:] 120f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 121f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 122f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef pat_to_binary(pattern): 123f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return b''.join(struct.pack('B', x) for x in pattern) 124f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 125f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 126f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienclass Hyph: 127f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 128f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def __init__(self): 129f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.root = Node() 130f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.root.str = '<root>' 131f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.node_list = [self.root] 132f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 133f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # Add a pattern (word fragment with numeric codes, such as ".ad4der") 134f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def add_pat(self, pat): 135f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien lastWasLetter = False 136f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien haveSeenNumber = False 137f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result = [] 138f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien word = '' 139f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for c in pat: 140f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if c.isdigit(): 141f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result.append(int(c)) 142f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien lastWasLetter = False 143f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien haveSeenNumber = True 144f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 145f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien word += c 146f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if lastWasLetter and haveSeenNumber: 147f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result.append(0) 148f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien lastWasLetter = True 149f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if lastWasLetter: 150f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result.append(0) 151f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 152f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.add_word_res(word, result) 153f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 154f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # Add an exception (word with hyphens, such as "ta-ble") 155f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def add_exception(self, hyph_word): 156f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien res = [] 157f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien word = ['.'] 158f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien need_10 = False 159f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for c in hyph_word: 160f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if c == '-': 161f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien res.append(11) 162f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien need_10 = False 163f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 164f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if need_10: 165f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien res.append(10) 166f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien word.append(c) 167f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien need_10 = True 168f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien word.append('.') 169f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien res.append(0) 170f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien res.append(0) 171f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if VERBOSE: 172f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien print(word, res) 173f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.add_word_res(''.join(word), res) 174f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 175f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def add_word_res(self, word, result): 176f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if VERBOSE: 177f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien print(word, result) 178f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 179f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien t = self.root 180f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien s = '' 181f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for c in word: 182f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien s += c 183f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if c not in t.succ: 184f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien new_node = Node() 185f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien new_node.str = s 186f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.node_list.append(new_node) 187f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien t.succ[c] = new_node 188f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien t = t.succ[c] 189f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien t.res = result 190f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 191f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def pack(self, node_list, ch_map, use_node=False): 192f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien size = 0 193f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.node_map = {} 194f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien nodes = Freelist() 195f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien edges = Freelist() 196f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien edge_start = 1 if use_node else 0 197f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for node in node_list: 198f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()]) 199f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if len(succ): 200f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien cursor = 0 201f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien while True: 202f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien edge_ix, cursor = edges.next(cursor) 203f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ix = edge_ix - succ[0] 204f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if (ix >= 0 and nodes.is_free(ix) and 205f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien all(edges.is_free(ix + s) for s in succ) and 206f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ((not use_node) or edges.is_free(ix))): 207f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien break 208f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien elif use_node: 209f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ix, _ = edges.next(0) 210f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien nodes.is_free(ix) # actually don't need nodes at all when use_node, 211f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # but keep it happy 212f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 213f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ix, _ = nodes.next(0) 214f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien node.ix = ix 215f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.node_map[ix] = node 216f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien nodes.use(ix) 217f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien size = max(size, ix) 218f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if use_node: 219f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien edges.use(ix) 220f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for s in succ: 221f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien edges.use(ix + s) 222f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien size += max(ch_map.values()) + 1 223f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return size 224f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 225f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # return list of nodes in bfs order 226f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def bfs(self, ch_map): 227f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result = [self.root] 228f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ix = 0 229f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien while ix < len(result): 230f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien node = result[ix] 231f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien node.bfs_ix = ix 232f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien mapped = {} 233f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for c, next in node.succ.items(): 234f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c)) 235f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien mapped[ch_map[c]] = next 236f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for i in sorted(mapped.keys()): 237f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result.append(mapped[i]) 238f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ix += 1 239f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien self.bfs_order = result 240f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return result 241f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 242f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # suffix compression - convert the trie into an acyclic digraph, merging nodes when 243f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # the subtries are identical 244f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien def dedup(self): 245f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uniques = [] 246f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien dupmap = {} 247f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien dedup_ix = [0] * len(self.bfs_order) 248f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for ix in reversed(range(len(self.bfs_order))): 249f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # construct string representation of node 250f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien node = self.bfs_order[ix] 251f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if node.res is None: 252f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien s = '' 253f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 254f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien s = ''.join(str(c) for c in node.res) 255f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for c in sorted(node.succ.keys()): 256f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien succ = node.succ[c] 257f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien s += ' ' + c + str(dedup_ix[succ.bfs_ix]) 258f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if s in dupmap: 259f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien dedup_ix[ix] = dupmap[s] 260f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 261f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uniques.append(node) 262f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien dedup_ix[ix] = ix 263f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien dupmap[s] = dedup_ix[ix] 264f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uniques.reverse() 265f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total') 266f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return dedup_ix, uniques 267f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 268f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 269f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# load the ".pat" file, which contains patterns such as a1b2c3 270f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef load(fn): 271f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien hyph = Hyph() 272f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien with io.open(fn, encoding='UTF-8') as f: 273f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for l in f: 274f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pat = l.strip() 275f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien hyph.add_pat(pat) 276f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return hyph 277f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 278f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 279f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# load the ".chr" file, which contains the alphabet and case pairs, eg "aA", "bB" etc. 280f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef load_chr(fn): 281f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ch_map = {'.': 0} 282f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien with io.open(fn, encoding='UTF-8') as f: 283f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for i, l in enumerate(f): 284f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien l = l.strip() 285f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if len(l) > 2: 286f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # lowercase maps to multi-character uppercase sequence, ignore uppercase for now 287f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien l = l[:1] 288f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 289f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien assert len(l) == 2, 'expected 2 chars in chr' 290f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for c in l: 291f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ch_map[c] = i + 1 292f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return ch_map 293f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 294f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 295f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# load exceptions with explicit hyphens 296f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef load_hyp(hyph, fn): 297f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien with io.open(fn, encoding='UTF-8') as f: 298f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for l in f: 299f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien hyph.add_exception(l.strip()) 300f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 301f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 302f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef generate_header(alphabet, trie, pattern): 303f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien alphabet_off = 6 * 4 304f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien trie_off = alphabet_off + len(alphabet) 305f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pattern_off = trie_off + len(trie) 306f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien file_size = pattern_off + len(pattern) 307f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien data = [0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size] 308f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return struct.pack('<6I', *data) 309f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 310f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 311f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef generate_alphabet(ch_map): 312f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ch_map = ch_map.copy() 313f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien del ch_map['.'] 314f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien min_ch = ord(min(ch_map)) 315f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien max_ch = ord(max(ch_map)) 316f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if max_ch - min_ch < 1024 and max(ch_map.values()) < 256: 317f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # generate format 0 318f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien data = [0] * (max_ch - min_ch + 1) 319f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for c, val in ch_map.items(): 320f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien data[ord(c) - min_ch] = val 321f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result = [struct.pack('<3I', 0, min_ch, max_ch + 1)] 322f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for b in data: 323f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result.append(struct.pack('<B', b)) 324f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 325f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # generate format 1 326f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien assert max(ch_map.values()) < 2048, 'max number of unique characters exceeded' 327f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result = [struct.pack('<2I', 1, len(ch_map))] 328f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for c, val in sorted(ch_map.items()): 329f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien data = (ord(c) << 11) | val 330f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result.append(struct.pack('<I', data)) 331f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien binary = b''.join(result) 332f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if len(binary) % 4 != 0: 333f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien binary += b'\x00' * (4 - len(binary) % 4) 334f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return binary 335f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 336f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 337f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# assumes hyph structure has been packed, ie node.ix values have been set 338f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap): 339f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ch_array = [0] * n_trie 340f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien link_array = [0] * n_trie 341f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pat_array = [0] * n_trie 342f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien link_shift = num_bits(max(ch_map.values())) 343f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien char_mask = (1 << link_shift) - 1 344f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pattern_shift = link_shift + num_bits(n_trie - 1) 345f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien link_mask = (1 << pattern_shift) - (1 << link_shift) 346f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result = [struct.pack('<6I', 0, char_mask, link_shift, link_mask, pattern_shift, n_trie)] 347f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 348f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for node in dedup_nodes: 349f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ix = node.ix 350f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if node.res is not None: 351f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pat_array[ix] = patmap[pat_to_binary(node.res)] 352f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for c, next in node.succ.items(): 353f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien c_num = ch_map[c] 354f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien link_ix = ix + c_num 355f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ch_array[link_ix] = c_num 356f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if dedup_ix is None: 357f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien dedup_next = next 358f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 359f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien dedup_next = hyph.bfs_order[dedup_ix[next.bfs_ix]] 360f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien link_array[link_ix] = dedup_next.ix 361f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 362f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for i in range(n_trie): 363f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien #print((pat_array[i], link_array[i], ch_array[i])) 364f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien packed = (pat_array[i] << pattern_shift) | (link_array[i] << link_shift) | ch_array[i] 365f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result.append(struct.pack('<I', packed)) 366f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return b''.join(result) 367f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 368f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 369f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef generate_pattern(pats): 370f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pat_array = [0] 371f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien patmap = {b'': 0} 372f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 373f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien raw_pat_array = [] 374f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien raw_pat_size = 0 375f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien raw_patmap = {} 376f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 377f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for pat in pats: 378f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if pat is None: 379f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien continue 380f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pat_str = pat_to_binary(pat) 381f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if pat_str not in patmap: 382f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien shift = 0 383f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien while shift < len(pat) and pat[len(pat) - shift - 1] == 0: 384f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien shift += 1 385f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien rawpat = pat_str[:len(pat) - shift] 386f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if rawpat not in raw_patmap: 387f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien raw_patmap[rawpat] = raw_pat_size 388f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien raw_pat_array.append(rawpat) 389f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien raw_pat_size += len(rawpat) 390f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien data = (len(rawpat) << 26) | (shift << 20) | raw_patmap[rawpat] 391f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien patmap[pat_str] = len(pat_array) 392f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pat_array.append(data) 393f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien data = [0, len(pat_array), 16 + 4 * len(pat_array), raw_pat_size] 394f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result = [struct.pack('<4I', *data)] 395f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for x in pat_array: 396f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result.append(struct.pack('<I', x)) 397f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result.extend(raw_pat_array) 398f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return patmap, b''.join(result) 399f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 400f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 401f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef generate_hyb_file(hyph, ch_map, hyb_fn): 402f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien bfs = hyph.bfs(ch_map) 403f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien dedup_ix, dedup_nodes = hyph.dedup() 404f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien n_trie = hyph.pack(dedup_nodes, ch_map) 405f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien alphabet = generate_alphabet(ch_map) 406f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien patmap, pattern = generate_pattern([n.res for n in hyph.node_list]) 407f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien trie = generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap) 408f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien header = generate_header(alphabet, trie, pattern) 409f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 410f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien with open(hyb_fn, 'wb') as f: 411f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien f.write(header) 412f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien f.write(alphabet) 413f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien f.write(trie) 414f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien f.write(pattern) 415f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 416f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 417f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# Verify that the file contains the same lines as the lines argument, in arbitrary order 418f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef verify_file_sorted(lines, fn): 419138b93f094584212dd6978a1822d078f93574022Raph Levien file_lines = [l.strip() for l in io.open(fn, encoding='UTF-8')] 420f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien line_set = set(lines) 421f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien file_set = set(file_lines) 422f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if line_set == file_set: 423f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return True 424f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for line in line_set - file_set: 425f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien print(repr(line) + ' in reconstruction, not in file') 426f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for line in file_set - line_set: 427f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien print(repr(line) + ' in file, not in reconstruction') 428f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return False 429f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 430f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 431f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef map_to_chr(alphabet_map): 432f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result = [] 433f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ch_map = {} 434f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for val in alphabet_map.values(): 435f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien chs = [ch for ch in alphabet_map if alphabet_map[ch] == val] 436f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # non-cased characters (like Ethopic) are in both, matching chr file 437f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien lowercase = [ch for ch in chs if not ch.isupper()] 438f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uppercase = [ch for ch in chs if not ch.islower()] 439f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # print(val, `lowercase`, `uppercase`) 440f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien assert len(lowercase) == 1, 'expected 1 lowercase character' 441f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien assert 0 <= len(uppercase) <= 1, 'expected 0 or 1 uppercase character' 442f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ch_map[val] = lowercase[0] 443f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result.append(''.join(lowercase + uppercase)) 444f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ch_map[0] = '.' 445f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return (ch_map, result) 446f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 447f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 448f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef get_pattern(pattern_data, ix): 449f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pattern_offset = struct.unpack('<I', pattern_data[8:12])[0] 450f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien entry = struct.unpack('<I', pattern_data[16 + ix * 4: 16 + ix * 4 + 4])[0] 451f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pat_len = entry >> 26 452f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pat_shift = (entry >> 20) & 0x1f 453f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien offset = pattern_offset + (entry & 0xfffff) 454f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return pattern_data[offset: offset + pat_len] + b'\0' * pat_shift 455f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 456f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 457f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions): 458f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien (char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20]) 459f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien node_entry = struct.unpack('<I', trie_data[24 + ix * 4: 24 + ix * 4 + 4])[0] 460f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pattern = node_entry >> pattern_shift 461f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if pattern: 462f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result = [] 463f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien is_exception = False 464f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pat = get_pattern(pattern_data, pattern) 465f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for i in range(len(s) + 1): 466f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pat_off = i - 1 + len(pat) - len(s) 467f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if pat_off < 0: 468f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien code = 0 469f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 470f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien code = struct.unpack('B', pat[pat_off : pat_off + 1])[0] 471f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if 1 <= code <= 9: 472f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result.append('%d' % code) 473f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien elif code == 10: 474f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien is_exception = True 475f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien elif code == 11: 476f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result.append('-') 477f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien is_exception = True 478f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 479f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien assert code == 0, 'unexpected code' 480f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if i < len(s): 481f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result.append(s[i]) 482f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pat_str = ''.join(result) 483f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien #print(`pat_str`, `pat`) 484f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if is_exception: 485f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien assert pat_str[0] == '.', "expected leading '.'" 486f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien assert pat_str[-1] == '.', "expected trailing '.'" 487f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien exceptions.append(pat_str[1:-1]) # strip leading and trailing '.' 488f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 489f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien patterns.append(pat_str) 490f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for ch in ch_map: 491f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien edge_entry = struct.unpack('<I', trie_data[24 + (ix + ch) * 4: 24 + (ix + ch) * 4 + 4])[0] 492f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien link = (edge_entry & link_mask) >> link_shift 493f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if link != 0 and ch == (edge_entry & char_mask): 494f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien sch = s + ch_map[ch] 495f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions) 496f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 497f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 498f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# Verify the generated binary file by reconstructing the textual representations 499f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# from the binary hyb file, then checking that they're identical (mod the order of 500f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# lines within the file, which is irrelevant). This function makes assumptions that 501f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# are stronger than absolutely necessary (in particular, that the patterns are in 502f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien# lowercase as defined by python islower). 503f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn): 504f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien with open(hyb_fn, 'rb') as f: 505f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien hyb_data = f.read() 506f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien header = hyb_data[0: 6 * 4] 507f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien (magic, version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header) 508f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien alphabet_data = hyb_data[alphabet_off:trie_off] 509f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien trie_data = hyb_data[trie_off:pattern_off] 510f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pattern_data = hyb_data[pattern_off:file_size] 511f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 512f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # reconstruct alphabet table 513f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien alphabet_version = struct.unpack('<I', alphabet_data[:4])[0] 514f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien alphabet_map = {} 515f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if alphabet_version == 0: 516f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien (min_ch, max_ch) = struct.unpack('<2I', alphabet_data[4:12]) 517f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for ch in range(min_ch, max_ch): 518f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien offset = 12 + ch - min_ch 519f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien b = struct.unpack('B', alphabet_data[offset : offset + 1])[0] 520f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if b != 0: 521f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien alphabet_map[unichr(ch)] = b 522f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien else: 523f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien assert alphabet_version == 1 524f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien n_entries = struct.unpack('<I', alphabet_data[4:8])[0] 525f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for i in range(n_entries): 526f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien entry = struct.unpack('<I', alphabet_data[8 + 4 * i: 8 + 4 * i + 4])[0] 527f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien alphabet_map[unichr(entry >> 11)] = entry & 0x7ff 528f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 529f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ch_map, reconstructed_chr = map_to_chr(alphabet_map) 530f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 531f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587 532f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if u'\u0587' in reconstructed_chr: 533f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien reconstructed_chr.remove(u'\u0587') 534f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien reconstructed_chr.append(u'\u0587\u0535\u0552') 535f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 536f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien assert verify_file_sorted(reconstructed_chr, chr_fn), 'alphabet table not verified' 537f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 538f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien # reconstruct trie 539f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien patterns = [] 540f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien exceptions = [] 541f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions) 542f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien assert verify_file_sorted(patterns, pat_fn), 'pattern table not verified' 543f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien assert verify_file_sorted(exceptions, hyp_fn), 'exception table not verified' 544f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 545f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 546f0be43de02a1e07308d3d95408349c3c7f973430Raph Leviendef main(): 547f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien global VERBOSE 548f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien try: 549f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien opts, args = getopt.getopt(sys.argv[1:], 'v') 550f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien except getopt.GetoptError as err: 551f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien print(str(err)) 552f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien sys.exit(1) 553f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for o, _ in opts: 554f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if o == '-v': 555f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien VERBOSE = True 556f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien pat_fn, out_fn = args 557f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien hyph = load(pat_fn) 558f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if pat_fn.endswith('.pat.txt'): 559f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien chr_fn = pat_fn[:-8] + '.chr.txt' 560f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien ch_map = load_chr(chr_fn) 561f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien hyp_fn = pat_fn[:-8] + '.hyp.txt' 562f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien load_hyp(hyph, hyp_fn) 563f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien generate_hyb_file(hyph, ch_map, out_fn) 564f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn) 565f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 566f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienif __name__ == '__main__': 567f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien main() 568