1#!/usr/bin/env python
2# Copyright 2014 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""
7A Deterministic acyclic finite state automaton (DAFSA) is a compact
8representation of an unordered word list (dictionary).
9
10http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
11
12This python program converts a list of strings to a byte array in C++.
13This python program fetches strings and return values from a gperf file
14and generates a C++ file with a byte array representing graph that can be
15used as a memory efficient replacement for the perfect hash table.
16
17The input strings are assumed to consist of printable 7-bit ASCII characters
18and the return values are assumed to be one digit integers.
19
20In this program a DAFSA is a diamond shaped graph starting at a common
21source node and ending at a common sink node. All internal nodes contain
22a label and each word is represented by the labels in one path from
23the source node to the sink node.
24
25The following python represention is used for nodes:
26
27  Source node: [ children ]
28  Internal node: (label, [ children ])
29  Sink node: None
30
31The graph is first compressed by prefixes like a trie. In the next step
32suffixes are compressed so that the graph gets diamond shaped. Finally
33one to one linked nodes are replaced by nodes with the labels joined.
34
35The order of the operations is crucial since lookups will be performed
36starting from the source with no backtracking. Thus a node must have at
37most one child with a label starting by the same character. The output
38is also arranged so that all jumps are to increasing addresses, thus forward
39in memory.
40
41The generated output has suffix free decoding so that the sign of leading
42bits in a link (a reference to a child node) indicate if it has a size of one,
43two or three bytes and if it is the last outgoing link from the actual node.
44A node label is terminated by a byte with the leading bit set.
45
46The generated byte array can described by the following BNF:
47
48<byte> ::= < 8-bit value in range [0x00-0xFF] >
49
50<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
51<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
52<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
53
54<offset1> ::= < byte in range [0x00-0x3F] >
55<offset2> ::= < byte in range [0x40-0x5F] >
56<offset3> ::= < byte in range [0x60-0x7F] >
57
58<end_offset1> ::= < byte in range [0x80-0xBF] >
59<end_offset2> ::= < byte in range [0xC0-0xDF] >
60<end_offset3> ::= < byte in range [0xE0-0xFF] >
61
62<prefix> ::= <char>
63
64<label> ::= <end_char>
65          | <char> <label>
66
67<end_label> ::= <return_value>
68          | <char> <end_label>
69
70<offset> ::= <offset1>
71           | <offset2> <byte>
72           | <offset3> <byte> <byte>
73
74<end_offset> ::= <end_offset1>
75               | <end_offset2> <byte>
76               | <end_offset3> <byte> <byte>
77
78<offsets> ::= <end_offset>
79            | <offset> <offsets>
80
81<source> ::= <offsets>
82
83<node> ::= <label> <offsets>
84         | <prefix> <node>
85         | <end_label>
86
87<dafsa> ::= <source>
88          | <dafsa> <node>
89
90Decoding:
91
92<char> -> printable 7-bit ASCII character
93<end_char> & 0x7F -> printable 7-bit ASCII character
94<return value> & 0x0F -> integer
95<offset1 & 0x3F> -> integer
96((<offset2> & 0x1F>) << 8) + <byte> -> integer
97((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer
98
99end_offset1, end_offset2 and and_offset3 are decoded same as offset1,
100offset2 and offset3 respectively.
101
102The first offset in a list of offsets is the distance in bytes between the
103offset itself and the first child node. Subsequent offsets are the distance
104between previous child node and next child node. Thus each offset links a node
105to a child node. The distance is always counted between start addresses, i.e.
106first byte in decoded offset or first byte in child node.
107
108Example 1:
109
110%%
111aa, 1
112a, 2
113%%
114
115The input is first parsed to a list of words:
116["aa1", "a2"]
117
118A fully expanded graph is created from the words:
119source = [node1, node4]
120node1 = ("a", [node2])
121node2 = ("a", [node3])
122node3 = ("\x01", [sink])
123node4 = ("a", [node5])
124node5 = ("\x02", [sink])
125sink = None
126
127Compression results in the following graph:
128source = [node1]
129node1 = ("a", [node2, node3])
130node2 = ("\x02", [sink])
131node3 = ("a\x01", [sink])
132sink = None
133
134A C++ representation of the compressed graph is generated:
135
136const unsigned char dafsa[7] = {
137  0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
138};
139
140The bytes in the generated array has the following meaning:
141
142 0: 0x81 <end_offset1>  child at position 0 + (0x81 & 0x3F) -> jump to 1
143
144 1: 0xE1 <end_char>     label character (0xE1 & 0x7F) -> match "a"
145 2: 0x02 <offset1>      child at position 2 + (0x02 & 0x3F) -> jump to 4
146
147 3: 0x81 <end_offset1>  child at position 4 + (0x81 & 0x3F) -> jump to 5
148 4: 0x82 <return_value> 0x82 & 0x0F -> return 2
149
150 5: 0x61 <char>         label character 0x61 -> match "a"
151 6: 0x81 <return_value> 0x81 & 0x0F -> return 1
152
153Example 2:
154
155%%
156aa, 1
157bbb, 2
158baa, 1
159%%
160
161The input is first parsed to a list of words:
162["aa1", "bbb2", "baa1"]
163
164Compression results in the following graph:
165source = [node1, node2]
166node1 = ("b", [node2, node3])
167node2 = ("aa\x01", [sink])
168node3 = ("bb\x02", [sink])
169sink = None
170
171A C++ representation of the compressed graph is generated:
172
173const unsigned char dafsa[11] = {
174  0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
175};
176
177The bytes in the generated array has the following meaning:
178
179 0: 0x02 <offset1>      child at position 0 + (0x02 & 0x3F) -> jump to 2
180 1: 0x83 <end_offset1>  child at position 2 + (0x83 & 0x3F) -> jump to 5
181
182 2: 0xE2 <end_char>     label character (0xE2 & 0x7F) -> match "b"
183 3: 0x02 <offset1>      child at position 3 + (0x02 & 0x3F) -> jump to 5
184 4: 0x83 <end_offset1>  child at position 5 + (0x83 & 0x3F) -> jump to 8
185
186 5: 0x61 <char>         label character 0x61 -> match "a"
187 6: 0x61 <char>         label character 0x61 -> match "a"
188 7: 0x81 <return_value> 0x81 & 0x0F -> return 1
189
190 8: 0x62 <char>         label character 0x62 -> match "b"
191 9: 0x62 <char>         label character 0x62 -> match "b"
19210: 0x82 <return_value> 0x82 & 0x0F -> return 2
193"""
194
195import sys
196
197class InputError(Exception):
198  """Exception raised for errors in the input file."""
199
200
201def to_dafsa(words):
202  """Generates a DAFSA from a word list and returns the source node.
203
204  Each word is split into characters so that each character is represented by
205  a unique node. It is assumed the word list is not empty.
206  """
207  if not words:
208    raise InputError('The domain list must not be empty')
209  def ToNodes(word):
210    """Split words into characters"""
211    if not 0x1F < ord(word[0]) < 0x80:
212      raise InputError('Domain names must be printable 7-bit ASCII')
213    if len(word) == 1:
214      return chr(ord(word[0]) & 0x0F), [None]
215    return word[0], [ToNodes(word[1:])]
216  return [ToNodes(word) for word in words]
217
218
219def to_words(node):
220  """Generates a word list from all paths starting from an internal node."""
221  if not node:
222    return ['']
223  return [(node[0] + word) for child in node[1] for word in to_words(child)]
224
225
226def reverse(dafsa):
227  """Generates a new DAFSA that is reversed, so that the old sink node becomes
228  the new source node.
229  """
230  sink = []
231  nodemap = {}
232
233  def dfs(node, parent):
234    """Creates reverse nodes.
235
236    A new reverse node will be created for each old node. The new node will
237    get a reversed label and the parents of the old node as children.
238    """
239    if not node:
240      sink.append(parent)
241    elif id(node) not in nodemap:
242      nodemap[id(node)] = (node[0][::-1], [parent])
243      for child in node[1]:
244        dfs(child, nodemap[id(node)])
245    else:
246      nodemap[id(node)][1].append(parent)
247
248  for node in dafsa:
249    dfs(node, None)
250  return sink
251
252
253def join_labels(dafsa):
254  """Generates a new DAFSA where internal nodes are merged if there is a one to
255  one connection.
256  """
257  parentcount = { id(None): 2 }
258  nodemap = { id(None): None }
259
260  def count_parents(node):
261    """Count incoming references"""
262    if id(node) in parentcount:
263      parentcount[id(node)] += 1
264    else:
265      parentcount[id(node)] = 1
266      for child in node[1]:
267        count_parents(child)
268
269  def join(node):
270    """Create new nodes"""
271    if id(node) not in nodemap:
272      children = [join(child) for child in node[1]]
273      if len(children) == 1 and parentcount[id(node[1][0])] == 1:
274        child = children[0]
275        nodemap[id(node)] = (node[0] + child[0], child[1])
276      else:
277        nodemap[id(node)] = (node[0], children)
278    return nodemap[id(node)]
279
280  for node in dafsa:
281    count_parents(node)
282  return [join(node) for node in dafsa]
283
284
285def join_suffixes(dafsa):
286  """Generates a new DAFSA where nodes that represent the same word lists
287  towards the sink are merged.
288  """
289  nodemap = { frozenset(('',)): None }
290
291  def join(node):
292    """Returns a macthing node. A new node is created if no matching node
293    exists. The graph is accessed in dfs order.
294    """
295    suffixes = frozenset(to_words(node))
296    if suffixes not in nodemap:
297      nodemap[suffixes] = (node[0], [join(child) for child in node[1]])
298    return nodemap[suffixes]
299
300  return [join(node) for node in dafsa]
301
302
303def top_sort(dafsa):
304  """Generates list of nodes in topological sort order."""
305  incoming = {}
306
307  def count_incoming(node):
308    """Counts incoming references."""
309    if node:
310      if id(node) not in incoming:
311        incoming[id(node)] = 1
312        for child in node[1]:
313          count_incoming(child)
314      else:
315        incoming[id(node)] += 1
316
317  for node in dafsa:
318    count_incoming(node)
319
320  for node in dafsa:
321    incoming[id(node)] -= 1
322
323  waiting = [node for node in dafsa if incoming[id(node)] == 0]
324  nodes = []
325
326  while waiting:
327    node = waiting.pop()
328    assert incoming[id(node)] == 0
329    nodes.append(node)
330    for child in node[1]:
331      if child:
332        incoming[id(child)] -= 1
333        if incoming[id(child)] == 0:
334          waiting.append(child)
335  return nodes
336
337
338def encode_links(children, offsets, current):
339  """Encodes a list of children as one, two or three byte offsets."""
340  if not children[0]:
341    # This is an <end_label> node and no links follow such nodes
342    assert len(children) == 1
343    return []
344  guess = 3 * len(children)
345  assert children
346  children = sorted(children, key = lambda x: -offsets[id(x)])
347  while True:
348    offset = current + guess
349    buf = []
350    for child in children:
351      last = len(buf)
352      distance = offset - offsets[id(child)]
353      assert distance > 0 and distance < (1 << 21)
354
355      if distance < (1 << 6):
356        # A 6-bit offset: "s0xxxxxx"
357        buf.append(distance)
358      elif distance < (1 << 13):
359        # A 13-bit offset: "s10xxxxxxxxxxxxx"
360        buf.append(0x40 | (distance >> 8))
361        buf.append(distance & 0xFF)
362      else:
363        # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
364        buf.append(0x60 | (distance >> 16))
365        buf.append((distance >> 8) & 0xFF)
366        buf.append(distance & 0xFF)
367      # Distance in first link is relative to following record.
368      # Distance in other links are relative to previous link.
369      offset -= distance
370    if len(buf) == guess:
371      break
372    guess = len(buf)
373  # Set most significant bit to mark end of links in this node.
374  buf[last] |= (1 << 7)
375  buf.reverse()
376  return buf
377
378
379def encode_prefix(label):
380  """Encodes a node label as a list of bytes without a trailing high byte.
381
382  This method encodes a node if there is exactly one child  and the
383  child follows immidiately after so that no jump is needed. This label
384  will then be a prefix to the label in the child node.
385  """
386  assert label
387  return [ord(c) for c in reversed(label)]
388
389
390def encode_label(label):
391  """Encodes a node label as a list of bytes with a trailing high byte >0x80.
392  """
393  buf = encode_prefix(label)
394  # Set most significant bit to mark end of label in this node.
395  buf[0] |= (1 << 7)
396  return buf
397
398
399def encode(dafsa):
400  """Encodes a DAFSA to a list of bytes"""
401  output = []
402  offsets = {}
403
404  for node in reversed(top_sort(dafsa)):
405    if (len(node[1]) == 1 and node[1][0] and
406        (offsets[id(node[1][0])] == len(output))):
407      output.extend(encode_prefix(node[0]))
408    else:
409      output.extend(encode_links(node[1], offsets, len(output)))
410      output.extend(encode_label(node[0]))
411    offsets[id(node)] = len(output)
412
413  output.extend(encode_links(dafsa, offsets, len(output)))
414  output.reverse()
415  return output
416
417
418def to_cxx(data):
419  """Generates C++ code from a list of encoded bytes."""
420  text = '/* This file is generated. DO NOT EDIT!\n\n'
421  text += 'The byte array encodes effective tld names. See make_dafsa.py for'
422  text += ' documentation.'
423  text += '*/\n\n'
424  text += 'const unsigned char kDafsa[%s] = {\n' % len(data)
425  for i in range(0, len(data), 12):
426    text += '  '
427    text += ', '.join('0x%02x' % byte for byte in data[i:i + 12])
428    text += ',\n'
429  text += '};\n'
430  return text
431
432
433def words_to_cxx(words):
434  """Generates C++ code from a word list"""
435  dafsa = to_dafsa(words)
436  for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
437    dafsa = fun(dafsa)
438  return to_cxx(encode(dafsa))
439
440
441def parse_gperf(infile):
442  """Parses gperf file and extract strings and return code"""
443  lines = [line.strip() for line in infile]
444  # Extract strings after the first '%%' and before the second '%%'.
445  begin = lines.index('%%') + 1
446  end = lines.index('%%', begin)
447  lines = lines[begin:end]
448  for line in lines:
449    if line[-3:-1] != ', ':
450      raise InputError('Expected "domainname, <digit>", found "%s"' % line)
451    # Technically the DAFSA format could support return values in range [0-31],
452    # but the values below are the only with a defined meaning.
453    if line[-1] not in '0124':
454      raise InputError('Expected value to be one of {0,1,2,4}, found "%s"' %
455                       line[-1])
456  return [line[:-3] + line[-1] for line in lines]
457
458
459def main():
460  if len(sys.argv) != 3:
461    print('usage: %s infile outfile' % sys.argv[0])
462    return 1
463  with open(sys.argv[1], 'r') as infile, open(sys.argv[2], 'w') as outfile:
464    outfile.write(words_to_cxx(parse_gperf(infile)))
465  return 0
466
467
468if __name__ == '__main__':
469  sys.exit(main())
470