1#!/usr/bin/python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Module for parsing TCG TPM2 library specification in HTML format.
7
8This module processes parts 2 and 3 of the specification, extracting
9information related to tables defined in the documents, feeding the
10information into the Table object for further processing and creating the
11appropriate TPM2 objects.
12"""
13
14from __future__ import print_function
15
16import HTMLParser
17import os
18import re
19import sys
20
21import tpm_table
22
# Matches the beginning of a specification table title: optional leading
# whitespace, the word 'Table', whitespace and the decimal table number.
table_name = re.compile(
    r'^\s*Table\s+[0-9]+')
24
25
class SpecParser(HTMLParser.HTMLParser):
  """A class for parsing TCG specifications in html format.

  The parser is a small state machine driven by the HTMLParser callbacks. It
  looks for table titles, then feeds rows, cells and cell data of the table
  following the title into a tpm_table.Table object.
  """

  # The state machine of the parser could be in one of the following states.
  ANCHOR = 0       # Look for table title anchor
  TABLE_NAME = 1   # Look for table title in the data stream
  TABLE_BODY = 2   # Scraping the actual table body
  MAYBE_DONE = 3   # Could be over, unless a single spec table is split in
                   # multiple HTML tables (to continue on the next page)
  SKIP_HEADER = 4  # Ignore the header of the split tables

  def __init__(self):
    """Initialize a parser object to default state."""
    HTMLParser.HTMLParser.__init__(self)
    self._state = self.ANCHOR
    self._title = ''
    self._table = tpm_table.Table()
    self._previous_table_number = 0  # Used to check if there are skipped tables

  def _Normalize(self, data):
    """Normalize HTML data.

    HTML files generated from TCG specifications sometimes include utf8
    characters (like long dashes), which appear only in comments/table titles
    and can be safely ignored.

    Args:
     data: a string representing portion of data from the HTML being parsed.

    Returns:
      a string, a single space followed by the unescaped input data with all
                 characters outside the ASCII range excluded.
    """
    return ' ' + ''.join(x for x in self.unescape(data) if ord(x) < 128)

  def GetTable(self):
    """Return the Table object containing all information parsed so far."""
    return self._table

  def _SetState(self, new_state):
    """Switch the state machine to a new state.

    The accumulated title is dropped each time the TABLE_NAME state is
    entered from a different state.

    Args:
      new_state: one of the state constants defined in this class.
    """
    if self._state != new_state:
      self._state = new_state
      if new_state == self.TABLE_NAME:
        self._title = ''

  def handle_starttag(self, tag, attrs):
    """Invoked each time a new HTML tag is opened.

    This method drives changes in the parser FSM states, its heuristics are
    derived from the format of the HTML files the TCG specs get converted to.

    Each specification table is preceded with a title. The title is wrapped
    in an anchor tag with a property 'name' set to 'bookmark#xxx'. The title
    text starts with ' Table [0-9]+ '. Once the table title is detected,
    the state machine switches to looking for the actual HTML table, i.e. tags
    'table', 'tr' and 'td' (the generated specs do not use the 'th' tags).

    Large specification tables can be split into multiple HTML tables (so that
    they fit in a page). This is why the presence of the closing 'table' tag
    is not enough to close the parsing of the current specification table.

    In some cases the next table is defined in the spec immediately after the
    current one - this is when the new anchor tag is used as a signal that the
    previous table has been completely consumed.

    Args:
      tag: a string, the HTML tag
      attrs: a tuple of zero or more two-string tuples, the first element -
             the HTML tag's attribute, the second element - the attribute
             value.
    """
    if tag == 'a':
      # The value of an attribute given without '=' is reported as None, it
      # must not be treated as a bookmark (and must not cause an exception).
      is_bookmark = any(
          name == 'name' and value and value.startswith('bookmark')
          for name, value in attrs)
      if is_bookmark:
        if self._state == self.ANCHOR:
          self._SetState(self.TABLE_NAME)
        elif self._state == self.MAYBE_DONE:
          # Done indeed
          self._table.ProcessTable()
          self._table.Init()
          self._SetState(self.TABLE_NAME)
        elif self._state == self.TABLE_NAME:
          # A new anchor while still collecting the title: start over.
          self._title = ''
    elif tag == 'p' and self._state == self.TABLE_NAME and not self._title:
      # This was not a valid table start, back to looking for the right anchor.
      self._SetState(self.ANCHOR)
    elif self._state == self.TABLE_NAME and tag == 'table':
      title_match = table_name.search(self._title)
      if not title_match:
        # Table title does not match the expected format - back to square one.
        self._SetState(self.ANCHOR)
        return  # will have to start over
      # Take the number from the matched text: the regular expression
      # guarantees it consists of digits only, while the second word of the
      # title could have other characters attached to the number.
      table_number = int(title_match.group().split()[1])
      self._previous_table_number += 1
      if table_number > self._previous_table_number:
        print('Table(s) %s missing' % ' '.join(
            '%d' % x for x in
            range(self._previous_table_number, table_number)), file=sys.stderr)
        self._previous_table_number = table_number
      self._table.Init(self._title)
      self._SetState(self.TABLE_BODY)
    elif self._state == self.MAYBE_DONE and tag == 'tr':
      # Another HTML table follows: the spec table continues, the first row
      # of the continuation is a repeated header.
      self._SetState(self.SKIP_HEADER)
    elif self._state == self.SKIP_HEADER and tag == 'tr':
      self._SetState(self.TABLE_BODY)
      self._table.NewRow()
    elif self._state == self.TABLE_BODY:
      if tag == 'tr':
        self._table.NewRow()
      elif tag == 'td':
        self._table.NewCell()

  def handle_endtag(self, tag):
    """Invoked each time an HTML tag is closed.

    Args:
      tag: a string, the HTML tag being closed.
    """
    if tag == 'table' and self._table.InProgress():
      self._SetState(self.MAYBE_DONE)

  def handle_data(self, data):
    """Process data outside HTML tags.

    Depending on the state the data is added to the table title, added to the
    table being scraped, or is used as an indication that the previous table
    is complete.

    Args:
      data: a string, the text found between HTML tags.
    """
    if self._state == self.TABLE_NAME:
      self._title += ' %s' % self._Normalize(data)
    elif self._state == self.TABLE_BODY:
      self._table.AddData(self._Normalize(data))
    elif self._state == self.MAYBE_DONE:
      # Done indeed
      self._table.ProcessTable()
      self._table.Init()
      self._SetState(self.ANCHOR)

  def close(self):
    """Finish processing of the HTML buffer."""
    # Let the base class process whatever input it still has buffered, this
    # could invoke the handlers above and change the parser state.
    HTMLParser.HTMLParser.close(self)
    if self._state in (self.TABLE_BODY, self.MAYBE_DONE):
      self._table.ProcessTable()
    self._state = self.ANCHOR

  def handle_entityref(self, name):
    """Process HTML escape sequence.

    Only a few entities are recognized, all others are ignored.

    Args:
      name: a string, the name of the entity (without '&' and ';').
    """
    entmap = {
        'amp': '&',
        'gt': '>',
        'lt': '<',
        'quot': '"',
    }
    if name in entmap:
      if self._state == self.TABLE_BODY:
        self._table.AddData(entmap[name])
      elif self._state == self.TABLE_NAME:
        self._title += entmap[name]
172
173
def main(structs_html_file_name):
  """Parse the passed in spec file and print the generated .h file contents.

  Args:
    structs_html_file_name: a string, the name of the HTML file to parse.
  """
  spec_parser = SpecParser()
  with open(structs_html_file_name) as spec_file:
    spec_text = spec_file.read()
  spec_parser.feed(spec_text)
  spec_parser.close()
  parsed_table = spec_parser.GetTable()
  print(parsed_table.GetHFile())
182
if __name__ == '__main__':
  # Exactly one command line argument is expected: the name of the HTML file.
  if len(sys.argv) == 2:
    main(sys.argv[1])
  else:
    script_name = os.path.basename(sys.argv[0])
    print('%s: One parameter is required, the name of the html file '
          'which is the TPM2 library Part 2 specification' % script_name,
          file=sys.stderr)
    sys.exit(1)
190