parse_html_deps.py revision b2cbf1594f8d6e4ba32d384cf379f62a74ed7654
1# Copyright (c) 2013 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import os
6import sys
7
8from tvcm import module
9from tvcm import strip_js_comments
10from tvcm import html_generation_controller
11
12
def _InitBeautifulSoup():
  """Puts the bundled BeautifulSoup directory at the front of sys.path.

  The directory is third_party/beautifulsoup under the parent of the
  directory that holds this file. Nothing is changed when that path is
  already present in sys.path.
  """
  parent_dir = os.path.abspath(
      os.path.join(os.path.dirname(__file__), '..'))
  bs_path = os.path.join(parent_dir, 'third_party', 'beautifulsoup')
  if bs_path not in sys.path:
    sys.path.insert(0, bs_path)
19
20
21_InitBeautifulSoup()
22import BeautifulSoup
23import polymer_soup
24
25
class InlineScript(object):
  """The text of one inline <script>, plus values lazily derived from it.

  Wraps the parsed node it is given and exposes the script text, the text
  with JS comments stripped, and the chain of tags that enclose the script.
  """

  def __init__(self, soup):
    """Stores the parsed node this script is read from.

    Args:
      soup: The parsed node holding the script text. Must be truthy.

    Raises:
      module.DepsException: If soup is falsy.
    """
    if not soup:
      raise module.DepsException('InlineScript created without soup')
    self._soup = soup
    # None marks "not computed yet" for both lazily cached values below.
    self._stripped_contents = None
    self._open_tags = None

  @property
  def contents(self):
    """The script text, converted with unicode()."""
    #TODO(nednguyen): change other places to use unicode() instead of str().
    return unicode(self._soup.string)

  @property
  def stripped_contents(self):
    """The script text after StripJSComments; computed once, then cached."""
    # Test against None rather than truthiness: a script that strips down to
    # an empty string is a valid cached result and must not be recomputed.
    if self._stripped_contents is None:
      self._stripped_contents = strip_js_comments.StripJSComments(
          self.contents)
    return self._stripped_contents

  @property
  def open_tags(self):
    """The tags enclosing the <script> element, outermost first.

    Each entry is a _Tag built from an ancestor's name and attrs. The walk
    stops at the BeautifulSoup document root, and the script tag itself is
    dropped from the result. Computed once, then cached.
    """
    # Test against None rather than truthiness: an empty list (a script with
    # no enclosing tags) is a valid cached result.
    if self._open_tags is not None:
      return self._open_tags
    open_tags = []
    cur = self._soup.parent
    while cur:
      if isinstance(cur, BeautifulSoup.BeautifulSoup):
        break

      open_tags.append(_Tag(cur.name, cur.attrs))
      cur = cur.parent

    # Collected innermost-first; flip so the outermost tag leads.
    open_tags.reverse()
    # The innermost ancestor is expected to be the script tag; it is not one
    # of the tags *around* the script, so remove it.
    assert open_tags[-1].tag == 'script'
    del open_tags[-1]

    self._open_tags = open_tags
    return self._open_tags
65
66
def _IsDoctype(x):
  """Returns True when x is an HTML doctype declaration node.

  Args:
    x: A parsed node.

  Returns:
    True if x is a BeautifulSoup.Declaration equal to 'DOCTYPE html' or
    'DOCTYPE HTML'; False otherwise.
  """
  return (isinstance(x, BeautifulSoup.Declaration) and
          x in ('DOCTYPE html', 'DOCTYPE HTML'))
71
72
class HTMLModuleParserResults(object):
  """Parsed view of one HTML module.

  Exposes the module's scripts, imports and stylesheets, and can regenerate
  the HTML with those dependencies removed or rewritten.
  """

  def __init__(self, html):
    """Parses html with PolymerSoup.

    Args:
      html: The HTML text of the module.
    """
    self._soup = polymer_soup.PolymerSoup(html)
    # None marks "not built yet"; see inline_scripts.
    self._inline_scripts = None

  @property
  def has_decl(self):
    """True when exactly one top-level node is an HTML doctype."""
    decls = [x for x in self._soup.contents
             if _IsDoctype(x)]
    return len(decls) == 1

  @property
  def scripts_external(self):
    """The src values of all script tags matched with src=True."""
    tags = self._soup.findAll('script', src=True)
    return [t['src'] for t in tags]

  @property
  def inline_scripts(self):
    """InlineScript objects for the script tags matched with src=None.

    Built on first access and cached.

    Raises:
      module.DepsException: Raised by InlineScript when a matched tag's
          string is falsy.
    """
    # Test against None rather than truthiness so that a module with no
    # inline scripts (an empty list) is not re-scanned on every access.
    if self._inline_scripts is None:
      tags = self._soup.findAll('script', src=None)
      self._inline_scripts = [InlineScript(t.string) for t in tags]
    return self._inline_scripts

  @property
  def imports(self):
    """The href values of all <link rel="import"> tags."""
    tags = self._soup.findAll('link', rel='import')
    return [t['href'] for t in tags]

  @property
  def stylesheets(self):
    """The href values of all <link rel="stylesheet"> tags."""
    tags = self._soup.findAll('link', rel='stylesheet')
    return [t['href'] for t in tags]

  @property
  def inline_stylesheets(self):
    """The str() of each <style> tag's string."""
    tags = self._soup.findAll('style')
    return [str(t.string) for t in tags]

  def YieldHTMLInPieces(self, controller, minify=False):
    """Yields the generated HTML as a single piece.

    Args:
      controller: Passed through to GenerateHTML.
      minify: Passed through to GenerateHTML.
    """
    yield self.GenerateHTML(controller, minify)

  def GenerateHTML(self, controller, minify=False):
    """Returns the module's HTML with dependencies stripped or rewritten.

    Works on a freshly parsed copy of the document, so the soup held by this
    object is not modified. Doctype declarations, <link rel="import"> tags
    and all script tags are removed. Inline styles and stylesheet links are
    replaced by whatever the controller returns for them, or removed when
    the controller returns a falsy value.

    Args:
      controller: Object providing GetHTMLForInlineStylesheet(contents) and
          GetHTMLForStylesheetHRef(href).
      minify: When true, HTML comments are removed as well.

    Returns:
      The resulting document as a str, with surrounding whitespace stripped.
    """
    soup = polymer_soup.PolymerSoup(str(self._soup))

    # Remove declaration. Iterate over a snapshot: extract() removes the node
    # from soup.contents, and mutating that list while looping over it would
    # skip the node right after each removed one.
    for x in list(soup.contents):
      if _IsDoctype(x):
        x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
      imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
      script.extract()

    # Remove all in-line scripts.
    scripts_inline = soup.findAll('script', src=None)
    for script in scripts_inline:
      script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
      html = controller.GetHTMLForInlineStylesheet(str(style.string))
      if html:
        ns = BeautifulSoup.Tag(soup, 'style')
        ns.append(BeautifulSoup.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        # The controller's HTML must parse to exactly one element, which
        # takes the place of the original link tag.
        tmp = polymer_soup.PolymerSoup(html).findChildren()
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, BeautifulSoup.Comment))
      for comment in comments:
        comment.extract()

    # We are done.
    return str(soup).strip()

  @property
  def html_contents_without_links_and_script(self):
    """GenerateHTML output using a default HTMLGenerationController."""
    return self.GenerateHTML(
        html_generation_controller.HTMLGenerationController())
174
175
class _Tag(object):
  """Plain record of a tag's name and attributes."""

  def __init__(self, tag, attrs):
    """Stores the tag name and its attributes.

    Args:
      tag: The tag name.
      attrs: An iterable of entries whose first two items are an attribute's
          name and value.
    """
    self.attrs = attrs
    self.tag = tag

  def __repr__(self):
    """Formats the tag as '<name key="value" ...>'."""
    rendered = []
    for entry in self.attrs:
      rendered.append('%s="%s"' % (entry[0], entry[1]))
    return '<%s %s>' % (self.tag, ' '.join(rendered))
185
186
class HTMLModuleParser(object):
  """Entry point that turns HTML text into HTMLModuleParserResults."""

  def Parse(self, html):
    """Parses html and returns the results wrapper.

    Args:
      html: The HTML text to parse. None is treated as the empty string.

    Returns:
      An HTMLModuleParserResults built from html.

    Raises:
      Exception: If html contains the literal text '< /script>'.
    """
    if html is None:
      html = ''
    elif '< /script>' in html:
      # The doubled backslash produces the same backslash-slash text as
      # before without relying on an unrecognized escape sequence.
      raise Exception('Escape script tags with <\\/script>')

    return HTMLModuleParserResults(html)
197