1#!/usr/bin/env python
2#
3# Copyright 2007 The Closure Linter Authors. All Rights Reserved.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#      http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS-IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17"""Utilities for dealing with HTML."""
18
19__author__ = ('robbyw@google.com (Robert Walker)')
20
21import cStringIO
22import formatter
23import htmllib
24import HTMLParser
25import re
26
27
28class ScriptExtractor(htmllib.HTMLParser):
29  """Subclass of HTMLParser that extracts script contents from an HTML file.
30
31  Also inserts appropriate blank lines so that line numbers in the extracted
32  code match the line numbers in the original HTML.
33  """
34
35  def __init__(self):
36    """Initialize a ScriptExtractor."""
37    htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
38    self._in_script = False
39    self._text = ''
40
41  def start_script(self, attrs):
42    """Internal handler for the start of a script tag.
43
44    Args:
45      attrs: The attributes of the script tag, as a list of tuples.
46    """
47    for attribute in attrs:
48      if attribute[0].lower() == 'src':
49        # Skip script tags with a src specified.
50        return
51    self._in_script = True
52
53  def end_script(self):
54    """Internal handler for the end of a script tag."""
55    self._in_script = False
56
57  def handle_data(self, data):
58    """Internal handler for character data.
59
60    Args:
61      data: The character data from the HTML file.
62    """
63    if self._in_script:
64      # If the last line contains whitespace only, i.e. is just there to
65      # properly align a </script> tag, strip the whitespace.
66      if data.rstrip(' \t') != data.rstrip(' \t\n\r\f'):
67        data = data.rstrip(' \t')
68      self._text += data
69    else:
70      self._AppendNewlines(data)
71
72  def handle_comment(self, data):
73    """Internal handler for HTML comments.
74
75    Args:
76      data: The text of the comment.
77    """
78    self._AppendNewlines(data)
79
80  def _AppendNewlines(self, data):
81    """Count the number of newlines in the given string and append them.
82
83    This ensures line numbers are correct for reported errors.
84
85    Args:
86      data: The data to count newlines in.
87    """
88    # We append 'x' to both sides of the string to ensure that splitlines
89    # gives us an accurate count.
90    for i in xrange(len(('x' + data + 'x').splitlines()) - 1):
91      self._text += '\n'
92
93  def GetScriptLines(self):
94    """Return the extracted script lines.
95
96    Returns:
97      The extracted script lines as a list of strings.
98    """
99    return self._text.splitlines()
100
101
def GetScriptLines(f):
  """Read an HTML file and return the contents of its script tags.

  Args:
    f: The HTML file, as an object with a read() method.

  Returns:
    Lines in the HTML file that are from script tags, as a list of strings.
  """
  # Text such as Array.<!string> trips up the HTML parser. As a workaround,
  # every '<' that is directly followed by something other than whitespace, a
  # word character or '/' is rewritten to '&lt;'. Escaping only the text
  # inside script tags would be better, but finding that text is the very
  # job the parser is needed for.
  escaped_html = re.sub(r'<([^\s\w/])', r'&lt;\1', f.read())

  parser = ScriptExtractor()
  parser.feed(escaped_html)
  parser.close()
  return parser.GetScriptLines()
124
125
def StripTags(str):
  """Returns the string with HTML tags stripped.

  Args:
    str: An html string.

  Returns:
    The html string with all tags stripped. Wherever the parser reports an
    error, the text in front of the error position is kept verbatim (tags
    included), the single character at the error position is dropped, and
    stripping resumes with the remainder of the string.
  """
  # Brute force approach to stripping as much HTML as possible. If there is a
  # parsing error, don't strip text before parse error position, and continue
  # trying from there.
  final_text = ''
  finished = False
  while not finished:
    try:
      strip = _HtmlStripper()
      strip.feed(str)
      strip.close()
      final_text += strip.get_output()
      finished = True
    except HTMLParser.HTMLParseError as e:
      # The exception reports its position as a 1-based line number plus a
      # column within that line, so the column alone is only a valid index
      # into str for errors on the first line. Walk to the start of the
      # reported line to obtain the absolute index. The "or" fallbacks cover
      # an exception raised without position information.
      line_start = 0
      for _ in range((e.lineno or 1) - 1):
        line_start = str.find('\n', line_start) + 1
      error_index = line_start + (e.offset or 0)

      final_text += str[:error_index]
      # Skipping the offending character guarantees that the remaining input
      # shrinks on every failed attempt, so the loop terminates.
      str = str[error_index + 1:]

  return final_text
154
155
156class _HtmlStripper(HTMLParser.HTMLParser):
157  """Simple class to strip tags from HTML.
158
159  Does so by doing nothing when encountering tags, and appending character data
160  to a buffer when that is encountered.
161  """
162  def __init__(self):
163    self.reset()
164    self.__output = cStringIO.StringIO()
165
166  def handle_data(self, d):
167    self.__output.write(d)
168
169  def get_output(self):
170    return self.__output.getvalue()
171