1#!/usr/bin/env python
2# Copyright (c) 2011 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6
7"""Extracts registration forms from the corresponding HTML files.
8
9Used for extracting forms within HTML files. This script is used in
10conjunction with the webforms_aggregator.py script, which aggregates web pages
with fillable forms (i.e. registration forms).
12
The purpose of this script is to strip out all non-form elements that may be
causing parsing errors and timeout issues when running browser_tests.
15
This script extracts all forms from an HTML file.
17If there are multiple forms per downloaded site, multiple files are created
18for each form.
19
20Used as a standalone script but assumes that it is run from the directory in
21which it is checked into.
22
23Usage: forms_extractor.py [options]
24
25Options:
26  -l LOG_LEVEL, --log_level=LOG_LEVEL,
27    LOG_LEVEL: debug, info, warning or error [default: error]
  -j, --js  strips only javascript from the HTML; non-form elements are kept.
29  -h, --help  show this help message and exit
30"""
31
import glob
import io
import logging
from optparse import OptionParser
import os
import re
import sys
38
39
class FormsExtractor(object):
  """Strips downloaded HTML pages down to their registration forms.

  Every file matching |_HTML_FILES_PATTERN| in the input directory is read and
  has its javascript removed (<script> elements, 'javascript:' hrefs and on*
  event attributes). The result is written to the output directory either
  whole (javascript-only mode) or as one file per <form> element.

  Only constructs valid on both Python 2.6+ and Python 3 are used.
  """
  _HTML_FILES_PATTERN = r'*.html'
  _HTML_FILE_PREFIX = r'grabber-'
  _FORM_FILE_PREFIX = r'grabber-stripped-'

  # Files are decoded and re-encoded as Latin-1, which maps every byte to
  # exactly one code point. Pages therefore round-trip byte for byte whatever
  # their real encoding is; the patterns below only anchor on ASCII markup.
  _FILE_ENCODING = 'latin-1'

  _REGISTRATION_PAGES_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
                                         'heuristics', 'input')
  _EXTRACTED_FORMS_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
                                      'heuristics', 'input')

  # The logger and the handler registry are class attributes, so they are
  # shared by every FormsExtractor instance.
  logger = logging.getLogger(__name__)
  log_handlers = {'StreamHandler': None}

  # This pattern is used for retrieving the form location comment located at the
  # top of each downloaded HTML file indicating where the form originated from.
  _RE_FORM_LOCATION_PATTERN = re.compile(
      r"""
      <!--Form\sLocation:  # Starting of form location comment.
      .*?                  # Any characters (non-greedy).
      -->                  # Ending of the form comment.
      """, re.U | re.S | re.I | re.X)

  # This pattern is used for removing all script code.
  _RE_SCRIPT_PATTERN = re.compile(
      r"""
      <script       # A new opening '<script' tag.
      \b            # The end of the word 'script'.
      .*?           # Any characters (non-greedy).
      >             # Ending of the (opening) tag: '>'.
      .*?           # Any characters (non-greedy) between the tags.
      </script\s*>  # The '</script>' closing tag.
      """, re.U | re.S | re.I | re.X)

  # This pattern is used for removing all href js code.
  _RE_HREF_JS_PATTERN = re.compile(
      r"""
      \bhref             # The word href and its beginning.
      \s*=\s*            # The '=' with all whitespace before and after it.
      (?P<quote>[\'\"])  # A single or double quote which is captured.
      \s*javascript\s*:  # The word 'javascript:' with any whitespace possible.
      .*?                # Any characters (non-greedy) between the quotes.
      (?P=quote)         # The previously captured single or double quote.
      """, re.U | re.S | re.I | re.X)

  # Matches a single js event attribute. The closing quote is referenced by
  # name rather than by number so that the expression stays correct when it is
  # embedded in a larger pattern.
  _RE_EVENT_EXPR = (
      r"""
      \b                 # The beginning of a new word.
      on\w+?             # All words starting with 'on' (non-greedy)
                         # example: |onmouseover|.
      \s*=\s*            # The '=' with all whitespace before and after it.
      (?P<quote>[\'\"])  # A captured single or double quote.
      .*?                # Any characters (non-greedy) between the quotes.
      (?P=quote)         # The previously captured single or double quote.
      """)

  # This pattern is used for finding tags that carry js events, such as
  # |onload|. Wrapping the event expression between a leading |<[^<>]*?| and a
  # trailing |[^<>]*?>| makes the pattern match whole tags such as
  # '<tr class="nav" onmouseover="mOvr1(this);" onmouseout="mOut1(this);">'.
  _RE_TAG_WITH_EVENTS_PATTERN = re.compile(
      r"""
      <        # Matches character '<'.
      [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).""" +
      _RE_EVENT_EXPR +
      r"""
      [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).
      >        # Matches character '>'.
      """, re.U | re.S | re.I | re.X)

  # Matches one js event together with the whitespace that follows it. The
  # whitespace in front of the event is deliberately not matched, so it is
  # left in the tag when the event is removed.
  _RE_EVENT_PATTERN = re.compile(
      _RE_EVENT_EXPR + r'\s*', re.U | re.S | re.I | re.X)

  # This pattern is used for finding form elements.
  _RE_FORM_PATTERN = re.compile(
      r"""
      <form       # A new opening '<form' tag.
      \b          # The end of the word 'form'.
      .*?         # Any characters (non-greedy).
      >           # Ending of the (opening) tag: '>'.
      .*?         # Any characters (non-greedy) between the tags.
      </form\s*>  # The '</form>' closing tag.
      """, re.U | re.S | re.I | re.X)

  def __init__(self, input_dir=_REGISTRATION_PAGES_DIR,
               output_dir=_EXTRACTED_FORMS_DIR, logging_level=None):
    """Creates a FormsExtractor object.

    Args:
      input_dir: the directory of HTML files.
      output_dir: the directory where the registration form files will be
                  saved. It is created if it does not exist yet.
      logging_level: verbosity level, default is None. A false value detaches
                     the console handler from the shared logger.

    Raises:
      IOError exception if input directory doesn't exist.
    """
    if logging_level:
      # Attach the console handler only once, however many instances exist.
      if not self.log_handlers['StreamHandler']:
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        self.log_handlers['StreamHandler'] = console
        self.logger.addHandler(console)
      self.logger.setLevel(logging_level)
    else:
      if self.log_handlers['StreamHandler']:
        self.logger.removeHandler(self.log_handlers['StreamHandler'])
        self.log_handlers['StreamHandler'] = None

    self._input_dir = input_dir
    self._output_dir = output_dir
    if not os.path.isdir(self._input_dir):
      error_msg = 'Directory "%s" doesn\'t exist.' % self._input_dir
      self.logger.error('Error: %s', error_msg)
      raise IOError(error_msg)
    if not os.path.isdir(output_dir):
      os.makedirs(output_dir)

  def _SubstituteAllEvents(self, matchobj):
    """Remove all js events that are present as attributes within a tag.

    Args:
      matchobj: A regexp |re.MatchObject| containing text that has at least one
                event. Example: |<tr class="nav" onmouseover="mOvr1(this);"
                onmouseout="mOut1(this);">|.

    Returns:
      The text of the tag with every event attribute removed. Whitespace that
      follows an event is removed with it, whitespace in front of it is kept.
      Example: |<tr class="nav" >|.
    """
    tag_with_all_attrs = matchobj.group(0)
    return self._RE_EVENT_PATTERN.sub('', tag_with_all_attrs)

  def _StripJavascript(self, html_content):
    """Returns |html_content| without scripts, js hrefs and js events."""
    html_content = self._RE_SCRIPT_PATTERN.sub('', html_content)
    html_content = self._RE_HREF_JS_PATTERN.sub('', html_content)
    return self._RE_TAG_WITH_EVENTS_PATTERN.sub(
        self._SubstituteAllEvents, html_content)

  def _OutputFilename(self, html_filename, form_number=None):
    """Returns the output path that corresponds to |html_filename|.

    Args:
      html_filename: path of the HTML file being stripped.
      form_number: number of the form within the file, or None when the whole
                   file is written out.

    Returns:
      The path, inside the output directory, of the file to write. The name is
      assembled by concatenation, so a '%' in a file or directory name is
      harmless.
    """
    basename = os.path.basename(html_filename)
    # Drop the download prefix only when it really is a prefix.
    if basename.startswith(self._HTML_FILE_PREFIX):
      basename = basename[len(self._HTML_FILE_PREFIX):]
    root, extension = os.path.splitext(basename)
    suffix = '' if form_number is None else str(form_number)
    return os.path.join(
        self._output_dir, self._FORM_FILE_PREFIX + root + suffix + extension)

  def _WriteFile(self, filename, content):
    """Writes |content| to |filename|.

    Args:
      filename: path of the file to create or overwrite.
      content: text to write.

    Returns:
      True if the file was written, False if an IOError was raised (the error
      is logged).
    """
    try:
      with io.open(filename, 'w', encoding=self._FILE_ENCODING) as f:
        f.write(content)
    except IOError as e:
      self.logger.error('Error: %s', e)
      return False
    return True

  def _ExtractForms(self, html_filename, html_content):
    """Writes every form found in |html_content| to a numbered file.

    Args:
      html_filename: path of the HTML file |html_content| was read from.
      html_content: the javascript-free text of that file.

    Returns:
      True if no write failed, False otherwise.
    """
    match = self._RE_FORM_LOCATION_PATTERN.search(html_content)
    # The files are written in text mode, which already converts '\n' to the
    # platform line ending; os.linesep would end up as '\r\r\n' on Windows.
    location_comment = match.group() + '\n' if match else ''
    all_written = True
    forms_iterator = self._RE_FORM_PATTERN.finditer(html_content)
    for form_number, form_match in enumerate(forms_iterator, start=1):
      form_filename = self._OutputFilename(html_filename, form_number)
      if not self._WriteFile(form_filename,
                             location_comment + form_match.group()):
        all_written = False
    return all_written

  def Extract(self, strip_js_only):
    """Extracts and saves the extracted registration forms.

    Iterates through all the HTML files of the input directory. Files whose
    name starts with |_FORM_FILE_PREFIX| are skipped: they are the output of a
    previous run, which lands in the input directory when both directories are
    the same (the default). A file that cannot be read or written is logged
    and does not stop the processing of the other files.

    Args:
      strip_js_only: If True, only Javascript is stripped from the HTML content.
                     Otherwise, all non-form elements are stripped.
    """
    pathname_pattern = os.path.join(self._input_dir, self._HTML_FILES_PATTERN)
    html_files = [f for f in glob.glob(pathname_pattern) if os.path.isfile(f)]
    for filename in html_files:
      if os.path.basename(filename).startswith(self._FORM_FILE_PREFIX):
        self.logger.info('Skipping already stripped file "%s".', filename)
        continue
      self.logger.info('Stripping file "%s" ...', filename)
      try:
        # Text mode translates every line ending to '\n' while reading.
        with io.open(filename, 'r', encoding=self._FILE_ENCODING) as f:
          html_content = self._StripJavascript(f.read())
      except IOError as e:
        self.logger.error('Error: %s', e)
        continue

      if strip_js_only:
        succeeded = self._WriteFile(self._OutputFilename(filename),
                                    html_content)
      else:  # Remove all non form elements.
        succeeded = self._ExtractForms(filename, html_content)
      if succeeded:
        self.logger.info('\tFile "%s" extracted SUCCESSFULLY!', filename)
230
def main(argv=None):
  """Parses the command line and runs the forms extraction.

  Args:
    argv: list of command-line arguments, without the program name. Defaults
          to None, in which case the arguments are taken from sys.argv.

  Returns:
    0 once the extraction has run, 1 if the log level argument is not one of
    debug, info, warning or error.
  """
  parser = OptionParser()
  parser.add_option(
      '-l', '--log_level', metavar='LOG_LEVEL', default='error',
      help='LOG_LEVEL: debug, info, warning or error [default: %default]')
  parser.add_option(
      '-j', '--js', dest='js', action='store_true', default=False,
      help='Removes all javascript elements [default: %default]')

  # Positional arguments are not used by this script.
  options, _ = parser.parse_args(argv)
  log_level_name = options.log_level.upper()
  if log_level_name not in ('DEBUG', 'INFO', 'WARNING', 'ERROR'):
    # The parenthesized form prints the same text on Python 2 and Python 3.
    print('Wrong log_level argument.')
    parser.print_help()
    return 1

  # Map the level name onto the numeric constant of the logging module.
  extractor = FormsExtractor(logging_level=getattr(logging, log_level_name))
  extractor.Extract(options.js)
  return 0
251
252
if __name__ == '__main__':
  # Run only when executed as a script; main()'s return value becomes the
  # exit status of the process.
  exit_status = main()
  sys.exit(exit_status)
255