#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.


"""Extracts registration forms from the corresponding HTML files.

Used for extracting forms within HTML files. This script is used in
conjunction with the webforms_aggregator.py script, which aggregates web pages
with fillable forms (i.e. registration forms).

The purpose of this script is to extract out all non-form elements that may be
causing parsing errors and timeout issues when running browser_tests.

This script extracts all forms from a HTML file.
If there are multiple forms per downloaded site, multiple files are created
for each form.

Used as a standalone script but assumes that it is run from the directory in
which it is checked into.

Usage: forms_extractor.py [options]

Options:
  -l LOG_LEVEL, --log_level=LOG_LEVEL,
    LOG_LEVEL: debug, info, warning or error [default: error]
  -j, --js  extracts javascript elements from web form.
  -h, --help  show this help message and exit
"""

import glob
import io
import logging
from optparse import OptionParser
import os
import re
import sys


def _OpenText(path, mode):
  """Opens |path| in text mode ('r' or 'w') on both Python 2 and Python 3.

  On Python 2 this is the builtin open(); reads use universal newlines ('U').
  On Python 3 the 'U' mode no longer exists (text mode already translates
  newlines), and the file is decoded/encoded as latin-1: that codec maps every
  byte to exactly one character, so pages in any encoding pass through
  unchanged, just as they do with Python 2 byte strings. The patterns used by
  this script only need to recognize ASCII markup.

  Args:
    path: the file to open.
    mode: 'r' for reading or 'w' for writing.

  Returns:
    An open file object, usable as a context manager.
  """
  if sys.version_info[0] >= 3:
    return io.open(path, mode, encoding='latin-1')
  return open(path, 'U' if mode == 'r' else mode)


class FormsExtractor(object):
  """Extracts HTML files, leaving only registration forms from the HTML file."""
  _HTML_FILES_PATTERN = r'*.html'
  _HTML_FILE_PREFIX = r'grabber-'
  _FORM_FILE_PREFIX = r'grabber-stripped-'

  _REGISTRATION_PAGES_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
                                         'heuristics', 'input')
  _EXTRACTED_FORMS_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
                                      'heuristics', 'input')

  # The logger and the registry of its console handler are class attributes,
  # so they are shared by every FormsExtractor instance.
  logger = logging.getLogger(__name__)
  log_handlers = {'StreamHandler': None}

  # NOTE: the patterns below are plain raw strings (not |ur'...'| literals,
  # which are a syntax error on Python 3). They contain only ASCII, so they
  # match the same text on Python 2 and Python 3.

  # This pattern is used for retrieving the form location comment located at the
  # top of each downloaded HTML file indicating where the form originated from.
  _RE_FORM_LOCATION_PATTERN = re.compile(
      r"""
      <!--Form\s{1}Location:  # Starting of form location comment.
      .*?                     # Any characters (non-greedy).
      -->                     # Ending of the form comment.
      """, re.U | re.S | re.I | re.X)

  # This pattern is used for removing all script code.
  _RE_SCRIPT_PATTERN = re.compile(
      r"""
      <script       # A new opening '<script' tag.
      \b            # The end of the word 'script'.
      .*?           # Any characters (non-greedy).
      >             # Ending of the (opening) tag: '>'.
      .*?           # Any characters (non-greedy) between the tags.
      </script\s*>  # The '</script>' closing tag.
      """, re.U | re.S | re.I | re.X)

  # This pattern is used for removing all href js code.
  _RE_HREF_JS_PATTERN = re.compile(
      r"""
      \bhref             # The word href and its beginning.
      \s*=\s*            # The '=' with all whitespace before and after it.
      (?P<quote>[\'\"])  # A single or double quote which is captured.
      \s*javascript\s*:  # The word 'javascript:' with any whitespace possible.
      .*?                # Any characters (non-greedy) between the quotes.
      \1                 # The previously captured single or double quote.
      """, re.U | re.S | re.I | re.X)

  _RE_EVENT_EXPR = (
      r"""
      \b                 # The beginning of a new word.
      on\w+?             # All words starting with 'on' (non-greedy)
                         # example: |onmouseover|.
      \s*=\s*            # The '=' with all whitespace before and after it.
      (?P<quote>[\'\"])  # A captured single or double quote.
      .*?                # Any characters (non-greedy) between the quotes.
      \1                 # The previously captured single or double quote.
      """)

  # This pattern is used for removing code with js events, such as |onload|.
  # By adding the leading |'<[^<>]*?'| and the trailing |'[^<>]*?>'| the
  # pattern matches to strings such as '<tr class="nav"
  # onmouseover="mOvr1(this);" onmouseout="mOut1(this);">'
  _RE_TAG_WITH_EVENTS_PATTERN = re.compile(
      r"""
      <        # Matches character '<'.
      [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).
      """ +
      _RE_EVENT_EXPR +
      r"""
      [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).
      >        # Matches character '>'.
      """, re.U | re.S | re.I | re.X)

  # Adds whitespace chars at the end of the matched event. Also match trailing
  # whitespaces for JS events. Do not match leading whitespace.
  # For example: |< /form>| is invalid HTML and does not exist but |</form >| is
  # considered valid HTML.
  _RE_EVENT_PATTERN = re.compile(
      _RE_EVENT_EXPR + r'\s*', re.U | re.S | re.I | re.X)

  # This pattern is used for finding form elements.
  _RE_FORM_PATTERN = re.compile(
      r"""
      <form       # A new opening '<form' tag.
      \b          # The end of the word 'form'.
      .*?         # Any characters (non-greedy).
      >           # Ending of the (opening) tag: '>'.
      .*?         # Any characters (non-greedy) between the tags.
      </form\s*>  # The '</form>' closing tag.
      """, re.U | re.S | re.I | re.X)

  def __init__(self, input_dir=_REGISTRATION_PAGES_DIR,
               output_dir=_EXTRACTED_FORMS_DIR, logging_level=None):
    """Creates a FormsExtractor object.

    The output directory is created if it does not exist yet.

    Args:
      input_dir: the directory of HTML files.
      output_dir: the directory where the registration form files will be
          saved.
      logging_level: verbosity level, default is None. When set, a console
          handler is attached to the shared class logger (once); when not
          set, a previously attached console handler is removed.

    Raises:
      IOError exception if input directory doesn't exist.
    """
    if logging_level:
      if not self.log_handlers['StreamHandler']:
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        self.log_handlers['StreamHandler'] = console
        self.logger.addHandler(console)
      self.logger.setLevel(logging_level)
    else:
      if self.log_handlers['StreamHandler']:
        self.logger.removeHandler(self.log_handlers['StreamHandler'])
        self.log_handlers['StreamHandler'] = None

    self._input_dir = input_dir
    self._output_dir = output_dir
    if not os.path.isdir(self._input_dir):
      error_msg = 'Directory "%s" doesn\'t exist.' % self._input_dir
      self.logger.error('Error: %s', error_msg)
      raise IOError(error_msg)
    if not os.path.isdir(output_dir):
      os.makedirs(output_dir)
    # NOTE(review): not read anywhere in this file; kept in case code outside
    # this file relies on the attribute existing.
    self._form_location_comment = ''

  def _SubstituteAllEvents(self, matchobj):
    """Remove all js events that are present as attributes within a tag.

    Args:
      matchobj: A regexp |re.MatchObject| containing text that has at least one
          event. Example: |<tr class="nav" onmouseover="mOvr1(this);"
          onmouseout="mOut1(this);">|.

    Returns:
      The text of the tag with every event attribute (and the whitespace that
      follows it) removed; all other attributes are kept.
      Example: |<tr class="nav" >|.
    """
    tag_with_all_attrs = matchobj.group(0)
    return self._RE_EVENT_PATTERN.sub('', tag_with_all_attrs)

  def _StripJavascript(self, html_content):
    """Returns |html_content| without scripts, js hrefs and event attributes.

    Args:
      html_content: the text of a HTML page.

    Returns:
      The page text after removing, in this order: '<script>' elements,
      'href="javascript:..."' attributes and 'on...' event attributes.
    """
    without_scripts = self._RE_SCRIPT_PATTERN.sub('', html_content)
    without_js_hrefs = self._RE_HREF_JS_PATTERN.sub('', without_scripts)
    return self._RE_TAG_WITH_EVENTS_PATTERN.sub(
        self._SubstituteAllEvents, without_js_hrefs)

  def _WriteFile(self, path, *chunks):
    """Writes |chunks| one after the other to |path|, logging any IOError.

    Args:
      path: the file to create or overwrite.
      *chunks: the strings to write, in order.

    Returns:
      True if the file was written, False if an IOError occurred (the error is
      logged, not raised).
    """
    try:
      with _OpenText(path, 'w') as f:
        for chunk in chunks:
          f.write(chunk)
    except IOError as e:
      self.logger.error('Error: %s', e)
      return False
    return True

  def Extract(self, strip_js_only):
    """Extracts and saves the extracted registration forms.

    Iterates through all the HTML files of the input directory. Javascript is
    always stripped from the content. Each input file |grabber-NAME.html|
    produces either |grabber-stripped-NAME.html| (js stripping only) or one
    |grabber-stripped-NAME<N>.html| file per form, N counting from 1.

    Args:
      strip_js_only: If True, only Javascript is stripped from the HTML content.
          Otherwise, all non-form elements are stripped.
    """
    pathname_pattern = os.path.join(self._input_dir, self._HTML_FILES_PATTERN)
    html_files = [f for f in glob.glob(pathname_pattern) if os.path.isfile(f)]
    for filename in html_files:
      self.logger.info('Stripping file "%s" ...', filename)
      with _OpenText(filename, 'r') as f:
        html_content = self._StripJavascript(f.read())

      basename = os.path.basename(filename)  # Path dropped.
      basename = basename.replace(self._HTML_FILE_PREFIX, '', 1)
      (stem, extension) = os.path.splitext(basename)
      # The output names are assembled by concatenation rather than with
      # '%' formatting, so a '%' in the directory or file name is harmless.
      output_stem = os.path.join(self._output_dir,
                                 self._FORM_FILE_PREFIX + stem)
      if strip_js_only:
        succeeded = self._WriteFile(output_stem + extension, html_content)
      else:  # Remove all non form elements.
        match = self._RE_FORM_LOCATION_PATTERN.search(html_content)
        if match:
          # The file is written in text mode, which already converts '\n' to
          # the platform line ending; os.linesep would yield '\r\r\n' on
          # Windows.
          form_location_comment = match.group() + '\n'
        else:
          form_location_comment = ''
        succeeded = True
        forms_iterator = self._RE_FORM_PATTERN.finditer(html_content)
        for form_number, form_match in enumerate(forms_iterator, start=1):
          numbered_form_filename = output_stem + str(form_number) + extension
          # A failed form does not stop the remaining forms from being
          # written, but the file is no longer reported as a success.
          if not self._WriteFile(numbered_form_filename,
                                 form_location_comment, form_match.group()):
            succeeded = False
      if succeeded:
        self.logger.info('\tFile "%s" extracted SUCCESSFULLY!', filename)


def main():
  """Parses the command line and runs the extractor on the default dirs.

  Returns:
    0 on success, 1 if the log level argument is not recognized.
  """
  parser = OptionParser()
  parser.add_option(
      '-l', '--log_level', metavar='LOG_LEVEL', default='error',
      help='LOG_LEVEL: debug, info, warning or error [default: %default]')
  parser.add_option(
      '-j', '--js', dest='js', action='store_true', default=False,
      help='Removes all javascript elements [default: %default]')

  (options, _) = parser.parse_args()
  options.log_level = options.log_level.upper()
  if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
    # Single-argument call form: valid on both Python 2 and Python 3.
    print('Wrong log_level argument.')
    parser.print_help()
    return 1

  options.log_level = getattr(logging, options.log_level)
  extractor = FormsExtractor(logging_level=options.log_level)
  extractor.Extract(options.js)
  return 0


if __name__ == '__main__':
  sys.exit(main())