#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Downloads web pages with fillable forms after parsing through a set of links.

Used for collecting web pages with forms. Used as a standalone script.
This script assumes that it is run from the directory in which it is checked
in. If the script is run from anywhere else, the path in REGISTER_PAGE_DIR
needs to be changed.

This script requires the following third-party modules to be installed:
httplib2, lxml, pycurl.

Usage: webforms_aggregator.py [options] [single url or file containing urls]

Options:
  -l LOG_LEVEL, --log_level LOG_LEVEL
    LOG_LEVEL: debug, info, warning or error [default: error]
  -h, --help  show this help message and exit
"""

import datetime
import errno
import logging
import optparse
import os
import re
# Needed in Linux so that PyCurl does not throw a segmentation fault.
import signal
import sys
import tempfile
import threading
import time
import urlparse

import httplib2
from lxml import html, etree
import pycurl
REGISTER_PAGE_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
                                 'heuristics', 'input')
NOT_FOUND_REG_PAGE_SITES_FILENAME = 'notFoundRegPageSites.txt'

FORM_LOCATION_COMMENT = 'Form Location: %s'
HTML_FILE_PREFIX = 'grabber-'

MAX_REDIRECTIONS = 10

# Strings in a webpage that are indicative of a registration link.
LINK_CLUES = ['regist', 'user', 'sign', 'login', 'account']

MAX_SAME_DOMAIN_URLS_NO = 30
MAX_TOTAL_URLS_PER_DOMAIN = 300
MAX_OPEN_FILES_NO = 500
# URLs are selected for downloading with the following rules from the link
# lists, giving more weight to the links that contain a link clue.
CLUE_SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
CLUE_GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10
GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10

MAX_ALLOWED_THREADS = MAX_OPEN_FILES_NO / MAX_SAME_DOMAIN_URLS_NO + 1
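# With the constants above and Python 2 integer division, the per-iteration
# quotas are 9 + 9 + 6 + 6 = 30 URLs per domain, and MAX_ALLOWED_THREADS is
# 500 / 30 + 1 = 17.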


class Retriever(object):
  """Downloads, parses, and checks if a web page contains a registration form.

  The objects of this class have a one-to-one relation with the web pages: an
  object of this class is created for each page that is downloaded and parsed.
  Each Retriever object creates a curl object, which is added to the curl
  multi object of the crawler object so that the corresponding pages get
  downloaded.
  """
  logger = logging.getLogger(__name__)

  def __init__(self, url, domain, cookie_file):
    """Initializes a Retriever object.

    Args:
      url: url to download page from.
      domain: only links with this domain will be retrieved.
      cookie_file: the name of a cookie file, needed for pages that use session
          cookies to change their contents.
    """
    self._url = url
    self._domain = domain
    self._html_content = ''

    # Http links without clues from LINK_CLUES.
    self._general_links = []
    # Http links that contain a clue from LINK_CLUES.
    self._clues_general_links = []
    # Https links that do not contain any clues from LINK_CLUES.
    self._secure_links = []
    # Https links that contain a clue from LINK_CLUES.
    self._clues_secure_links = []
    self._cookie_file = cookie_file
    self._curl_object = None

  def __del__(self):
    """Cleans up before this object is destroyed.

    The function closes the corresponding curl object that does the downloading.
    """
    if self._curl_object:
      self._curl_object.close()

  def _AddLink(self, link):
    """Adds url |link|, if not already present, to the appropriate list.

    The link only gets added to the single list that is appropriate for it:
    _secure_links, _general_links, _clues_secure_links or _clues_general_links.

    Args:
      link: the url that is inserted into the appropriate links list.
    """
    # Handles sites with unicode URLs.
    if isinstance(link, unicode):
      # Encode in 'utf-8' to avoid the UnicodeEncodeError exception.
      link = httplib2.iri2uri(link).encode('utf-8')
    link_parsed = urlparse.urlparse(link)
    link_lists = [self._clues_secure_links, self._secure_links,
                  self._clues_general_links, self._general_links]
    # Only adds the link if it is within the domain and not already collected.
    if (self._domain in link_parsed[1] and
        all(link not in x for x in link_lists)):
      for clue in LINK_CLUES:
        if clue in link.lower():
          if link_parsed[0].startswith('https'):
            self._clues_secure_links.append(link)
            return
          else:
            self._clues_general_links.append(link)
            return
      if link_parsed[0].startswith('https'):  # No clues found in the link.
        self._secure_links.append(link)
      else:
        self._general_links.append(link)

  def ParseAndGetLinks(self):
    """Parses the downloaded page and gets its links if it is not a reg page.

    Checks if the current page contains a registration form. If it does, the
    page is saved to a file named 'grabber-' + domain + '.html' after the
    FORM_LOCATION_COMMENT has been added to it, and True is returned.
    Otherwise the page's links are collected and False is returned.

    Returns:
      True if the current page contains a registration form, False otherwise.

    Raises:
      IOError: If the page cannot be written to a file.
    """
    if not self._domain:
      self.logger.error('Error: self._domain was not set')
      sys.exit(1)
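    # The regex below scans the raw page source for quoted absolute
    # ('http://...', 'https://...') or protocol-relative ('//...') URLs, which
    # also catches links that appear outside of <a> tags (e.g. in scripts).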
    match_list = re.findall(r'(?P<quote>[\'\"])(?P<link>(?:https?:)?//.*?)\1',
                            self._html_content)
    for group_list in match_list:
      link = group_list[1]
      if link.startswith('//'):
        link = urlparse.urljoin(self._url, link)
      self._AddLink(link)
    try:
      tree = html.fromstring(self._html_content, parser=html.HTMLParser())
    except etree.LxmlError:
      self.logger.info('\t\tSkipping: not valid HTML code in this page <<< %s',
                       self._url)
      return False
    try:
      body = tree.iter('body').next()
    except StopIteration:
      self.logger.info('\t\tSkipping: no "BODY" tag in this page <<< %s',
                       self._url)
      return False

    # Get a list of all input elements with attribute type='password'.
    password_elements = list(body.iterfind('.//input[@type="password"]'))
    # Check for multiple password elements to distinguish between a login form
    # and a registration form (Password field and Confirm Password field).
    if len(password_elements) >= 2:
      form_elements = []
      for password_elem in password_elements:
        form_elem = password_elem.xpath('ancestor::form[1]')
        if not form_elem:
          continue
        if form_elem[0] not in form_elements:
          form_elements.append(form_elem[0])
        else:
          # Two password fields inside the same form: the page is considered
          # to contain a registration form.
          if not os.path.isdir(REGISTER_PAGE_DIR):
            os.makedirs(REGISTER_PAGE_DIR)
          # Locate the HTML tag and insert the form location comment after it.
          html_tag = tree.iter('html').next()
          comment = etree.Comment(FORM_LOCATION_COMMENT % self._url)
          html_tag.insert(0, comment)
          # Create a new file and save the HTML registration page code.
          f = open(os.path.join(REGISTER_PAGE_DIR, '%s%s.html' % (
              HTML_FILE_PREFIX, self._domain)), 'w')
          try:
            f.write(html.tostring(tree, pretty_print=True))
          except IOError as e:
            self.logger.error('Error: %s', e)
            raise
          finally:
            f.close()
          return True  # Registration page found.
    # The page is not a registration page; parse its links instead.
    link_elements = list(body.iter('a'))
    for link_elem in link_elements:
      link = link_elem.get('href')
      if not link or '#' == link[0]:
        continue
      link = urlparse.urljoin(self._url, link)
      link_parsed = urlparse.urlparse(link)
      if not link_parsed[0].startswith('http'):
        continue
      self._AddLink(link)
    return False  # Registration page not found.

  def InitRequestHead(self):
    """Initializes curl object for a HEAD request.

    A HEAD request is made first so that the headers can tell us whether this
    is a valid HTML page. If it is not, no GET request is made, which avoids
    an unnecessary download.
    """
    self._curl_object = pycurl.Curl()
    self._curl_object.setopt(pycurl.URL, self._url)
    # The following line works around an error in the GnuTLS package that
    # pycurl depends on for getting https pages.
    self._curl_object.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)
    self._curl_object.setopt(pycurl.FOLLOWLOCATION, True)
    self._curl_object.setopt(pycurl.NOBODY, True)
    self._curl_object.setopt(pycurl.SSL_VERIFYPEER, False)
    self._curl_object.setopt(pycurl.MAXREDIRS, MAX_REDIRECTIONS)
    self._curl_object.setopt(pycurl.FAILONERROR, False)
    self._curl_object.setopt(pycurl.COOKIEFILE, self._cookie_file)
    self._curl_object.setopt(pycurl.COOKIEJAR, self._cookie_file)
    self._curl_object.setopt(pycurl.CONNECTTIMEOUT, 30)
    self._curl_object.setopt(pycurl.TIMEOUT, 300)
    self._curl_object.setopt(pycurl.NOSIGNAL, 1)

  def InitRequestGet(self):
    """Initializes curl object for a GET request.

    This is called only for valid HTML pages. Pycurl performs the GET request,
    but the page data does not all arrive at once: each time a chunk is
    downloaded, pycurl hands it to the write callback, which appends it to the
    page content until everything has been downloaded.
    """
    self._curl_object.setopt(pycurl.NOBODY, False)
    self._curl_object.setopt(
        pycurl.WRITEFUNCTION, lambda buff: setattr(
            self, '_html_content', self._html_content + buff))

  def Download(self):
    """Downloads the self._url page.

    It first does a HEAD request and then it proceeds to a GET request.
    It uses a curl object for a single download. This function is called only
    once for the initial url of a site when we still don't have more urls from
    a domain.

    Returns:
      True, if the downloaded page is valid HTML code, or False otherwise.
    """
    self.InitRequestHead()
    try:
      self._curl_object.perform()
    except pycurl.error as e:
      self.logger.error('Error: %s, url: %s', e, self._url)
      return False
    self._url = urlparse.urljoin(
        self._url, self._curl_object.getinfo(pycurl.EFFECTIVE_URL))
    content_type = self._curl_object.getinfo(pycurl.CONTENT_TYPE)
    if content_type and ('text/html' in content_type.lower()):
      self.InitRequestGet()
      try:
        self._curl_object.perform()
      except pycurl.error as e:
        self.logger.error('Error: %s, url: %s', e, self._url)
        return False
      return True
    else:
      self.logger.info('\tSkipping: Not an HTML page <<< %s', self._url)
      return False

  def Run(self):
    """Called only once for the initial url when we do not have more urls.

    Downloads the originally-specified site url, parses it and gets the links.

    Returns:
      True, if a registration page is found, and False otherwise.
    """
    if self.Download():
      if not self._domain:
        url_parsed = urlparse.urlparse(self._url)
        self._domain = url_parsed[1]
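        # Normalize the domain by dropping a leading 'www' label, e.g.
        # 'www.example.com' becomes 'example.com', so that links using either
        # form of the host still match self._domain in _AddLink().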
        if self._domain.startswith('www'):
          self._domain = '.'.join(self._domain.split('.')[1:])
      if self.ParseAndGetLinks():
        return True
    return False


class Crawler(object):
  """Crawls a site until a registration page is found or the URL limit is hit.

  Creates, uses and destroys Retriever objects. Creates a cookie temp file
  needed for session cookies. It keeps track of the site's 'visited links' and
  'links to visit' using the links discovered by each Retriever object. Use
  Run() to crawl the site.
  """
  try:
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
  except AttributeError:
    # SIGPIPE is not available on all platforms (e.g. Windows).
    pass
  logger = logging.getLogger(__name__)

  def __init__(self, url, logging_level=None):
    """Initializes crawler URL, link lists, logger, and a cookie temp file.

    The cookie temp file is needed for session cookies.

    Args:
      url: the initial "seed" url of the site.
      logging_level: the desired verbosity level, default is None.
    """
    if logging_level:
      self.logger.setLevel(logging_level)

    self.url_error = False
    url_parsed = urlparse.urlparse(url)
    if not url_parsed[0].startswith('http'):
      self.logger.error(
          'Error: "%s" does not begin with http:// or https://', url)
      self.url_error = True
      return
    # Example: if url is 'http://www.example.com?name=john' then url_parsed[1]
    # (the network location) is 'www.example.com'.
    if not url_parsed[1]:
      self.logger.error('Error: "%s" is not a valid url', url)
      self.url_error = True
      return
    self._url = url
    self._domain = ''
    # Http links that contain a clue from LINK_CLUES.
    self._clues_general_links = []
    # Http links that do not contain any clue from LINK_CLUES.
    self._general_links = []
    # Https links that contain a clue from LINK_CLUES.
    self._clues_secure_links = []
    # Https links that do not contain any clue from LINK_CLUES.
    self._secure_links = []
    # All links downloaded and parsed so far.
    self._links_visited = []
    self._retrievers_list = []
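    # pycurl's COOKIEFILE/COOKIEJAR options expect a filename, so a named temp
    # file is created with delete=False, closed immediately, and only its name
    # is kept; __del__ deletes the file when the Crawler is destroyed.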
    self._cookie_file = tempfile.NamedTemporaryFile(
        suffix='.cookie', delete=False)
    self._cookie_file.close()
    self._cookie_file = self._cookie_file.name  # Keep only the filename.

  def __del__(self):
    """Deletes cookie file when Crawler instances are destroyed."""
    if hasattr(self, '_cookie_file'):
      self.logger.info('Deleting cookie file %s ...', self._cookie_file)
      os.unlink(self._cookie_file)

  def _MultiPerform(self, curl_multi_object):
    """Performs concurrent downloads using a CurlMulti object.

    Args:
      curl_multi_object: a curl object that downloads multiple pages
          concurrently. The class of this object is |pycurl.CurlMulti|.
    """
    # The following code is based on the CurlMulti example at
    # http://pycurl.sourceforge.net/doc/curlmultiobject.html.
    while True:
      ret, no_handles = curl_multi_object.perform()
      if ret != pycurl.E_CALL_MULTI_PERFORM:
        break
    while no_handles:
      curl_multi_object.select(1.0)
      while True:
        ret, no_handles = curl_multi_object.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
          break

  def _GetLinksPages(self, curl_multi_object):
    """Downloads many pages concurrently using a CurlMulti object.

    Creates many Retriever objects and adds them to a list. The constant
    MAX_SAME_DOMAIN_URLS_NO defines the number of pages that can be downloaded
    concurrently from the same domain using the pycurl multi object. It's
    currently set to 30 URLs. These URLs are taken from the links lists,
    abbreviated here as csl (clues secure links), cgl (clues general links),
    sl (secure links), and gl (general links). The rules define how many URLs
    are taken from each list during each iteration.

    Example of the rules:
      3/10 from csl results in 9 URLs
      3/10 from cgl results in 9 URLs
      2/10 from sl results in 6 URLs
      2/10 from gl results in 6 URLs

    Adding up the above URLs gives 30 URLs that can be downloaded concurrently.
    If these lists have fewer items than the defined rules, for example if a
    site does not contain any secure links, then the csl and sl lists will have
    length 0 and only 15 pages would be downloaded concurrently from the same
    domain.

    Since 30 URLs can be handled concurrently, the number of links taken from
    the other lists can be increased. This means that we can take 24 links from
    the cgl list so that 24 from cgl + 6 from gl = 30 URLs. If the cgl list has
    fewer than 24 links, e.g. there are only 21 links, then 9 links may be
    taken from gl so 0 + 21 + 0 + 9 = 30.

    Args:
      curl_multi_object: Each Retriever object has a curl object which is
          added to the CurlMulti object.
    """
    self._retrievers_list = []

    csl_no = min(CLUE_SECURE_LINKS_NO, len(self._clues_secure_links))
    cgl_no = min(CLUE_GENERAL_LINKS_NO, len(self._clues_general_links))
    sl_no = min(SECURE_LINKS_NO, len(self._secure_links))
    gl_no = min(GENERAL_LINKS_NO, len(self._general_links))

    # If some lists have fewer links than their quota, the spare slots are
    # filled in the following priority order: csl, cgl, sl, gl.
    # c: clues, s: secure, g: general, l: list.
    spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      csl_no = min(csl_no + spare_links, len(self._clues_secure_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      cgl_no = min(cgl_no + spare_links, len(self._clues_general_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      sl_no = min(sl_no + spare_links, len(self._secure_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links > 0:
      gl_no = min(gl_no + spare_links, len(self._general_links))

    for no_of_links, links in [
        (csl_no, self._clues_secure_links),
        (sl_no, self._secure_links),
        (cgl_no, self._clues_general_links),
        (gl_no, self._general_links)]:
      for i in xrange(no_of_links):
        if not links:
          break
        url = links.pop(0)
        self._links_visited.append(url)
        r = Retriever(url, self._domain, self._cookie_file)
        r.InitRequestHead()
        curl_multi_object.add_handle(r._curl_object)
        self._retrievers_list.append(r)

    if self._retrievers_list:
      try:
        self._MultiPerform(curl_multi_object)
      except pycurl.error as e:
        self.logger.error('Error: %s, url: %s', e, self._url)
      finally:
        for r in self._retrievers_list:
          curl_multi_object.remove_handle(r._curl_object)
      # |_retrievers_list[:]| is a copy of |_retrievers_list| to avoid removing
      # items from the iterated list.
      for r in self._retrievers_list[:]:
        r._url = urlparse.urljoin(r._url, r._curl_object.getinfo(
            pycurl.EFFECTIVE_URL))
        content_type = r._curl_object.getinfo(pycurl.CONTENT_TYPE)
        if content_type and ('text/html' in content_type.lower()):
          r.InitRequestGet()
          curl_multi_object.add_handle(r._curl_object)
        else:
          self._retrievers_list.remove(r)
          self.logger.info('\tSkipping: Not an HTML page <<< %s', r._url)
      if self._retrievers_list:
        try:
          self._MultiPerform(curl_multi_object)
        except pycurl.error as e:
          self.logger.error('Error: %s, url: %s', e, self._url)
        finally:
          for r in self._retrievers_list:
            curl_multi_object.remove_handle(r._curl_object)
            self.logger.info('Downloaded: %s', r._url)

  def _LogRegPageFound(self, retriever):
    """Logs that a registration page was found.

    Args:
      retriever: the Retriever object that retrieved the page.
    """
    self.logger.info('\t##############################################')
    self.logger.info('\t### %s ###', retriever._domain)
    self.logger.info('\t##############################################')
    self.logger.info('\t!!!!!!!!!  registration page FOUND !!!!!!!!!!!')
    self.logger.info('\t%s', retriever._url)
    self.logger.info('\t##############################################')

  def _GetNewLinks(self, retriever):
    """Appends new links discovered by each retriever to the appropriate lists.

    Links are copied to the links lists of the crawler object, which hold all
    the links found by all the retrievers that the crawler object created. The
    Crawler object exists for as long as a specific site is examined, while a
    Retriever object exists only for as long as one page of that site is
    examined.

    Args:
      retriever: a temporary object that downloads a specific page, parses the
          content and gets the page's href links.
    """
    for link in retriever._clues_secure_links:
      if (link not in self._clues_secure_links and
          link not in self._links_visited):
        self._clues_secure_links.append(link)
    for link in retriever._secure_links:
      if (link not in self._secure_links and
          link not in self._links_visited):
        self._secure_links.append(link)
    for link in retriever._clues_general_links:
      if (link not in self._clues_general_links and
          link not in self._links_visited):
        self._clues_general_links.append(link)
    for link in retriever._general_links:
      if (link not in self._general_links and
          link not in self._links_visited):
        self._general_links.append(link)

  def Run(self):
    """Runs the Crawler.

    Creates a Retriever object and calls its Run() method to get the first
    links, and then uses a CurlMulti object and creates many Retriever objects
    to get the subsequent pages.

    The number of pages (= Retriever objects) created each time is restricted
    by MAX_SAME_DOMAIN_URLS_NO. After this number of Retriever objects download
    and parse their pages, we do the same again. The total number of pages
    visited is kept in urls_visited. If no registration page is found, the
    Crawler object gives up once MAX_TOTAL_URLS_PER_DOMAIN is reached.

    Returns:
      True if a registration page is found, False otherwise.
    """
    reg_page_found = False
    if self.url_error:
      return False
    r = Retriever(self._url, self._domain, self._cookie_file)
    if r.Run():
      self._LogRegPageFound(r)
      reg_page_found = True
    else:
      self._url = r._url
      self._domain = r._domain
      self.logger.info('url to crawl: %s', self._url)
      self.logger.info('domain: %s', self._domain)
      self._links_visited.append(r._url)
      self._GetNewLinks(r)
      urls_visited = 1
      while True:
        if (not (self._clues_secure_links or self._secure_links or
                self._clues_general_links or self._general_links) or
            urls_visited >= MAX_TOTAL_URLS_PER_DOMAIN):
          break  # Registration page not found.
        m = pycurl.CurlMulti()
        self._GetLinksPages(m)
        urls_visited += len(self._retrievers_list)
        self.logger.info('\t<----- URLs visited for domain "%s": %d ----->',
                         self._domain, urls_visited)
        for r in self._retrievers_list:
          if r.ParseAndGetLinks():
            self._LogRegPageFound(r)
            reg_page_found = True
            break
          else:
            self.logger.info('parsed: %s', r._url)
            self._GetNewLinks(r)
        m.close()
        if reg_page_found:
          break
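    # Dropping the remaining Retriever objects releases the list's references
    # to them so that their __del__ methods can run and close the underlying
    # curl objects.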
    while self._retrievers_list:
      r = self._retrievers_list.pop()
    return reg_page_found


class WorkerThread(threading.Thread):
  """Creates a new thread of execution."""
  def __init__(self, url):
    """Creates the _url and page_found attributes, used for later reporting.

    They are used after the thread's termination to create a file listing the
    urls for which a registration page was not found.

    Args:
      url: will be used as an argument to create a Crawler object later.
    """
    threading.Thread.__init__(self)
    self._url = url
    self.page_found = False

  def run(self):
    """Execution of the thread creates a Crawler object and runs it.

    Caution: this method must not be renamed to 'Run' or anything else, because
    it overrides the 'run' method of the 'threading.Thread' class; a renamed
    method would never be called.
    """
    self.page_found = Crawler(self._url).Run()


class ThreadedCrawler(object):
  """Runs WorkerThreads, each of which creates and runs a Crawler object.

  The Crawler objects run concurrently, each examining one site.
  """
  logger = logging.getLogger(__name__)

  def __init__(self, urls_file, logging_level=None):
    """Creates threaded Crawler objects.

    Args:
      urls_file: a text file containing a URL on each line.
      logging_level: verbosity level, default is None.

    Raises:
      IOError: If no valid URLs are found in the file.
    """
    if logging_level:
      self.logger.setLevel(logging_level)

    self._urls_list = []
    f = open(urls_file)
    try:
      for url in f.readlines():
        url = url.strip()
        if not urlparse.urlparse(url)[0].startswith('http'):
          self.logger.info(
              '%s: skipping this (does not begin with "http://" or'
              ' "https://")', url)
          continue
        self._urls_list.append(url)
    except IOError as e:
      self.logger.error('Error: %s', e)
      raise
    finally:
      f.close()
    if not self._urls_list:
      error_msg = 'No URLs were found.'
      self.logger.error('ERROR: %s', error_msg)
      raise IOError(error_msg)

  def Run(self):
    """Runs Crawler objects using Python threads.

    The number of concurrent threads is restricted to MAX_ALLOWED_THREADS.

    Returns:
      The number of registration pages found. -1 if no URLs are given.

    Raises:
      OSError: If the directory for the results file cannot be created.
    """
    if self._urls_list:
      allThreads = []
      # originalNumThreads is the number of threads just before the
      # ThreadedCrawler starts creating new threads. As a standalone script it
      # will be 1.
      originalNumThreads = threading.active_count()
      for url in self._urls_list:
        self.logger.info('URL fed to a crawler thread: %s', url)
        t = WorkerThread(url)
        t.start()
        allThreads.append(t)
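        # Throttle thread creation: wait until the number of live crawler
        # threads drops below MAX_ALLOWED_THREADS before starting another one.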
        while threading.active_count() >= (
            MAX_ALLOWED_THREADS + originalNumThreads):
          time.sleep(.4)
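      # Wait for all crawler threads to finish before reporting the results.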
      while threading.active_count() > originalNumThreads:
        time.sleep(.4)
      self.logger.info('----------------')
      self.logger.info('--- FINISHED ---')
      self.logger.info('----------------')
      urls_no = 0
      urls_not_found_no = 0
      not_file_name = os.path.join(
          REGISTER_PAGE_DIR, NOT_FOUND_REG_PAGE_SITES_FILENAME)
      not_file_dir = os.path.dirname(not_file_name)
      try:
        os.makedirs(not_file_dir)
      except OSError as e:
        if e.errno != errno.EEXIST:
          raise
      fnot = open(not_file_name, 'wb')
      try:
        for t in sorted(allThreads, key=lambda t: t._url):
          urls_no += 1
          if not t.page_found:
            urls_not_found_no += 1
            fnot.write('%s' % t._url)
            fnot.write(os.linesep)
      except IOError as e:
        self.logger.error('Error: %s', e)
      finally:
        fnot.close()
      self.logger.info('Total number of URLs given: %d\n', urls_no)
      self.logger.info(
          'Registration pages found: %d\n', (urls_no - urls_not_found_no))
      self.logger.info(
          'URLs that did not return a registration page: %d\n',
          urls_not_found_no)
      return urls_no - urls_not_found_no
    else:
      self.logger.error('Error: no URLs were found.')
      return -1


def main():
  usage = 'usage: %prog [options] single_url_or_urls_filename'
  parser = optparse.OptionParser(usage)
  parser.add_option(
      '-l', '--log_level', metavar='LOG_LEVEL', default='error',
      help='LOG_LEVEL: debug, info, warning or error [default: %default]')

  (options, args) = parser.parse_args()
  options.log_level = options.log_level.upper()
  if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
    print 'Wrong log_level argument.'
    parser.print_help()
    return 1
  options.log_level = getattr(logging, options.log_level)

  if len(args) != 1:
    parser.error('Wrong number of arguments.')

  logger = logging.getLogger(__name__)
  if options.log_level:
    console = logging.StreamHandler()
    logger.addHandler(console)
    logger.setLevel(options.log_level)

  arg_is_a_file = os.path.isfile(args[0])
  if arg_is_a_file:
    CrawlerClass = ThreadedCrawler
  else:
    CrawlerClass = Crawler
  t0 = datetime.datetime.now()
  c = CrawlerClass(args[0], options.log_level)
  c.Run()
  if not arg_is_a_file and c.url_error:
    logger.error(
        'ERROR: "%s" is neither a valid filename nor a valid URL', args[0])
  t1 = datetime.datetime.now()
  delta_t = t1 - t0
  logger.info('Started at: %s\n', t0)
  logger.info('Ended at: %s\n', t1)
  logger.info('Total execution time: %s\n', delta_t)
  return 0


if __name__ == '__main__':
  sys.exit(main())
769