# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
4ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch
5ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdochfrom collections import defaultdict, deque, namedtuple
6ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdochfrom HTMLParser import HTMLParser, HTMLParseError
7bb1529ce867d8845a77ec7cdf3e3003ef1771a40Ben Murdochfrom itertools import groupby
8bb1529ce867d8845a77ec7cdf3e3003ef1771a40Ben Murdochfrom operator import itemgetter
9ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdochimport posixpath
10ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdochfrom urlparse import urlsplit
11ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch
12ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdochfrom file_system_util import CreateURLsFromPaths
135d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)from path_util import AssertIsDirectory
14f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
15ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch
# Result of rendering and processing one URL:
#   status      - http status code of the render.
#   links       - hrefs on the page that carry no anchor component.
#   anchors     - ids and names defined on the page.
#   anchor_refs - hrefs on the page that contain an anchor component.
Page = namedtuple('Page', 'status, links, anchors, anchor_refs')
17ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch
18f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
def _SplitAnchor(url):
  '''Splits |url| into a (path, fragment) tuple, where the fragment is the
  part after '#'. Either element is the empty string when |url| has no such
  component.
  '''
  components = urlsplit(url)
  return components.path, components.fragment
22ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch
23f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the href of all the links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component. Links are made relative to the server root (no leading
  '/'), except for same-page anchor links which are kept as '#anchor'.

  If the render does not return status 200, a |Page| with that status and all
  other fields empty is returned. If a non-html page is properly rendered, or
  an html page cannot be parsed, a |Page| with status code 200 and all other
  fields empty is returned.
  '''
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  parser = _ContentParser()
  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    return Page(200, (), (), ())

  # Directory part of |path|; empty for pages that live at the server root.
  base = path.rsplit('/', 1)[0] if '/' in path else ''
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in parser.links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      # Replace every '.' except the one in front of the extension.
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      anchor_refs.append(link)
      continue

    if link.startswith('/'):
      link = link[1:]
    else:
      # posixpath.join() (unlike '%s/%s') does not emit a leading '/' when
      # |base| is empty, so links on root-level pages still match the
      # slash-less URLs used everywhere else.
      link = posixpath.normpath(posixpath.join(base, link))

    if '#' in link:
      anchor_refs.append(link)
    else:
      edges.append(link)

  return Page(200, edges, parser.anchors, anchor_refs)
83ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch
84f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
class _ContentParser(HTMLParser):
  '''Parse an html file pulling out all local links and anchors. After
  feeding, |links| is the list of hrefs (in document order) that should be
  checked against the server and |anchors| is the set of every non-empty id
  and name attribute value seen on any tag.
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.links = []
    self.anchors = set()

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'a':
      # A valueless attribute (<a href>) is reported with a value of None, so
      # fall back to '' before stripping surrounding whitespace.
      href = (attrs.get('href') or '').strip()
      if self._IsCheckableLink(href):
        self.links.append(href)

    for attr_name in ('id', 'name'):
      if attrs.get(attr_name):
        self.anchors.add(attrs[attr_name])

  @staticmethod
  def _IsCheckableLink(href):
    '''Returns True if |href| should be recorded as a link into the server.
    Skipped special cases are href's that: are empty, contain just a '.'
    (period), contain python templating code, are an absolute or
    protocol-relative url, are a mailto link, are a zip file, or execute
    javascript on the page.
    '''
    if not href or href == '.' or '{{' in href:
      return False
    components = urlsplit(href)
    if components.scheme in ('http', 'https', 'mailto') or components.netloc:
      return False
    return not href.endswith('.zip') and 'javascript:' not in href
112ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch
113f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links, those with
  a target page that 404s or contain an anchor that doesn't exist, or pages that
  have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a response with |status|, |content| and |headers|.
    |public_path| is the path to public template files and must be a
    directory path. All URLs in |root_pages| are used as the starting nodes
    for the orphaned page search.

    Every page is rendered and processed during construction.
    '''
    AssertIsDirectory(public_path)
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    # URLs that were never rendered are treated as 404s on lookup. Note that
    # a lookup of a missing URL also stores that 404 entry.
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    # Pages that are never reported as orphans.
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    # Link targets that may render with a non-200 status without being
    # reported as broken.
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory rendering each URL and
    processing the resultant html to pull out all links and anchors.
    '''
    # Pairs of (directory to traverse, prefix of the URLs served from it).
    top_level_directories = (
      ('docs/templates/public/', ''),
      ('docs/static/', 'static/'),
      ('docs/examples/', 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, _ in files:
        page = _Process(url, self._renderer)
        self._pages[url] = page

        if page.status != 200:
          # Format a single string so that the output is a readable sentence
          # whether print is a statement or a function.
          print('%s, a url derived from the path %s, resulted in a %s' % (
              url, dirpath, page.status))

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follow redirection until a non-redirectable page is reached. Start at
    |starting_url| which must return a 301 or 302 status code.

    Return a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached including |starting_url|. If no redirection
    occurred, or the chain is still redirecting after |limit| + 1 hops,
    returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      if count > limit:
        return None, None, None
      next_link = target_page.headers.get('Location')
      if next_link is None:
        # A redirect without a Location header cannot be followed any further.
        break
      redirect_link = next_link
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Find all broken links on a page and create appropriate notes describing
    why they are broken (broken anchor, target redirects, etc). |page| is the
    current page being checked and is the result of rendering |url|. |pages|
    is a callable that takes a path and returns a Page.

    Returns a list of (status, url, link, notes) tuples.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        # A link within the current page; '#top' and '#' are always valid.
        if fragment == 'top' or fragment == '':
          continue
        if fragment not in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
        continue

      target_page = pages(components.path)

      if target_page.status != 200:
        if components.path in self._redirection_whitelist:
          continue

        status, relink, _ = self._FollowRedirections(components.path)
        if relink:
          broken_links.append((
              status, url, link, 'redirects to %s' % relink))
        else:
          broken_links.append((
              target_page.status, url, link, 'target page not found'))
      elif fragment and fragment not in target_page.anchors:
        broken_links.append((
            target_page.status, url, link, 'target anchor not found'))

    return broken_links

  def GetBrokenLinks(self):
    '''Find all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []
    lookup = lambda path: self._pages[path]

    # Iterate over a snapshot: looking up an unknown target in the defaultdict
    # adds an entry to |self._pages| while the loop is running.
    for url, page in list(self._pages.items()):
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(url, page, lookup))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages that are connected to the
    |root_pages|. Returns the html pages that are valid on the server but are
    not part of the connected component containing the |root_pages|. These
    pages are orphans and cannot be reached simply by clicking through the
    server. Pages that are always detached are never reported.
    '''
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        # Continue the crawl from wherever the page redirects to, if anywhere.
        redirected_page = self._FollowRedirections(item)[1]
        if redirected_page is not None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        url for url, page in self._pages.items() if page.status == 200)

    return [url for url in all_urls - found if url.endswith('.html')]
277bb1529ce867d8845a77ec7cdf3e3003ef1771a40Ben Murdoch
278f2477e01787aa58f445919b809d89e252beef54fTorne (Richard Coles)
def StringifyBrokenLinks(broken_links):
  '''Formats broken links in a more readable, column-aligned form.

  |broken_links| is an iterable of (status, url, target, notes) tuples. The
  result has one "url -> target notes" line per link, grouped by target. When
  more than 50 links share a target that is not a same-page anchor, they are
  compressed into a single summary line showing only the first of them.
  Returns the empty string when there are no broken links.
  '''
  broken_links = list(broken_links)
  if not broken_links:
    # max() below would raise on an empty sequence.
    return ''

  url_width = max(len(link[1]) for link in broken_links)
  target_width = max(len(link[2]) for link in broken_links)
  by_target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    # |col_offset| is the width of any text already printed in front of the
    # url, so that the '->' column still lines up where possible.
    return '%s -> %s %s' % (
        link[1].ljust(url_width - col_offset),
        link[2].ljust(target_width),
        link[3])

  # groupby() only groups adjacent items, hence the sort on the same key.
  for _, group in groupby(sorted(broken_links, key=by_target), by_target):
    links = list(group)
    # Compress messages
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = 'Found %d broken links (' % len(links)
      output.append('%s%s)' % (message, pretty_print(links[0], len(message))))
    else:
      output.extend(pretty_print(link) for link in links)

  return '\n'.join(output)
307