# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
from itertools import groupby
from operator import itemgetter
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths
from path_util import AssertIsDirectory


Page = namedtuple('Page', 'status, links, anchors, anchor_refs')
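# |status| is the HTTP status code from rendering the page, |links| the hrefs
# of the links found on the page, |anchors| the ids and names defined on the
# page, and |anchor_refs| the links that contain an anchor component.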


def _SplitAnchor(url):
  '''Splits |url| into its path and fragment, e.g. 'foo.html#bar' yields
  ('foo.html', 'bar').
  '''
  components = urlsplit(url)
  return components.path, components.fragment


def _Process(path, renderer):
  '''Renders the page at |path| using |renderer| and processes its contents.
  Returns a |Page| namedtuple with fields for the HTTP status code of the
  render, the hrefs of all links found on the page, all anchors on the page
  (ids and names), and all links that contain an anchor component.

  If a non-html page renders successfully, a |Page| with status code 200 and
  all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  if '/' in path:
    base, _ = path.rsplit('/', 1)
  else:
    base = ''
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
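  # For example, on the page 'extensions/foo.html' the link
  # '../apps/bar.html#baz' normalizes to 'apps/bar.html#baz' and, since it
  # carries an anchor, is recorded as an anchor_ref rather than an edge.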
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)


class _ContentParser(HTMLParser):
  '''Parses an html file, pulling out all links (hrefs) and anchors (ids and
  names).
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.links = []
    self.anchors = set()

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'a':
      # Handle special-case hrefs that: start with a space (stripped), are
      # just a '.' (period), contain python templating code, are an absolute
      # url, are a zip file, or execute javascript on the page.
      href = attrs.get('href', '').strip()
      if href and href != '.' and '{{' not in href:
        if urlsplit(href).scheme not in ('http', 'https'):
          if not href.endswith('.zip') and 'javascript:' not in href:
            self.links.append(href)

    if attrs.get('id'):
      self.anchors.add(attrs['id'])
    if attrs.get('name'):
      self.anchors.add(attrs['name'])


class LinkErrorDetector(object):
  '''Finds link errors on the doc server. These include broken links, whose
  target page 404s or whose target anchor doesn't exist, and orphaned pages
  that no other page links to.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.
    '''
    AssertIsDirectory(public_path)
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
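    # Paths that are expected to redirect (e.g. to an index page), so links
    # to them are not reported as broken.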
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory, rendering each URL and
    processing the resulting html to pull out all links and anchors.
    '''
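    # Each entry is (file system directory, URL prefix its files are served
    # under).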
    top_level_directories = (
      ('docs/templates/public/', ''),
      ('docs/static/', 'static/'),
      ('docs/examples/', 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          print('%s, a url derived from the path %s, resulted in a %s' % (
              url, dirpath, self._pages[url].status))

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follows redirections until a page that does not redirect is reached.
    Starts at |starting_url|, which must return a 301 or 302 status code.

    Returns a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached, including |starting_url|. If no
    redirection occurred, returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

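    # Follow Location headers until a non-redirect status is seen, giving up
    # after more than |limit| redirections.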
    while status in (301, 302):
      if count > limit:
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Finds all broken links on a page and creates notes describing why they
    are broken (broken anchor, target redirects, etc.). |page| is the current
    page being checked and is the result of rendering |url|. |pages| is a
    callable that takes a path and returns a Page.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

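      # A link with an empty path refers to an anchor on the current page.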
      if components.path == '':
        if fragment == 'top' or fragment == '':
          continue
        if fragment not in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Look up the processed target page.
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if fragment not in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links

  def GetBrokenLinks(self):
    '''Finds all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of 4-tuples: (status, url, link, note).
    '''
    broken_links = []

    # Iterate over a copy of the keys (keys() returns a list in Python 2)
    # because the lambda below may insert new entries into the defaultdict
    # while we iterate.
    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages reachable from the |root_pages|.
    Returns the links that are valid on the server but are not part of the
    connected component containing the |root_pages|. These pages are orphans
    and cannot be reached simply by clicking through the server.
    '''
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

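    # Breadth-first traversal of the link graph; pages that do not render
    # directly are followed through their redirections.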
    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        redirected_page = self._FollowRedirections(item)[1]
        if redirected_page is not None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        url for url, page in self._pages.iteritems() if page.status == 200)

    return [url for url in all_urls - found if url.endswith('.html')]


def StringifyBrokenLinks(broken_links):
  '''Formats |broken_links| (as returned by GetBrokenLinks) into a readable
  string, grouping large runs of links that share a target.
  '''
  # Guard against calling max() on an empty sequence below.
  if not broken_links:
    return ''

  def fixed_width(string, width):
    return string.ljust(width)

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  by_target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return '%s -> %s %s' % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for target, links in groupby(sorted(broken_links, key=by_target), by_target):
    links = list(links)
    # Compress runs of more than 50 non-anchor links that share a target into
    # a single summary line.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = 'Found %d broken links (' % len(links)
      output.append('%s%s)' % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)
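
# A minimal usage sketch (the |file_system| and |renderer| objects are
# hypothetical stand-ins supplied by the docserver, not created here, and the
# root page URLs are illustrative):
#
#   detector = LinkErrorDetector(
#       file_system, renderer, 'docs/templates/public/',
#       root_pages=('extensions/index.html', 'apps/about_apps.html'))
#   print(StringifyBrokenLinks(detector.GetBrokenLinks()))
#   print('\n'.join(detector.GetOrphanedPages()))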