1# Copyright 2013 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5from collections import defaultdict, deque, namedtuple
6from HTMLParser import HTMLParser, HTMLParseError
7from itertools import groupby
8from operator import itemgetter
9import posixpath
10from urlparse import urlsplit
11
12from file_system_util import CreateURLsFromPaths
13import svn_constants
14
15Page = namedtuple('Page', 'status, links, anchors, anchor_refs')
16
17def _SplitAnchor(url):
18  components = urlsplit(url)
19  return components.path, components.fragment
20
21def _Process(path, renderer):
22  '''Render the page at |path| using a |renderer| and process the contents of
23  that page. Returns a |Page| namedtuple with fields for the http status code
24  of the page render, the href of all the links that occurred on the page, all
25  of the anchors on the page (ids and names), and all links that contain an
26  anchor component.
27
28  If a non-html page is properly rendered, a |Page| with status code 200 and
29  all other fields empty is returned.
30  '''
31  parser = _ContentParser()
32  response = renderer(path)
33
34  if response.status != 200:
35    return Page(response.status, (), (), ())
36  if not path.endswith('.html'):
37    return Page(200, (), (), ())
38
39  try:
40    parser.feed(str(response.content))
41  except HTMLParseError:
42    return Page(200, (), (), ())
43
44  links, anchors = parser.links, parser.anchors
45  base, _ = path.rsplit('/', 1)
46  edges = []
47  anchor_refs = []
48
49  # Convert relative links to absolute links and categorize links as edges
50  # or anchor_refs.
51  for link in links:
52    # Files like experimental_history.html are refered to with the URL
53    # experimental.history.html.
54    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
55    last, anchor = _SplitAnchor(last)
56
57    if last.endswith('.html') and last.count('.') > 1:
58      last = last.replace('.', '_', last.count('.') - 1)
59      link = posixpath.join(head, last)
60      if anchor:
61        link = '%s#%s' % (link, anchor)
62
63    if link.startswith('#'):
64      anchor_refs.append(link)
65    else:
66      if link.startswith('/'):
67        link = link[1:]
68      else:
69        link = posixpath.normpath('%s/%s' % (base, link))
70
71      if '#' in link:
72        anchor_refs.append(link)
73      else:
74        edges.append(link)
75
76  return Page(200, edges, anchors, anchor_refs)
77
78class _ContentParser(HTMLParser):
79  '''Parse an html file pulling out all links and anchor_refs, where an
80  anchor_ref is a link that contains an anchor.
81  '''
82
83  def __init__(self):
84    HTMLParser.__init__(self)
85    self.links = []
86    self.anchors = set()
87
88  def handle_starttag(self, tag, raw_attrs):
89    attrs = dict(raw_attrs)
90
91    if tag == 'a':
92      # Handle special cases for href's that: start with a space, contain
93      # just a '.' (period), contain python templating code, are an absolute
94      # url, are a zip file, or execute javascript on the page.
95      href = attrs.get('href', '').strip()
96      if href and not href == '.' and not '{{' in href:
97        if not urlsplit(href).scheme in ('http', 'https'):
98          if not href.endswith('.zip') and not 'javascript:' in href:
99            self.links.append(href)
100
101    if attrs.get('id'):
102      self.anchors.add(attrs['id'])
103    if attrs.get('name'):
104      self.anchors.add(attrs['name'])
105
106class LinkErrorDetector(object):
107  '''Finds link errors on the doc server. This includes broken links, those with
108  a target page that 404s or contain an anchor that doesn't exist, or pages that
109  have no links to them.
110  '''
111
112  def __init__(self, file_system, renderer, public_path, root_pages):
113    '''Creates a new broken link detector. |renderer| is a callable that takes
114    a path and returns a full html page. |public_path| is the path to public
115    template files. All URLs in |root_pages| are used as the starting nodes for
116    the orphaned page search.
117    '''
118    self._file_system = file_system
119    self._renderer = renderer
120    self._public_path = public_path
121    self._pages = defaultdict(lambda: Page(404, (), (), ()))
122    self._root_pages = frozenset(root_pages)
123    self._always_detached = frozenset((
124        'apps/404.html',
125        'extensions/404.html',
126        'apps/private_apis.html',
127        'extensions/private_apis.html'))
128    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))
129
130    self._RenderAllPages()
131
132  def _RenderAllPages(self):
133    '''Traverses the public templates directory rendering each URL and
134    processing the resultant html to pull out all links and anchors.
135    '''
136    top_level_directories = (
137      (svn_constants.PUBLIC_TEMPLATE_PATH, ''),
138      (svn_constants.STATIC_PATH, 'static/'),
139      (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),
140    )
141
142    for dirpath, urlprefix in top_level_directories:
143      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
144      for url, path in files:
145        self._pages[url] = _Process(url, self._renderer)
146
147        if self._pages[url].status != 200:
148          print(url, ', a url derived from the path', dirpath +
149              ', resulted in a', self._pages[url].status)
150
151  def _FollowRedirections(self, starting_url, limit=4):
152    '''Follow redirection until a non-redirectable page is reached. Start at
153    |starting_url| which must return a 301 or 302 status code.
154
155    Return a tuple of: the status of rendering |staring_url|, the final url,
156    and a list of the pages reached including |starting_url|. If no redirection
157    occurred, returns (None, None, None).
158    '''
159    pages_reached = [starting_url]
160    redirect_link = None
161    target_page = self._renderer(starting_url)
162    original_status = status = target_page.status
163    count = 0
164
165    while status in (301, 302):
166      if count > limit:
167        return None, None, None
168      redirect_link = target_page.headers.get('Location')
169      target_page = self._renderer(redirect_link)
170      status = target_page.status
171      pages_reached.append(redirect_link)
172      count += 1
173
174    if redirect_link is None:
175      return None, None, None
176
177    return original_status, redirect_link, pages_reached
178
179  def _CategorizeBrokenLinks(self, url, page, pages):
180    '''Find all broken links on a page and create appropriate notes describing
181    why tehy are broken (broken anchor, target redirects, etc). |page| is the
182    current page being checked and is the result of rendering |url|. |pages|
183    is a callable that takes a path and returns a Page.
184    '''
185    broken_links = []
186
187    for link in page.links + page.anchor_refs:
188      components = urlsplit(link)
189      fragment = components.fragment
190
191      if components.path == '':
192        if fragment == 'top' or fragment == '':
193          continue
194        if not fragment in page.anchors:
195          broken_links.append((200, url, link, 'target anchor not found'))
196      else:
197        # Render the target page
198        target_page = pages(components.path)
199
200        if target_page.status != 200:
201          if components.path in self._redirection_whitelist:
202            continue
203
204          status, relink, _ = self._FollowRedirections(components.path)
205          if relink:
206            broken_links.append((
207                status,
208                url,
209                link,
210                'redirects to %s' % relink))
211          else:
212            broken_links.append((
213                target_page.status, url, link, 'target page not found'))
214
215        elif fragment:
216          if not fragment in target_page.anchors:
217            broken_links.append((
218                target_page.status, url, link, 'target anchor not found'))
219
220    return broken_links
221
222  def GetBrokenLinks(self):
223    '''Find all broken links. A broken link is a link that leads to a page
224    that does not exist (404s), redirects to another page (301 or 302), or
225    has an anchor whose target does not exist.
226
227    Returns a list of tuples of four elements: status, url, target_page,
228    notes.
229    '''
230    broken_links = []
231
232    for url in self._pages.keys():
233      page = self._pages[url]
234      if page.status != 200:
235        continue
236      broken_links.extend(self._CategorizeBrokenLinks(
237          url, page, lambda x: self._pages[x]))
238
239    return broken_links
240
241  def GetOrphanedPages(self):
242    '''Crawls the server find all pages that are connected to the pages at
243    |seed_url|s. Return the links that are valid on the server but are not in
244    part of the connected component containing the |root_pages|. These pages
245    are orphans and cannot be reached simply by clicking through the server.
246    '''
247    pages_to_check = deque(self._root_pages.union(self._always_detached))
248    found = set(self._root_pages) | self._always_detached
249
250    while pages_to_check:
251      item = pages_to_check.popleft()
252      target_page = self._pages[item]
253
254      if target_page.status != 200:
255        redirected_page = self._FollowRedirections(item)[1]
256        if not redirected_page is None:
257          target_page = self._pages[redirected_page]
258
259      for link in target_page.links:
260        if link not in found:
261          found.add(link)
262          pages_to_check.append(link)
263
264    all_urls = set(
265        [url for url, page in self._pages.iteritems() if page.status == 200])
266
267    return [url for url in all_urls - found if url.endswith('.html')]
268
269def StringifyBrokenLinks(broken_links):
270  '''Prints out broken links in a more readable format.
271  '''
272  def fixed_width(string, width):
273    return "%s%s" % (string, (width - len(string)) * ' ')
274
275  first_col_width = max(len(link[1]) for link in broken_links)
276  second_col_width = max(len(link[2]) for link in broken_links)
277  target = itemgetter(2)
278  output = []
279
280  def pretty_print(link, col_offset=0):
281    return "%s -> %s %s" % (
282        fixed_width(link[1], first_col_width - col_offset),
283        fixed_width(link[2], second_col_width),
284        link[3])
285
286  for target, links in groupby(sorted(broken_links, key=target), target):
287    links = list(links)
288    # Compress messages
289    if len(links) > 50 and not links[0][2].startswith('#'):
290      message = "Found %d broken links (" % len(links)
291      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
292    else:
293      for link in links:
294        output.append(pretty_print(link))
295
296  return '\n'.join(output)
297