# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
from itertools import groupby
from operator import itemgetter
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths
from path_util import AssertIsDirectory


# Result of rendering and scanning one page:
#   status      - http status code of the render.
#   links       - hrefs on the page that carry no anchor component.
#   anchors     - every id/name attribute value found on the page.
#   anchor_refs - hrefs on the page that do carry an anchor component.
Page = namedtuple('Page', 'status, links, anchors, anchor_refs')


def _SplitAnchor(url):
  '''Returns a (path, fragment) tuple for |url|, as computed by urlsplit.
  '''
  components = urlsplit(url)
  return components.path, components.fragment


def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the href of all the links that occurred on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is properly rendered, a |Page| with status code 200 and
  all other fields empty is returned. The same empty 200 |Page| is returned
  when the html cannot be parsed.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  # |base| is the directory part of |path|; empty for top-level pages.
  if '/' in path:
    base, _ = path.rsplit('/', 1)
  else:
    base = ''
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      # Replace every '.' except the one before the extension with '_'.
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        # posixpath.join (rather than '%s/%s') keeps the result free of a
        # leading '/' when |base| is empty, matching the slash-less form
        # produced for absolute links just above.
        link = posixpath.normpath(posixpath.join(base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)


class _ContentParser(HTMLParser):
  '''Parse an html file pulling out all links and anchors. Links are collected
  from the href of <a> tags; anchors are the id and name attribute values of
  any tag.
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.links = []
    self.anchors = set()

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'a':
      # Handle special cases for href's that: start with a space, contain
      # just a '.' (period), contain python templating code, are an absolute
      # url, are a zip file, or execute javascript on the page.
      # A valueless attribute (<a href>) is reported with a value of None, so
      # fall back to '' before stripping.
      href = (attrs.get('href') or '').strip()
      if href and href != '.' and '{{' not in href:
        if urlsplit(href).scheme not in ('http', 'https'):
          if not href.endswith('.zip') and 'javascript:' not in href:
            self.links.append(href)

    if attrs.get('id'):
      self.anchors.add(attrs['id'])
    if attrs.get('name'):
      self.anchors.add(attrs['name'])


class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links, those with
  a target page that 404s or contain an anchor that doesn't exist, or pages that
  have no links to them.
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes for
    the orphaned page search.

    Every page is rendered as part of construction.
    '''
    AssertIsDirectory(public_path)
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    # Looking up a URL that was never rendered yields (and stores) a 404 Page.
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    # Pages that are treated as found by the orphaned page search even when
    # nothing links to them.
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    # Link targets that are not reported even though they do not render as a
    # 200.
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory rendering each URL and
    processing the resultant html to pull out all links and anchors.
    '''
    top_level_directories = (
      ('docs/templates/public/', ''),
      ('docs/static/', 'static/'),
      ('docs/examples/', 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, _ in files:
        page = _Process(url, self._renderer)
        self._pages[url] = page

        if page.status != 200:
          # Format a single string: passing several arguments to a
          # parenthesised print would emit a tuple repr under Python 2.
          print('%s, a url derived from the path %s, resulted in a %s' %
                (url, dirpath, page.status))

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follow redirection until a non-redirectable page is reached. Start at
    |starting_url| which must return a 301 or 302 status code.

    Return a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached including |starting_url|. If no redirection
    occurred, if a redirect carries no Location header, or if the redirect
    chain exceeds |limit|, returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      if count > limit:
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      if not redirect_link:
        # A redirect without a target cannot be followed; do not hand None
        # to the renderer.
        return None, None, None
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Find all broken links on a page and create appropriate notes describing
    why they are broken (broken anchor, target redirects, etc). |page| is the
    current page being checked and is the result of rendering |url|. |pages|
    is a callable that takes a path and returns a Page.

    Returns a list of (status, url, link, notes) tuples.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        # A link within the current page: only the anchor can be wrong.
        if fragment == 'top' or fragment == '':
          continue
        if fragment not in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if fragment not in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links

  def GetBrokenLinks(self):
    '''Find all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []

    # Iterate over a snapshot: looking up an unknown link target below adds a
    # default 404 entry to |self._pages| while this loop is running.
    for url, page in list(self._pages.items()):
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages that are connected to the
    |root_pages|. Return the links that are valid on the server but are not
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

    # Breadth-first traversal over the links of every reachable page.
    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        # Continue the crawl from wherever the page redirects to, if anywhere.
        redirected_page = self._FollowRedirections(item)[1]
        if redirected_page is not None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        [url for url, page in self._pages.items() if page.status == 200])

    return [url for url in all_urls - found if url.endswith('.html')]


def StringifyBrokenLinks(broken_links):
  '''Formats |broken_links| in a more readable, column-aligned form and
  returns the result as a single newline-joined string.

  |broken_links| is an iterable of (status, url, target, notes) tuples. Entries
  are grouped by target; a group of more than 50 entries whose target is not a
  same-page anchor is compressed into a single summary line showing the first
  entry. Returns an empty string when there are no broken links.
  '''
  # Materialise once: the input is traversed several times below.
  broken_links = list(broken_links)
  if not broken_links:
    return ''

  def fixed_width(string, width):
    # Pads with spaces on the right; strings already wider than |width| are
    # returned unchanged.
    return string.ljust(width)

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  target_of = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  # groupby only merges adjacent items, so sort by the same key first.
  for _, group in groupby(sorted(broken_links, key=target_of), target_of):
    links = list(group)
    # Compress messages
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      output.extend(pretty_print(link) for link in links)

  return '\n'.join(output)