# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict, deque, namedtuple
from HTMLParser import HTMLParser, HTMLParseError
from itertools import groupby
from operator import itemgetter
import posixpath
from urlparse import urlsplit

from file_system_util import CreateURLsFromPaths
import svn_constants

Page = namedtuple('Page', 'status, links, anchors, anchor_refs')

def _SplitAnchor(url):
  components = urlsplit(url)
  return components.path, components.fragment

def _Process(path, renderer):
  '''Render the page at |path| using a |renderer| and process the contents of
  that page. Returns a |Page| namedtuple with fields for the http status code
  of the page render, the hrefs of all the links that occur on the page, all
  of the anchors on the page (ids and names), and all links that contain an
  anchor component.

  If a non-html page is rendered successfully, a |Page| with status code 200
  and all other fields empty is returned.
  '''
  parser = _ContentParser()
  response = renderer(path)

  if response.status != 200:
    return Page(response.status, (), (), ())
  if not path.endswith('.html'):
    return Page(200, (), (), ())

  try:
    parser.feed(str(response.content))
  except HTMLParseError:
    return Page(200, (), (), ())

  links, anchors = parser.links, parser.anchors
  base, _ = path.rsplit('/', 1)
  edges = []
  anchor_refs = []

  # Convert relative links to absolute links and categorize links as edges
  # or anchor_refs.
  for link in links:
    # Files like experimental_history.html are referred to with the URL
    # experimental.history.html.
    head, last = link.rsplit('/', 1) if '/' in link else ('', link)
    last, anchor = _SplitAnchor(last)

    if last.endswith('.html') and last.count('.') > 1:
      last = last.replace('.', '_', last.count('.') - 1)
      link = posixpath.join(head, last)
      if anchor:
        link = '%s#%s' % (link, anchor)

    if link.startswith('#'):
      anchor_refs.append(link)
    else:
      if link.startswith('/'):
        link = link[1:]
      else:
        link = posixpath.normpath('%s/%s' % (base, link))

      if '#' in link:
        anchor_refs.append(link)
      else:
        edges.append(link)

  return Page(200, edges, anchors, anchor_refs)

class _ContentParser(HTMLParser):
  '''Parse an html file, pulling out all links and anchor_refs, where an
  anchor_ref is a link that contains an anchor.
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.links = []
    self.anchors = set()

  def handle_starttag(self, tag, raw_attrs):
    attrs = dict(raw_attrs)

    if tag == 'a':
      # Ignore hrefs that: start with a space, contain just a '.' (period),
      # contain python templating code, are an absolute url, are a zip file,
      # or execute javascript on the page.
      href = attrs.get('href', '').strip()
      if href and href != '.' and '{{' not in href:
        if urlsplit(href).scheme not in ('http', 'https'):
          if not href.endswith('.zip') and 'javascript:' not in href:
            self.links.append(href)

    if attrs.get('id'):
      self.anchors.add(attrs['id'])
    if attrs.get('name'):
      self.anchors.add(attrs['name'])
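
# Illustrative sketch, not called anywhere in this module: how _ContentParser
# and _SplitAnchor behave on a small hand-written snippet (the markup and
# paths below are made up for demonstration).
#
#   parser = _ContentParser()
#   parser.feed('<a href="tabs.html#method-create">create</a>'
#               '<h2 id="overview">Overview</h2>')
#   parser.links    # ['tabs.html#method-create']
#   parser.anchors  # set(['overview'])
#
#   _SplitAnchor('tabs.html#method-create')
#   # -> ('tabs.html', 'method-create')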

class LinkErrorDetector(object):
  '''Finds link errors on the doc server. This includes broken links: links
  whose target page 404s or whose target anchor doesn't exist, as well as
  pages that have no links to them (orphans).
  '''

  def __init__(self, file_system, renderer, public_path, root_pages):
    '''Creates a new broken link detector. |renderer| is a callable that takes
    a path and returns a full html page. |public_path| is the path to public
    template files. All URLs in |root_pages| are used as the starting nodes
    for the orphaned page search.
    '''
    self._file_system = file_system
    self._renderer = renderer
    self._public_path = public_path
    self._pages = defaultdict(lambda: Page(404, (), (), ()))
    self._root_pages = frozenset(root_pages)
    self._always_detached = frozenset((
        'apps/404.html',
        'extensions/404.html',
        'apps/private_apis.html',
        'extensions/private_apis.html'))
    self._redirection_whitelist = frozenset(('extensions/', 'apps/'))

    self._RenderAllPages()

  def _RenderAllPages(self):
    '''Traverses the public templates directory, rendering each URL and
    processing the resulting html to pull out all links and anchors.
    '''
    top_level_directories = (
      (svn_constants.PUBLIC_TEMPLATE_PATH, ''),
      (svn_constants.STATIC_PATH, 'static/'),
      (svn_constants.EXAMPLES_PATH, 'extensions/examples/'),
    )

    for dirpath, urlprefix in top_level_directories:
      files = CreateURLsFromPaths(self._file_system, dirpath, urlprefix)
      for url, path in files:
        self._pages[url] = _Process(url, self._renderer)

        if self._pages[url].status != 200:
          print('%s, a url derived from the path %s, resulted in a %s' % (
              url, dirpath, self._pages[url].status))

  def _FollowRedirections(self, starting_url, limit=4):
    '''Follows redirections until a non-redirecting page is reached, starting
    at |starting_url|.

    Returns a tuple of: the status of rendering |starting_url|, the final url,
    and a list of the pages reached including |starting_url|. If no
    redirection occurred, or more than |limit| redirections were followed,
    returns (None, None, None).
    '''
    pages_reached = [starting_url]
    redirect_link = None
    target_page = self._renderer(starting_url)
    original_status = status = target_page.status
    count = 0

    while status in (301, 302):
      if count > limit:
        return None, None, None
      redirect_link = target_page.headers.get('Location')
      target_page = self._renderer(redirect_link)
      status = target_page.status
      pages_reached.append(redirect_link)
      count += 1

    if redirect_link is None:
      return None, None, None

    return original_status, redirect_link, pages_reached
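
  # Illustrative sketch of what _FollowRedirections returns (hypothetical:
  # FakeResponse and the paths below are made up, and |detector| stands for a
  # LinkErrorDetector whose |renderer| serves these responses):
  #
  #   FakeResponse = namedtuple('FakeResponse', 'status, headers, content')
  #   responses = {
  #     'old.html': FakeResponse(301, {'Location': 'new.html'}, ''),
  #     'new.html': FakeResponse(200, {}, '<html></html>'),
  #   }
  #   detector._FollowRedirections('old.html')
  #   # -> (301, 'new.html', ['old.html', 'new.html'])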

  def _CategorizeBrokenLinks(self, url, page, pages):
    '''Finds all broken links on a page and creates appropriate notes
    describing why they are broken (broken anchor, target redirects, etc).
    |page| is the current page being checked and is the result of rendering
    |url|. |pages| is a callable that takes a path and returns a Page.
    '''
    broken_links = []

    for link in page.links + page.anchor_refs:
      components = urlsplit(link)
      fragment = components.fragment

      if components.path == '':
        if fragment == 'top' or fragment == '':
          continue
        if fragment not in page.anchors:
          broken_links.append((200, url, link, 'target anchor not found'))
      else:
        # Render the target page.
        target_page = pages(components.path)

        if target_page.status != 200:
          if components.path in self._redirection_whitelist:
            continue

          status, relink, _ = self._FollowRedirections(components.path)
          if relink:
            broken_links.append((
                status,
                url,
                link,
                'redirects to %s' % relink))
          else:
            broken_links.append((
                target_page.status, url, link, 'target page not found'))

        elif fragment:
          if fragment not in target_page.anchors:
            broken_links.append((
                target_page.status, url, link, 'target anchor not found'))

    return broken_links

  def GetBrokenLinks(self):
    '''Finds all broken links. A broken link is a link that leads to a page
    that does not exist (404s), redirects to another page (301 or 302), or
    has an anchor whose target does not exist.

    Returns a list of tuples of four elements: status, url, target_page,
    notes.
    '''
    broken_links = []

    for url in self._pages.keys():
      page = self._pages[url]
      if page.status != 200:
        continue
      broken_links.extend(self._CategorizeBrokenLinks(
          url, page, lambda x: self._pages[x]))

    return broken_links

  def GetOrphanedPages(self):
    '''Crawls the server to find all pages that are connected to the
    |root_pages|. Returns the links that are valid on the server but are not
    part of the connected component containing the |root_pages|. These pages
    are orphans and cannot be reached simply by clicking through the server.
    '''
    pages_to_check = deque(self._root_pages.union(self._always_detached))
    found = set(self._root_pages) | self._always_detached

    while pages_to_check:
      item = pages_to_check.popleft()
      target_page = self._pages[item]

      if target_page.status != 200:
        redirected_page = self._FollowRedirections(item)[1]
        if redirected_page is not None:
          target_page = self._pages[redirected_page]

      for link in target_page.links:
        if link not in found:
          found.add(link)
          pages_to_check.append(link)

    all_urls = set(
        url for url, page in self._pages.iteritems() if page.status == 200)

    return [url for url in all_urls - found if url.endswith('.html')]
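
# Illustrative sketch: GetBrokenLinks produces (status, url, target, notes)
# tuples, which feed directly into StringifyBrokenLinks below. The sample
# tuples here are made up.
#
#   broken = [
#     (404, 'apps/about_apps.html', 'apps/missing.html', 'target page not found'),
#     (200, 'extensions/tabs.html', '#does-not-exist', 'target anchor not found'),
#   ]
#   print StringifyBrokenLinks(broken)
#   # extensions/tabs.html -> #does-not-exist   target anchor not found
#   # apps/about_apps.html -> apps/missing.html target page not found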

def StringifyBrokenLinks(broken_links):
  '''Converts a list of broken links into a readable, aligned string.
  '''
  def fixed_width(string, width):
    return "%s%s" % (string, (width - len(string)) * ' ')

  first_col_width = max(len(link[1]) for link in broken_links)
  second_col_width = max(len(link[2]) for link in broken_links)
  by_target = itemgetter(2)
  output = []

  def pretty_print(link, col_offset=0):
    return "%s -> %s %s" % (
        fixed_width(link[1], first_col_width - col_offset),
        fixed_width(link[2], second_col_width),
        link[3])

  for target, links in groupby(sorted(broken_links, key=by_target), by_target):
    links = list(links)
    # Compress messages: collapse runs of more than 50 links that share a
    # non-anchor target into a single line.
    if len(links) > 50 and not links[0][2].startswith('#'):
      message = "Found %d broken links (" % len(links)
      output.append("%s%s)" % (message, pretty_print(links[0], len(message))))
    else:
      for link in links:
        output.append(pretty_print(link))

  return '\n'.join(output)
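
# Illustrative end-to-end usage (hypothetical: |file_system| and |renderer|
# come from the doc server and are not constructed here, and the root pages
# below are assumptions, not a canonical list):
#
#   detector = LinkErrorDetector(
#       file_system, renderer, svn_constants.PUBLIC_TEMPLATE_PATH,
#       root_pages=('extensions/index.html', 'apps/about_apps.html'))
#   print StringifyBrokenLinks(detector.GetBrokenLinks())
#   for orphan in detector.GetOrphanedPages():
#     print orphan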