// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Local modifications to this file are described in the README.chromium
// file.

/**
 * Debug logger. Writes the message to the console with a "Readability: "
 * prefix when a console object exists; otherwise it is a no-op.
 *
 * @param s message to log
 * @return void
 **/
var dbg = (typeof console !== 'undefined') ? function(s) {
    console.log("Readability: " + s);
} : function() {};

/*
 * Readability. An Arc90 Lab Experiment.
 * Website: http://lab.arc90.com/experiments/readability
 * Source:  http://code.google.com/p/arc90labs-readability
 *
 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
 *
 * Copyright (c) 2010 Arc90 Inc
 * Readability is licensed under the Apache License, Version 2.0.
**/
21a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)**/ 22a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)var readability = { 23a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readStyle: "style-newspaper", 24a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readSize: "size-medium", 25a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readMargin: "margin-wide", 26a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 27a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) distilledHTML: '', 28a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) distilledArticleContent: null, 29a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) nextPageLink: '', 30a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 31a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) version: '1.7.1', 32a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) iframeLoads: 0, 33a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) convertLinksToFootnotes: false, 34a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) reversePageScroll: false, /* If they hold shift and hit space, scroll up */ 35a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) frameHack: false, /** 36a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * The frame hack is to workaround a firefox bug where if you 37a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * pull content out of a frame and stick it into the parent element, the scrollbar won't appear. 38a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * So we fake a scrollbar in the wrapping div. 39a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 40a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) biggestFrame: false, 41a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. 
*/ 42a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 43a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* constants */ 44a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) FLAG_STRIP_UNLIKELYS: 0x1, 45a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) FLAG_WEIGHT_CLASSES: 0x2, 46a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) FLAG_CLEAN_CONDITIONALLY: 0x4, 47a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 48a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ 49a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ 50a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ 51a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 52a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 53a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * All of the regular expressions in use within readability. 54a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Defined up here so we don't instantiate them repeatedly in loops. 
55a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 56a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) regexps: { 57a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i, 58a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) okMaybeItsACandidate: /and|article|body|column|main|shadow/i, 59a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, 60a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, 61a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i, 62a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, 63a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, 64a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) replaceFonts: /<(\/?)font[^>]*>/gi, 65a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) trim: /^\s+|\s+$/g, 66a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) normalize: /\s{2,}/g, 67a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) killBreaks: /(<br\s*\/?>(\s| ?)*){1,}/g, 68a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, 69a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, 70a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) nextLink: 
/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last. 71a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) prevLink: /(prev|earl|old|new|<|«)/i 72a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 73a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 74a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 75a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Runs readability. 76a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 77a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Workflow: 78a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 1. Prep the document by removing script tags, css, etc. 79a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 2. Build readability's DOM tree. 80a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 3. Grab the article content from the current dom tree. 81a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 4. Replace the current DOM tree with the new one. 82a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 5. Read peacefully. 83a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 84a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return void 85a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 86a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) init: function() { 87a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Before we do anything, remove all scripts that are not readability. 
*/ 88a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) window.onload = window.onunload = function() {}; 89a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 90a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.removeScripts(document); 91a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 92a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */ 93a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; 94a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 95a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Pull out any possible next page link first */ 96a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.nextPageLink = readability.findNextPageLink(document.body); 97a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 98a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* We handle processing of nextPage from C++ set nextPageLink to null */ 99a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var nextPageLink = null; 100a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 101a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.prepDocument(); 102a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 103a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Build readability's DOM tree */ 104a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var overlay = document.createElement("DIV"); 105a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var innerDiv = document.createElement("DIV"); 106a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var articleTools = readability.getArticleTools(); 107a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var 
articleTitleText = readability.getArticleTitle(); 108a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var articleContent = readability.grabArticle(); 109a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 110a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!articleContent) { 111a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articleContent = document.createElement("DIV"); 112a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articleContent.id = "readability-content"; 113a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articleContent.innerHTML = [ 114a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p>", 115a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) (readability.frameHack ? "<p><strong>It appears this page uses frames.</strong> Unfortunately, browser security properties often cause Readability to fail on pages that include frames." : ""), 116a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) "<p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. 
If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>" 117a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) ].join(''); 118a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 119a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) nextPageLink = null; 120a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 121a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 122a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) overlay.id = "readOverlay"; 123a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) innerDiv.id = "readInner"; 124a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 125a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Apply user-selected styling */ 126a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) document.body.className = readability.readStyle; 127a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) document.dir = readability.getSuggestedDirection(articleTitleText); 128a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 129a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (readability.readStyle === "style-athelas" || readability.readStyle === "style-apertura"){ 130a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) overlay.className = readability.readStyle + " rdbTypekit"; 131a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } else { 132a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) overlay.className = readability.readStyle; 133a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 134a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) innerDiv.className = readability.readMargin + " " + readability.readSize; 135a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 136a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 
if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) { 137a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.convertLinksToFootnotes = true; 138a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 139a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 140a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.distilledHTML = articleContent.innerHTML; 141a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 142a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(readability.frameHack) { 143a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var readOverlay = document.getElementById('readOverlay'); 144a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readOverlay.style.height = '100%'; 145a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readOverlay.style.overflow = 'auto'; 146a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 147a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 148a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 149a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * If someone tries to use Readability on a site's root page, give them a warning about usage. 
150a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 151a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if((window.location.protocol + "//" + window.location.host + "/") === window.location.href) { 152a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articleContent.style.display = "none"; 153a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var rootWarning = document.createElement('p'); 154a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) rootWarning.id = "readability-warning"; 155a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " + 156a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue."; 157a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 158a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) innerDiv.insertBefore( rootWarning, articleContent ); 159a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 160a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 161a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.postProcessContent(articleContent); 162a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 163a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) window.scrollTo(0, 0); 164a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 165a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (nextPageLink) { 166a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 167a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Append any additional pages after a small timeout so that people 
168a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * can start reading without having to wait for this to finish processing. 169a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 170a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) window.setTimeout(function() { 171a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.appendNextPage(nextPageLink); 172a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 500); 173a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 174a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 175a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** Smooth scrolling **/ 176a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) document.onkeydown = function(e) { 177a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var code = (window.event) ? event.keyCode : e.keyCode; 178a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (code === 16) { 179a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.reversePageScroll = true; 180a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return; 181a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 182a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 183a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (code === 32) { 184a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.curScrollStep = 0; 185a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? 
document.documentElement.clientHeight : document.body.clientHeight); 186a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 187a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(readability.reversePageScroll) { 188a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); 189a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 190a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else { 191a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); 192a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 193a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 194a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return false; 195a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 196a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }; 197a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 198a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) document.onkeyup = function(e) { 199a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var code = (window.event) ? 
event.keyCode : e.keyCode; 200a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (code === 16) { 201a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.reversePageScroll = false; 202a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return; 203a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 204a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }; 205a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 206a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 207a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 208a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Run any post-process modifications to article content as necessary. 209a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 210a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param Element 211a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return void 212a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 213a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) postProcessContent: function(articleContent) { 214a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) { 215a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.addFootnotes(articleContent); 216a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 217a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 218a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.fixImageFloats(articleContent); 219a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 220a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 221a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 222a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Some content ends up looking ugly if the image is too 
large to be floated. 223a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * If the image is wider than a threshold (currently 55%), no longer float it, 224a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * center it instead. 225a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 226a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param Element 227a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return void 228a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 229a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) fixImageFloats: function (articleContent) { 230a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55, 231a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) images = articleContent.getElementsByTagName('img'); 232a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 233a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(var i=0, il = images.length; i < il; i+=1) { 234a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var image = images[i]; 235a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 236a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(image.offsetWidth > imageWidthThreshold) { 237a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) image.className += " blockImage"; 238a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 239a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 240a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 241a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 242a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 243a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Get the article tools Element that has buttons like reload, print. 
244a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 245a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return void 246a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 247a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) getArticleTools: function () { 248a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var articleTools = document.createElement("DIV"); 249a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 250a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articleTools.id = "readTools"; 251a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articleTools.innerHTML = 252a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" + 253a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" + 254a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>"; 255a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 256a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return articleTools; 257a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 258a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 259a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 260a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * retuns the suggested direction of the string 261a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 262a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return "rtl" || "ltr" 263a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 264a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) getSuggestedDirection: 
function(text) { 265a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) function sanitizeText() { 266a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return text.replace(/@\w+/, ""); 267a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 268a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 269a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) function countMatches(match) { 270a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var matches = text.match(new RegExp(match, "g")); 271a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return matches !== null ? matches.length : 0; 272a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 273a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 274a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) function isRTL() { 275a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); 276a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); 277a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 278a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // if 20% of chars are Hebrew or Arbic then direction is rtl 279a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return (count_heb + count_arb) * 100 / text.length > 20; 280a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 281a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 282a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) text = sanitizeText(text); 283a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return isRTL() ? 
"rtl" : "ltr"; 284a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 285a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 286a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 287a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Get the article title as an H1. 288a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 289a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return void 290a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 291a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) getArticleTitle: function () { 292a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var curTitle = "", 293a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) origTitle = ""; 294a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 295a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) try { 296a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curTitle = origTitle = document.title; 297a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ 298a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); 299a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 300a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 301a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) catch(e) {} 302a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 303a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(curTitle.match(/ [\|\-] /)) 304a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 305a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); 306a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 
307a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(curTitle.split(' ').length < 3) { 308a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); 309a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 310a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 311a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else if(curTitle.indexOf(': ') !== -1) 312a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 313a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); 314a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 315a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(curTitle.split(' ').length < 3) { 316a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); 317a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 318a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 319a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else if(curTitle.length > 150 || curTitle.length < 15) 320a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 321a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var hOnes = document.getElementsByTagName('h1'); 322a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(hOnes.length === 1) 323a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 324a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curTitle = readability.getInnerText(hOnes[0]); 325a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 326a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 327a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 328a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curTitle = curTitle.replace( readability.regexps.trim, "" ); 
329a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 330a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(curTitle.split(' ').length <= 4) { 331a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curTitle = origTitle; 332a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 333a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return curTitle; 334a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 335a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 336a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 337a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Prepare the HTML document for readability to scrape it. 338a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * This includes things like stripping javascript, CSS, and handling terrible markup. 339a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 340a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return void 341a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 342a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) prepDocument: function () { 343a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 344a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * In some cases a body element can't be found (if the HTML is totally hosed for example) 345a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * so we create a new body node and append it to the document. 
346a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) */ 347a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(document.body === null) 348a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 349a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var body = document.createElement("body"); 350a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) try { 351a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) document.body = body; 352a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 353a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) catch(e) { 354a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) document.documentElement.appendChild(body); 355a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg(e); 356a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 357a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 358a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 359a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) document.body.id = "readabilityBody"; 360a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 361a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var frames = document.getElementsByTagName('frame'); 362a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(frames.length > 0) 363a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 364a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var bestFrame = null; 365a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var bestFrameSize = 0; /* The frame to try to run readability upon. Must be on same domain. */ 366a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. 
*/ 367a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(var frameIndex = 0; frameIndex < frames.length; frameIndex+=1) 368a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 369a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight; 370a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var canAccessFrame = false; 371a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) try { 372a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var frameBody = frames[frameIndex].contentWindow.document.body; 373a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) canAccessFrame = true; 374a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 375a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) catch(eFrames) { 376a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg(eFrames); 377a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 378a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 379a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(frameSize > biggestFrameSize) { 380a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) biggestFrameSize = frameSize; 381a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.biggestFrame = frames[frameIndex]; 382a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 383a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 384a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(canAccessFrame && frameSize > bestFrameSize) 385a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 386a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.frameHack = true; 387a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 388a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) bestFrame = frames[frameIndex]; 
389a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) bestFrameSize = frameSize; 390a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 391a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 392a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 393a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(bestFrame) 394a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 395a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var newBody = document.createElement('body'); 396a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody); 397a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) newBody.style.overflow = 'scroll'; 398a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) document.body = newBody; 399a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 400a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var frameset = document.getElementsByTagName('frameset')[0]; 401a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(frameset) { 402a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) frameset.parentNode.removeChild(frameset); } 403a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 404a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 405a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 406a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Remove all stylesheets */ 407a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (var k=0;k < document.styleSheets.length; k+=1) { 408a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) { 409a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) document.styleSheets[k].disabled = true; 
410a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 411a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 412a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 413a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */ 414a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var styleTags = document.getElementsByTagName("style"); 415a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (var st=0;st < styleTags.length; st+=1) { 416a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) styleTags[st].textContent = ""; 417a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 418a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 419a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Turn all double br's into p's */ 420a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ 421a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.replaceDoubleBrsWithPs(document.body); 422a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.replaceFontsWithSpans(document.body); 423a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 424a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 425a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 426a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 427a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Prepare the article node for display. Clean out any inline styles, 428a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * iframes, forms, strip extraneous <p> tags, etc. 
429a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 430a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param Element 431a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return void 432a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 433a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) prepArticle: function (articleContent) { 434a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.cleanStyles(articleContent); 435a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.killBreaks(articleContent); 436a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 437a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Clean out junk from the article content */ 438a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.cleanConditionally(articleContent, "form"); 439a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.clean(articleContent, "object"); 440a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.clean(articleContent, "h1"); 441a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 442a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 443a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * If there is only one h2, they are probably using it 444a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * as a header and not a subheader, so remove it since we already have a header. 
445a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) ***/ 446a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(articleContent.getElementsByTagName('h2').length === 1) { 447a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.clean(articleContent, "h2"); 448a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 449a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.clean(articleContent, "iframe"); 450a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 451a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.cleanHeaders(articleContent); 452a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 453a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Do these last as the previous stuff may have removed junk that will affect these */ 454a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.cleanConditionally(articleContent, "table"); 455a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.cleanConditionally(articleContent, "ul"); 456a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.cleanConditionally(articleContent, "div"); 457a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 458a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Remove extra paragraphs */ 459a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var articleParagraphs = articleContent.getElementsByTagName('p'); 460a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(var i = articleParagraphs.length-1; i >= 0; i-=1) { 461a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var imgCount = articleParagraphs[i].getElementsByTagName('img').length; 462a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var embedCount = articleParagraphs[i].getElementsByTagName('embed').length; 463a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var objectCount 
= articleParagraphs[i].getElementsByTagName('object').length; 464a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 465a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') { 466a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); 467a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 468a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 469a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 470a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) try { 471a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.replaceBrsWithPs(articleContent); 472a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 473a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) catch (e) { 474a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e); 475a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 476a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 477a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 478a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 479a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Initialize a node with the readability object. Also checks the 480a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * className/id for special names to add to its score. 
481a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 482a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param Element 483a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return void 484a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 485a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) initializeNode: function (node) { 486a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) node.readability = {"contentScore": 0}; 487a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 488a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) switch(node.tagName) { 489a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'DIV': 490a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) node.readability.contentScore += 5; 491a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) break; 492a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 493a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'PRE': 494a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'TD': 495a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'BLOCKQUOTE': 496a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) node.readability.contentScore += 3; 497a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) break; 498a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 499a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'ADDRESS': 500a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'OL': 501a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'UL': 502a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'DL': 503a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'DD': 504a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'DT': 505a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'LI': 
506a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'FORM': 507a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) node.readability.contentScore -= 3; 508a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) break; 509a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 510a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'H1': 511a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'H2': 512a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'H3': 513a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'H4': 514a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'H5': 515a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'H6': 516a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) case 'TH': 517a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) node.readability.contentScore -= 5; 518a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) break; 519a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 520a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 521a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) node.readability.contentScore += readability.getClassWeight(node); 522a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 523a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 524a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /*** 525a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 526a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 527a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 528a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param page a document to run upon. 
Needs to be a full document, complete with body. 529a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return Element 530a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 531a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) grabArticle: function (pageToClone) { 532a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS), 533a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) isPaging = (page !== null) ? true: false; 534a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 535a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var page = null; 536a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Never work on the actual page. 537a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (isPaging) { 538a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) page = document.body.cloneNode(true); 539a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } else { 540a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) page = pageToClone.cloneNode(true); 541a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 542a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 543a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var allElements = page.getElementsByTagName('*'); 544a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 545a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 546a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs 547a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) 
548a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 549a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 550a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * TODO: Shouldn't this be a reverse traversal? 551a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 552a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var node = null; 553a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var nodesToScore = []; 554a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { 555a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Remove unlikely candidates */ 556a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (stripUnlikelyCandidates) { 557a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var unlikelyMatchString = node.className + node.id; 558a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if ( 559a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) ( 560a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 && 561a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 && 562a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) node.tagName !== "BODY" 563a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) ) 564a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) ) 565a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 566a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg("Removing unlikely candidate - " + unlikelyMatchString); 567a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) node.parentNode.removeChild(node); 
568a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) nodeIndex-=1; 569a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) continue; 570a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 571a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 572a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 573a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") { 574a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) nodesToScore[nodesToScore.length] = node; 575a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 576a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 577a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Turn all divs that don't have children block level elements into p's */ 578a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (node.tagName === "DIV") { 579a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (node.innerHTML.search(readability.regexps.divToPElements) === -1) { 580a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var newNode = document.createElement('p'); 581a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) try { 582a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.moveNodeInnards(node, newNode); 583a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) node.parentNode.replaceChild(newNode, node); 584a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) nodeIndex-=1; 585a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 586a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) nodesToScore[nodesToScore.length] = node; 587a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 588a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) catch(e) { 589a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg("Could not alter div to p, 
probably an IE restriction, reverting back to div.: " + e); 590a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 591a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 592a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else 593a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 594a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* EXPERIMENTAL */ 595a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(var i = 0, il = node.childNodes.length; i < il; i+=1) { 596a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var childNode = node.childNodes[i]; 597a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(childNode.nodeType === 3) { // Node.TEXT_NODE 598a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var p = document.createElement('p'); 599a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var t = document.createTextNode(childNode.nodeValue); 600a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) p.appendChild(t); 601a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) p.style.display = 'inline'; 602a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) p.className = 'readability-styled'; 603a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) childNode.parentNode.replaceChild(p, childNode); 604a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 605a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 606a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 607a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 608a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 609a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 610a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 611a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Loop through all paragraphs, and assign a score to them based on how content-y they look. 
612a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Then add their score to their parent node. 613a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 614a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. 615a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 616a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var candidates = []; 617a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (var pt=0; pt < nodesToScore.length; pt+=1) { 618a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var parentNode = nodesToScore[pt].parentNode; 619a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var grandParentNode = parentNode ? parentNode.parentNode : null; 620a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var innerText = readability.getInnerText(nodesToScore[pt]); 621a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 622a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!parentNode || typeof(parentNode.tagName) === 'undefined') { 623a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) continue; 624a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 625a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 626a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If this paragraph is less than 25 characters, don't even count it. */ 627a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(innerText.length < 25) { 628a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) continue; } 629a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 630a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Initialize readability data for the parent. 
*/ 631a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(typeof parentNode.readability === 'undefined') { 632a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.initializeNode(parentNode); 633a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) candidates.push(parentNode); 634a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 635a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 636a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Initialize readability data for the grandparent. */ 637a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') { 638a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.initializeNode(grandParentNode); 639a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) candidates.push(grandParentNode); 640a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 641a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 642a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var contentScore = 0; 643a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 644a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Add a point for the paragraph itself as a base. */ 645a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) contentScore+=1; 646a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 647a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Add points for any commas within this paragraph */ 648a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) contentScore += innerText.split(',').length; 649a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 650a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* For every 100 characters in this paragraph, add another point. Up to 3 points. 
*/ 651a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) contentScore += Math.min(Math.floor(innerText.length / 100), 3); 652a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 653a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Add the score to the parent. The grandparent gets half. */ 654a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) parentNode.readability.contentScore += contentScore; 655a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 656a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(grandParentNode) { 657a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) grandParentNode.readability.contentScore += contentScore/2; 658a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 659a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 660a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 661a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 662a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * After we've calculated scores, loop through all of the possible candidate nodes we found 663a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * and find the one with the highest score. 664a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 665a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var topCandidate = null; 666a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(var c=0, cl=candidates.length; c < cl; c+=1) 667a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 668a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 669a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Scale the final candidates score based on link density. Good content should have a 670a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * relatively small link density (5% or less) and be mostly unaffected by this operation. 
671a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 672a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c])); 673a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 674a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore); 675a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 676a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) { 677a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) topCandidate = candidates[c]; } 678a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 679a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 680a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 681a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * If we still have no top candidate, just use the body as a last resort. 682a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * We also have to copy the body node so it is something we can modify. 
683a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 684a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (topCandidate === null || topCandidate.tagName === "BODY") 685a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 686a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) topCandidate = document.createElement("DIV"); 687a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.replaceNodeInnards(page, topCandidate); 688a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) page.appendChild(topCandidate); 689a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.initializeNode(topCandidate); 690a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 691a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 692a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 693a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Now that we have the top candidate, look through its siblings for content that might also be related. 694a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Things like preambles, content split by ads that we removed, etc. 
695a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 696a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var articleContent = document.createElement("DIV"); 697a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (isPaging) { 698a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articleContent.id = "readability-content"; 699a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 700a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); 701a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var siblingNodes = topCandidate.parentNode.childNodes; 702a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 703a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 704a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(var s=0, sl=siblingNodes.length; s < sl; s+=1) { 705a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var siblingNode = siblingNodes[s]; 706a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var append = false; 707a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 708a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 709a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList. 
710a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Example of error visible here: http://www.esquire.com/features/honesty0707 711a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 712a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!siblingNode) { 713a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) continue; 714a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 715a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 716a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); 717a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown')); 718a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 719a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(siblingNode === topCandidate) 720a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 721a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) append = true; 722a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 723a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 724a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var contentBonus = 0; 725a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Give a bonus if sibling nodes and top candidates have the example same classname */ 726a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(siblingNode.className === topCandidate.className && topCandidate.className !== "") { 727a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) contentBonus += topCandidate.readability.contentScore * 0.2; 728a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 
729a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 730a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold) 731a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 732a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) append = true; 733a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 734a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 735a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(siblingNode.nodeName === "P") { 736a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var linkDensity = readability.getLinkDensity(siblingNode); 737a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var nodeContent = readability.getInnerText(siblingNode); 738a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var nodeLength = nodeContent.length; 739a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 740a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(nodeLength > 80 && linkDensity < 0.25) 741a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 742a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) append = true; 743a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 744a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) 745a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 746a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) append = true; 747a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 748a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 749a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 750a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(append) { 751a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne 
(Richard Coles) dbg("Appending node: " + siblingNode); 752a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 753a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var nodeToAppend = null; 754a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { 755a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ 756a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 757a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); 758a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) nodeToAppend = document.createElement("DIV"); 759a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) try { 760a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) nodeToAppend.id = siblingNode.id; 761a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.moveNodeInnards(siblingNode, nodeToAppend); 762a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 763a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) catch(er) { 764a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); 765a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) nodeToAppend = siblingNode; 766a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) s-=1; 767a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) sl-=1; 768a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 769a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } else { 770a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) nodeToAppend = siblingNode; 771a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne 
(Richard Coles) s-=1; 772a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) sl-=1; 773a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 774a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 775a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* To ensure a node does not interfere with readability styles, remove its classnames */ 776a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) nodeToAppend.className = ""; 777a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 778a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Append sibling and subtract from our list because it removes the node when you append to another node */ 779a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articleContent.appendChild(nodeToAppend); 780a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 781a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 782a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 783a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 784a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * So we have all of the content that we need. Now we clean it up for presentation. 
785a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 786a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.distilledArticleContent = articleContent.cloneNode(true); 787a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) //readability.prepArticle(articleContent); 788a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 789a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (readability.curPageNum === 1) { 790a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var newNode = document.createElement('div'); 791a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) newNode.id = "readability-page-1"; 792a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) newNode.setAttribute("class", "page"); 793a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.moveNodeInnards(articleContent, newNode); 794a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articleContent.appendChild(newNode); 795a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 796a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 797a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 798a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Now that we've gone through the full algorithm, check to see if we got any meaningful content. 799a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher 800a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * likelihood of finding the content, and the sieve approach gives us a higher likelihood of 801a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * finding the -right- content. 
802a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 803a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(readability.getInnerText(articleContent, false).length < 250) { 804a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { 805a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); 806a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return readability.grabArticle(document.body); 807a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 808a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { 809a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); 810a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return readability.grabArticle(document.body); 811a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 812a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { 813a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); 814a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return readability.grabArticle(document.body); 815a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } else { 816a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return null; 817a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 818a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 819a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 820a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return articleContent; 821a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 
822a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 823a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 824a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Removes script tags from the document. 825a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 826a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param Element 827a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 828a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) removeScripts: function (doc) { 829a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var scripts = doc.getElementsByTagName('script'); 830a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(var i = scripts.length-1; i >= 0; i-=1) 831a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 832a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) 833a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 834a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) scripts[i].nodeValue=""; 835a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) scripts[i].removeAttribute('src'); 836a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (scripts[i].parentNode) { 837a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) scripts[i].parentNode.removeChild(scripts[i]); 838a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 839a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 840a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 841a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 842a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 843a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 844a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Get the inner text 
of a node - cross browser compatibly. 845a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * This also strips out any excess whitespace to be found. 846a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 847a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param Element 848a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return string 849a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 850a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) getInnerText: function (e, normalizeSpaces) { 851a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var textContent = ""; 852a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 853a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") { 854a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return ""; 855a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 856a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 857a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? 
true : normalizeSpaces; 858a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 859a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (navigator.appName === "Microsoft Internet Explorer") { 860a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) textContent = e.innerText.replace( readability.regexps.trim, "" ); } 861a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else { 862a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) textContent = e.textContent.replace( readability.regexps.trim, "" ); } 863a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 864a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(normalizeSpaces) { 865a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return textContent.replace( readability.regexps.normalize, " "); } 866a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else { 867a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return textContent; } 868a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 869a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 870a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 871a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Get the number of times a string s appears in the node e. 872a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 873a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param Element 874a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param string - what to split on. 
Default is "," 875a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return number (integer) 876a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 877a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) getCharCount: function (e,s) { 878a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) s = s || ","; 879a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return readability.getInnerText(e).split(s).length-1; 880a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 881a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 882a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 883a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Remove the style attribute on every e and under. 884a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * TODO: Test if getElementsByTagName(*) is faster. 885a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 886a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param Element 887a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return void 888a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 889a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) cleanStyles: function (e) { 890a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) e = e || document; 891a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var cur = e.firstChild; 892a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 893a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!e) { 894a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return; } 895a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 896a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Remove any root styles, if we're able. 
897a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') { 898a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) e.removeAttribute('style'); } 899a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 900a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Go until there are no more child nodes 901a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) while ( cur !== null ) { 902a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if ( cur.nodeType === 1 ) { 903a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Remove style attribute(s) : 904a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(cur.className !== "readability-styled") { 905a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) cur.removeAttribute("style"); 906a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 907a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.cleanStyles( cur ); 908a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 909a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) cur = cur.nextSibling; 910a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 911a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 912a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 913a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 914a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Get the density of links as a percentage of the content 915a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * This is the amount of text that is inside a link divided by the total text in the node. 
916a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 917a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param Element 918a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return number (float) 919a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 920a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) getLinkDensity: function (e) { 921a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var links = e.getElementsByTagName("a"); 922a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var textLength = readability.getInnerText(e).length; 923a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var linkLength = 0; 924a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(var i=0, il=links.length; i<il;i+=1) 925a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 926a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkLength += readability.getInnerText(links[i]).length; 927a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 928a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 929a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return linkLength / textLength; 930a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 931a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 932a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 933a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. 
934a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 935a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @author Dan Lacy 936a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return string the base url 937a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 938a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) findBaseUrl: function () { 939a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var noUrlParams = window.location.pathname.split("?")[0], 940a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) urlSlashes = noUrlParams.split("/").reverse(), 941a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) cleanedSegments = [], 942a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) possibleType = ""; 943a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 944a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { 945a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var segment = urlSlashes[i]; 946a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 947a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Split off and save anything that looks like a file type. 948a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (segment.indexOf(".") !== -1) { 949a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) possibleType = segment.split(".")[1]; 950a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 951a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If the type isn't alpha-only, it's probably not actually a file extension. 
*/ 952a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!possibleType.match(/[^a-zA-Z]/)) { 953a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) segment = segment.split(".")[0]; 954a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 955a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 956a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 957a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 958a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * EW-CMS specific segment replacement. Ugly. 959a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html 960a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 961a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(segment.indexOf(',00') !== -1) { 962a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) segment = segment.replace(',00', ''); 963a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 964a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 965a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // If our first or second segment has anything looking like a page number, remove it. 
966a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) { 967a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, ""); 968a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 969a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 970a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 971a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var del = false; 972a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 973a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */ 974a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (i < 2 && segment.match(/^\d{1,2}$/)) { 975a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) del = true; 976a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 977a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 978a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If this is the first segment and it's just "index", remove it. */ 979a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(i === 0 && segment.toLowerCase() === "index") { 980a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) del = true; 981a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 982a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 983a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 984a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. 
*/ 985a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { 986a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) del = true; 987a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 988a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 989a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If it's not marked for deletion, push it to cleanedSegments. */ 990a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (!del) { 991a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) cleanedSegments.push(segment); 992a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 993a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 994a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 995a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // This is our final, cleaned, base article URL. 996a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/"); 997a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 998a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 999a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1000a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Look for any paging links that may occur within the document. 
1001a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 1002a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param body 1003a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return object (array) 1004a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1005a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) findNextPageLink: function (elem) { 1006a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var possiblePages = {}, 1007a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) allLinks = elem.getElementsByTagName('a'), 1008a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articleBaseUrl = readability.findBaseUrl(); 1009a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1010a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1011a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Loop through all links, looking for hints that they may be next-page links. 1012a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Things like having "page" in their textContent, className or id, or being a child 1013a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * of a node with a page-y className or id. 1014a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 1015a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Also possible: levenshtein distance? longest common subsequence? 
1016a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 1017a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * After we do that, assign each page a score, and 1018a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1019a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(var i = 0, il = allLinks.length; i < il; i+=1) { 1020a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var link = allLinks[i], 1021a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ''); 1022a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1023a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If we've already seen this page, ignore it */ 1024a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) { 1025a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) continue; 1026a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1027a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1028a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If it's on a different domain, skip it. */ 1029a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(window.location.host !== linkHref.split(/\/+/g)[1]) { 1030a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) continue; 1031a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1032a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1033a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var linkText = readability.getInnerText(link); 1034a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1035a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If the linkText looks like it's not the next page, skip it. 
*/ 1036a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) { 1037a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) continue; 1038a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1039a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1040a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */ 1041a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); 1042a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!linkHrefLeftover.match(/\d/)) { 1043a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) continue; 1044a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1045a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1046a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!(linkHref in possiblePages)) { 1047a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; 1048a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } else { 1049a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) possiblePages[linkHref].linkText += ' | ' + linkText; 1050a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1051a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1052a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var linkObj = possiblePages[linkHref]; 1053a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1054a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1055a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower. 
1056a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html 1057a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1058a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(linkHref.indexOf(articleBaseUrl) !== 0) { 1059a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkObj.score -= 25; 1060a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1061a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1062a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var linkData = linkText + ' ' + link.className + ' ' + link.id; 1063a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(linkData.match(readability.regexps.nextLink)) { 1064a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkObj.score += 50; 1065a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1066a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(linkData.match(/pag(e|ing|inat)/i)) { 1067a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkObj.score += 25; 1068a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1069a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, 1070a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. 
*/ 1071a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!linkObj.linkText.match(readability.regexps.nextLink)) { 1072a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkObj.score -= 65; 1073a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1074a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1075a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) { 1076a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkObj.score -= 50; 1077a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1078a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(linkData.match(readability.regexps.prevLink)) { 1079a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkObj.score -= 200; 1080a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1081a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1082a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If a parentNode contains page or paging or paginat */ 1083a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var parentNode = link.parentNode, 1084a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) positiveNodeMatch = false, 1085a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) negativeNodeMatch = false; 1086a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) while(parentNode) { 1087a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id; 1088a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) { 1089a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) positiveNodeMatch = true; 1090a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkObj.score += 25; 
1091a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1092a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) { 1093a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */ 1094a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!parentNodeClassAndId.match(readability.regexps.positive)) { 1095a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkObj.score -= 25; 1096a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) negativeNodeMatch = true; 1097a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1098a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1099a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1100a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) parentNode = parentNode.parentNode; 1101a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1102a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1103a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1104a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * If the URL looks like it has paging in it, add to the score. 
1105a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 1106a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1107a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) { 1108a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkObj.score += 25; 1109a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1110a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1111a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* If the URL contains negative values, give a slight decrease. */ 1112a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (linkHref.match(readability.regexps.extraneous)) { 1113a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkObj.score -= 15; 1114a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1115a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1116a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1117a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Minor punishment to anything that doesn't match our current URL. 1118a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points. 1119a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Dan, can you show me a counterexample where this is necessary? 
1120a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * if (linkHref.indexOf(window.location.href) !== 0) { 1121a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * linkObj.score -= 1; 1122a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * } 1123a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1124a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1125a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1126a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * If the link text can be parsed as a number, give it a minor bonus, with a slight 1127a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * bias towards lower numbered pages. This is so that pages that might not have 'next' 1128a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * in their text can still get scored, and sorted properly by score. 1129a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1130a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var linkTextAsNumber = parseInt(linkText, 10); 1131a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(linkTextAsNumber) { 1132a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Punish 1 since we're either already there, or it's probably before what we want anyways. 
1133a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (linkTextAsNumber === 1) { 1134a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkObj.score -= 10; 1135a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1136a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else { 1137a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Todo: Describe this better 1138a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) linkObj.score += Math.max(0, 10 - linkTextAsNumber); 1139a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1140a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1141a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1142a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1143a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1144a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Loop thrugh all of our possible pages from above and find our top candidate for the next page URL. 1145a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Require at least a score of 50, which is a relatively high confidence that this page is the next link. 
1146a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1147a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var topPage = null; 1148a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(var page in possiblePages) { 1149a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(possiblePages.hasOwnProperty(page)) { 1150a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) { 1151a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) topPage = possiblePages[page]; 1152a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1153a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1154a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1155a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1156a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(topPage) { 1157a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var nextHref = topPage.href.replace(/\/$/,''); 1158a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1159a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg('NEXT PAGE IS ' + nextHref); 1160a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.parsedPages[nextHref] = true; 1161a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return nextHref; 1162a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1163a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else { 1164a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return null; 1165a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1166a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1167a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1168a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) createLinkDiv: function(link) { 
1169a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var divNode = document.createElement('div'); 1170a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var aNode = document.createElement('a'); 1171a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var tNode = document.createTextNode('View Next Page'); 1172a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) divNode.setAttribute('style', 'text-align: center'); 1173a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) aNode.setAttribute('href', link); 1174a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) aNode.appendChild(tNode); 1175a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) divNode.appendChild(aNode); 1176a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return divNode; 1177a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1178a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1179a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) xhr: function () { 1180a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) { 1181a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return new XMLHttpRequest(); 1182a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1183a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else { 1184a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { } 1185a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { } 1186a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { } 1187a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1188a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard 
Coles) 1189a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return false; 1190a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1191a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1192a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) successfulRequest: function (request) { 1193a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText); 1194a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1195a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1196a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) ajax: function (url, options) { 1197a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var request = readability.xhr(); 1198a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1199a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) function respondToReadyState(readyState) { 1200a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (request.readyState === 4) { 1201a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (readability.successfulRequest(request)) { 1202a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (options.success) { options.success(request); } 1203a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1204a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else { 1205a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (options.error) { options.error(request); } 1206a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1207a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1208a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1209a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1210a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (typeof options === 'undefined') { options = 
{}; } 1211a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1212a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) request.onreadystatechange = respondToReadyState; 1213a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1214a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) request.open('get', url, true); 1215a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) request.setRequestHeader('Accept', 'text/html'); 1216a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1217a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) try { 1218a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) request.send(options.postBody); 1219a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1220a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) catch (e) { 1221a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (options.error) { options.error(); } 1222a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1223a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1224a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return request; 1225a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1226a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1227a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1228a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Make an AJAX request for each page and append it to the document. 
1229a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1230a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curPageNum: 1, 1231a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1232a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) appendNextPage: function (nextPageLink) { 1233a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.curPageNum+=1; 1234a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1235a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var articlePage = document.createElement("DIV"); 1236a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articlePage.id = 'readability-page-' + readability.curPageNum; 1237a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articlePage.className = 'page'; 1238a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">§</p>'; 1239a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1240a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) document.getElementById("readability-content").appendChild(articlePage); 1241a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1242a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(readability.curPageNum > readability.maxPages) { 1243a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var linkDiv = readability.createLinkDiv(nextPageLink); 1244a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1245a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articlePage.appendChild(linkDiv); 1246a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return; 1247a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1248a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1249a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1250a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne 
(Richard Coles) * Now that we've built the article page DOM element, get the page content 1251a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * asynchronously and load the cleaned content into the div we created for it. 1252a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1253a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) (function(pageUrl, thisPage) { 1254a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.ajax(pageUrl, { 1255a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) success: function(r) { 1256a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1257a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ 1258a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var eTag = r.getResponseHeader('ETag'); 1259a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(eTag) { 1260a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(eTag in readability.pageETags) { 1261a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg("Exact duplicate page found via ETag. Aborting."); 1262a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articlePage.style.display = 'none'; 1263a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return; 1264a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } else { 1265a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.pageETags[eTag] = 1; 1266a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1267a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1268a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1269a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. 
1270a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var page = document.createElement("DIV"); 1271a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1272a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1273a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Do some preprocessing to our HTML to make it ready for appending. 1274a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. 1275a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript. 1276a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * • Turn all double br's into p's - was handled by prepDocument in the original view. 1277a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages. 
1278a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1279a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var pageInnards = r.responseXML; 1280a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.removeScripts(pageInnards); 1281a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.replaceNoscriptsWithPs(pageInnards); 1282a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.replaceDoubleBrsWithPs(pageInnards); 1283a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.replaceFontsWithSpans(pageInnards); 1284a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) page.appendChild(pageInnards); 1285a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1286a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1287a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1288a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle. 1289a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1290a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.flags = 0x1 | 0x2 | 0x4; 1291a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1292a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var nextPageLink = readability.findNextPageLink(page), 1293a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) content = readability.grabArticle(page); 1294a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1295a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!content) { 1296a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg("No content found in page to append. 
Aborting."); 1297a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return; 1298a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1299a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1300a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1301a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page. 1302a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Compare it against all of the the previous document's we've gotten. If the previous 1303a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate. 1304a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1305a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null; 1306a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(firstP && firstP.innerHTML.length > 100) { 1307a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(var i=1; i <= readability.curPageNum; i+=1) { 1308a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var rPage = document.getElementById('readability-page-' + i); 1309a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) { 1310a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg('Duplicate of page ' + i + ' - skipping.'); 1311a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) articlePage.style.display = 'none'; 1312a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.parsedPages[pageUrl] = true; 1313a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return; 1314a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1315a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard 
Coles) } 1316a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1317a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1318a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.removeScripts(content); 1319a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1320a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.moveNodeInnards(content, thisPage); 1321a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1322a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1323a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * After the page has rendered, post process the content. This delay is necessary because, 1324a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * in webkit at least, offsetWidth is not set in time to determine image width. We have to 1325a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * wait a little bit for reflow to finish before we can fix floating images. 
1326a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1327a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) window.setTimeout( 1328a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) function() { readability.postProcessContent(thisPage); }, 1329a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 500 1330a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) ); 1331a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1332a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(nextPageLink) { 1333a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.appendNextPage(nextPageLink); 1334a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1335a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1336a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }); 1337a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }(nextPageLink, articlePage)); 1338a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1339a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1340a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1341a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Get an elements class/id weight. Uses regular expressions to tell if this 1342a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * element looks good or bad. 
1343a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 1344a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param Element 1345a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return number (Integer) 1346a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1347a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) getClassWeight: function (e) { 1348a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { 1349a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return 0; 1350a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1351a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1352a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var weight = 0; 1353a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1354a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Look for a special classname */ 1355a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (typeof(e.className) === 'string' && e.className !== '') 1356a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 1357a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(e.className.search(readability.regexps.negative) !== -1) { 1358a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) weight -= 25; } 1359a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1360a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(e.className.search(readability.regexps.positive) !== -1) { 1361a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) weight += 25; } 1362a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1363a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1364a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Look for a special ID */ 1365a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (typeof(e.id) === 
'string' && e.id !== '') 1366a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 1367a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(e.id.search(readability.regexps.negative) !== -1) { 1368a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) weight -= 25; } 1369a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1370a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(e.id.search(readability.regexps.positive) !== -1) { 1371a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) weight += 25; } 1372a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1373a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1374a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return weight; 1375a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1376a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1377a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) nodeIsVisible: function (node) { 1378a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none'; 1379a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1380a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1381a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1382a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Remove extraneous break tags from a node. 
1383a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 1384a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param Element 1385a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return void 1386a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1387a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) killBreaks: function (e) { 1388a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var allElements = e.getElementsByTagName('*'); 1389a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) while (i < allElements.length) { 1390a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.deleteExtraBreaks(allElements[i]); 1391a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) i++; 1392a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1393a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1394a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1395a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1396a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Clean a node of all elements of type "tag". 1397a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * (Unless it's a youtube/vimeo video. People love movies.) 
1398a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 1399a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param Element 1400a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param string tag to clean 1401a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return void 1402a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1403a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) clean: function (e, tag) { 1404a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var targetList = e.getElementsByTagName( tag ); 1405a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var isEmbed = (tag === 'object' || tag === 'embed'); 1406a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1407a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (var y=targetList.length-1; y >= 0; y-=1) { 1408a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Allow youtube and vimeo videos through as people usually want to see those. 
*/ 1409a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(isEmbed) { 1410a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var attributeValues = ""; 1411a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { 1412a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) attributeValues += targetList[y].attributes[i].value + '|'; 1413a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1414a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1415a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* First, check the elements attributes to see if any of them contain youtube or vimeo */ 1416a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (attributeValues.search(readability.regexps.videos) !== -1) { 1417a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) continue; 1418a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1419a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1420a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /* Then check the elements inside this element for the same. 
*/ 1421a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) { 1422a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) continue; 1423a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1424a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1425a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1426a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1427a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) targetList[y].parentNode.removeChild(targetList[y]); 1428a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1429a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1430a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1431a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1432a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Clean an element of all tags of type "tag" if they look fishy. 1433a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. 
1434a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 1435a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return void 1436a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1437a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) cleanConditionally: function (e, tag) { 1438a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1439a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { 1440a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return; 1441a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1442a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1443a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var tagsList = e.getElementsByTagName(tag); 1444a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var curTagsLength = tagsList.length; 1445a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1446a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1447a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Gather counts for other typical elements embedded within. 1448a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Traverse backwards so we can remove nodes at the same time without effecting the traversal. 1449a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 1450a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * TODO: Consider taking into account original contentScore here. 
1451a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1452a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (var i=curTagsLength-1; i >= 0; i-=1) { 1453a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var weight = readability.getClassWeight(tagsList[i]); 1454a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; 1455a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1456a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : '')); 1457a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1458a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(weight+contentScore < 0) 1459a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) { 1460a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) tagsList[i].parentNode.removeChild(tagsList[i]); 1461a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1462a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) else if ( readability.getCharCount(tagsList[i],',') < 10) { 1463a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1464a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * If there are not very many commas, and the number of 1465a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. 
1466a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1467a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var p = tagsList[i].getElementsByTagName("p").length; 1468a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var img = tagsList[i].getElementsByTagName("img").length; 1469a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var li = tagsList[i].getElementsByTagName("li").length-100; 1470a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var input = tagsList[i].getElementsByTagName("input").length; 1471a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1472a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var embedCount = 0; 1473a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var embeds = tagsList[i].getElementsByTagName("embed"); 1474a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(var ei=0,il=embeds.length; ei < il; ei+=1) { 1475a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (embeds[ei].src.search(readability.regexps.videos) === -1) { 1476a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) embedCount+=1; 1477a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1478a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1479a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1480a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var linkDensity = readability.getLinkDensity(tagsList[i]); 1481a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var contentLength = readability.getInnerText(tagsList[i]).length; 1482a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var toRemove = false; 1483a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1484a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if ( img > p ) { 1485a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) toRemove = true; 1486a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne 
(Richard Coles) } else if(li > p && tag !== "ul" && tag !== "ol") { 1487a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) toRemove = true; 1488a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } else if( input > Math.floor(p/3) ) { 1489a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) toRemove = true; 1490a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } else if(contentLength < 25 && (img === 0 || img > 2) ) { 1491a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) toRemove = true; 1492a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } else if(weight < 25 && linkDensity > 0.2) { 1493a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) toRemove = true; 1494a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } else if(weight >= 25 && linkDensity > 0.5) { 1495a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) toRemove = true; 1496a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) { 1497a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) toRemove = true; 1498a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1499a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1500a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if(toRemove) { 1501a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) tagsList[i].parentNode.removeChild(tagsList[i]); 1502a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1503a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1504a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1505a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1506a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1507a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) /** 1508a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Clean out spurious headers from 
an Element. Checks things like classnames and link density. 1509a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * 1510a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @param Element 1511a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * @return void 1512a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) **/ 1513a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) cleanHeaders: function (e) { 1514a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (var headerIndex = 1; headerIndex < 3; headerIndex+=1) { 1515a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var headers = e.getElementsByTagName('h' + headerIndex); 1516a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (var i=headers.length-1; i >=0; i-=1) { 1517a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) { 1518a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) headers[i].parentNode.removeChild(headers[i]); 1519a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1520a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1521a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1522a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1523a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1524a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) flagIsActive: function(flag) { 1525a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return (readability.flags & flag) > 0; 1526a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1527a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1528a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) addFlag: function(flag) { 1529a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.flags = readability.flags | flag; 
1530a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1531a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1532a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) removeFlag: function(flag) { 1533a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.flags = readability.flags & ~flag; 1534a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1535a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1536a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Removes the children of |src| and appends them to |dest|. 1537a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) moveNodeInnards: function(src, dest) { 1538a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) try { 1539a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) while (src.firstChild) { 1540a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dest.appendChild(src.removeChild(src.firstChild)); 1541a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1542a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } catch (e) {} 1543a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1544a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1545a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Returns true if the node is a whitespace text node. 
1546a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) isWhitespaceNode: function(node) { 1547a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (node.nodeType == Node.TEXT_NODE) { 1548a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (node.data.trim().length == 0) { 1549a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return true; 1550a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1551a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1552a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return false; 1553a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1554a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1555a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Returns true if the node is a <BR>. 1556a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) isBrNode: function(node) { 1557a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return (node.tagName === 'BR'); 1558a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1559a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1560a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1561a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Returns the last <BR> node in a sequence of <BR> nodes that are only 1562a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // separated by whitespace, or null if there are not at least two <BR> tags 1563a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // in the sibling chain starting with |node|. Returns the second such <BR> 1564a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // node if |restrictToTwo| is true. 
1565a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) isMultipleBr: function(node, restrictToTwo) { 1566a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var lastBr = null; 1567a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (!readability.isBrNode(node)) { 1568a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return lastBr; 1569a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1570a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var curr = node.nextSibling; 1571a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) while (curr) { 1572a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (readability.isWhitespaceNode(curr) || readability.isBrNode(curr)) { 1573a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) lastBr = curr; 1574a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curr = curr.nextSibling; 1575a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (restrictToTwo) { 1576a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (readability.isBrNode(lastBr)) { 1577a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return lastBr; 1578a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1579a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1580a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) continue; 1581a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1582a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) break; 1583a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1584a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return lastBr; 1585a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1586a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1587a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Removes all <BR> nodes except one and whitespace in between in a series 
1588a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // of <BR> nodes. 1589a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) deleteExtraBreaks: function(node) { 1590a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var lastBr = readability.isMultipleBr(node, false); 1591a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var ret = false; 1592a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) while (lastBr && lastBr != node) { 1593a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var toRemove = lastBr; 1594a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) lastBr = lastBr.previousSibling; 1595a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) toRemove.parentNode.removeChild(toRemove); 1596a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) ret = true; 1597a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1598a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return ret; 1599a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1600a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1601a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a 1602a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // <P> node, and makes all next siblings of that pair children of <P>, up 1603a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // until the next pair of <BR> nodes is reached. 1604a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) replaceDoubleBrWithP: function(node) { 1605a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Check that we are starting with a BR. 
1606a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var second = readability.isMultipleBr(node, true); 1607a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (!second) { 1608a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return; 1609a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1610a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Make all next siblings of the second BR into children of a P. 1611a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var p = document.createElement('p'); 1612a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var curr = second.nextSibling; 1613a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) while (curr) { 1614a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (readability.isMultipleBr(curr, true)) { 1615a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) break; 1616a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1617a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var next = curr.nextSibling; 1618a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) p.appendChild(curr.parentNode.removeChild(curr)); 1619a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curr = next; 1620a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1621a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var ret = curr; 1622a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1623a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Remove all nodes between the first and second BR. 
1624a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curr = node.nextSibling; 1625a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) while (curr && curr != second) { 1626a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var next = curr.nextSibling; 1627a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curr.parentNode.removeChild(curr); 1628a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curr = next; 1629a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1630a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Remove the second BR. 1631a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) second.parentNode.removeChild(second); 1632a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Replace the first BR with the P. 1633a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) node.parentNode.replaceChild(p, node); 1634a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1635a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return ret; 1636a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1637a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1638a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Returns true if the NodeList contains a double <BR>. 
1639a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) hasDoubleBr: function(nodeList) { 1640a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (var i = 0; i < nodeList.length; nodeList++) { 1641a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (readability.isMultipleBr(nodeList[i], true)) { 1642a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return true; 1643a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1644a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1645a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return false; 1646a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1647a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1648a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Replaces double <BR> tags with <P> tags. 1649a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) replaceDoubleBrsWithPs: function(node) { 1650a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var allElements = node.getElementsByTagName('BR'); 1651a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var node = null; 1652a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) while (allElements && allElements.length > 0 && 1653a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.hasDoubleBr(allElements)) { 1654a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) { 1655a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var next = node; 1656a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) while (next = readability.replaceDoubleBrWithP(next)); 1657a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1658a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) allElements = document.body.getElementsByTagName('BR'); 1659a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard 
Coles) } 1660a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1661a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1662a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1663a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Replaces a BR and the whitespace that follows it with a P. 1664a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) replaceBrWithP: function(node) { 1665a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (!readability.isBrNode(node)) { 1666a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return; 1667a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1668a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var p = document.createElement('p'); 1669a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var curr = node.nextSibling; 1670a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) while (curr && !isBrNode(curr)) { 1671a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var next = curr.nextSibling; 1672a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) if (readability.isWhitespaceNode(curr)) { 1673a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curr.parentNode.removeChild(curr); 1674a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } else { 1675a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) p.appendChild(curr.parentNode.removeChild(curr)); 1676a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1677a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) curr = next; 1678a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1679a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) node.parentNode.replaceChild(p, node); 1680a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return curr; 1681a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1682a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard 
Coles) 1683a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag 1684a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // children of the <P>. 1685a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) replaceBrsWithPs: function(node) { 1686a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var allElements = node.getElementsByTagName('BR'); 1687a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var node = null; 1688a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) while (allElements && allElements.length > 0) { 1689a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) { 1690a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var next = node; 1691a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) while (next = readability.replaceBrWithP(next)); 1692a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1693a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) allElements = document.body.getElementsByTagName('BR'); 1694a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1695a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1696a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1697a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Replaces any tag with any other tag. 
1698a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) replaceTagsWithTags: function(node, srcTag, destTag) { 1699a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var allElements = node.getElementsByTagName(srcTag); 1700a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for (var i = 0; i < allElements.length; i++) { 1701a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var dest = document.createElement(destTag); 1702a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.moveNodeInnards(allElements[i], dest); 1703a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) allElements[i].parentNode.replaceChild(dest, allElements[i]); 1704a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1705a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1706a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1707a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Replaces all <noscript> tags with <p> tags. 1708a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) replaceNoscriptsWithPs: function(node) { 1709a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.replaceTagsWithTags(node, 'noscript', 'p'); 1710a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1711a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1712a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Replaces all <font> tags with <span> tags. 1713a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) replaceFontsWithSpans: function(node) { 1714a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) readability.replaceTagsWithTags(node, 'font', 'span'); 1715a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1716a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1717a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Returns a list of image URLs in the distilled article. 
1718a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) getImages : function() { 1719a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var images = document.getElementsByTagName('img'); 1720a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) var result = new Array(images.length); 1721a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg("Number of images: " + images.length); 1722a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) for(i = 0; i < images.length; i++) { 1723a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) result[i] = images[i].src; 1724a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) dbg("Image: " + result[i]); 1725a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1726a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return result; 1727a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1728a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1729a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Returns the distilled article HTML from the page(s). 1730a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) getDistilledArticleHTML : function() { 1731a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return readability.distilledHTML; 1732a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) }, 1733a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) 1734a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) // Returns the next page of this article. 1735a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) getNextPageLink : function() { 1736a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) return readability.nextPageLink; 1737a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) } 1738a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)}; 1739