1// Copyright 2014 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5// Local modifications to this file are described in the README.chromium 6// file. 7 8var dbg = (typeof console !== 'undefined') ? function(s) { 9 console.log("Readability: " + s); 10} : function() {}; 11 12/* 13 * Readability. An Arc90 Lab Experiment. 14 * Website: http://lab.arc90.com/experiments/readability 15 * Source: http://code.google.com/p/arc90labs-readability 16 * 17 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. 18 * 19 * Copyright (c) 2010 Arc90 Inc 20 * Readability is licensed under the Apache License, Version 2.0. 21**/ 22var readability = { 23 readStyle: "style-newspaper", 24 readSize: "size-medium", 25 readMargin: "margin-wide", 26 27 distilledHTML: '', 28 distilledArticleContent: null, 29 nextPageLink: '', 30 31 version: '1.7.1', 32 iframeLoads: 0, 33 convertLinksToFootnotes: false, 34 reversePageScroll: false, /* If they hold shift and hit space, scroll up */ 35 frameHack: false, /** 36 * The frame hack is to workaround a firefox bug where if you 37 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear. 38 * So we fake a scrollbar in the wrapping div. 39 **/ 40 biggestFrame: false, 41 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ 42 43 /* constants */ 44 FLAG_STRIP_UNLIKELYS: 0x1, 45 FLAG_WEIGHT_CLASSES: 0x2, 46 FLAG_CLEAN_CONDITIONALLY: 0x4, 47 48 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ 49 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ 50 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ 51 52 /** 53 * All of the regular expressions in use within readability. 54 * Defined up here so we don't instantiate them repeatedly in loops. 55 **/ 56 regexps: { 57 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i, 58 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, 59 positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, 60 negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, 61 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i, 62 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, 63 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, 64 replaceFonts: /<(\/?)font[^>]*>/gi, 65 trim: /^\s+|\s+$/g, 66 normalize: /\s{2,}/g, 67 killBreaks: /(<br\s*\/?>(\s| ?)*){1,}/g, 68 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, 69 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, 70 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last. 71 prevLink: /(prev|earl|old|new|<|«)/i 72 }, 73 74 /** 75 * Runs readability. 76 * 77 * Workflow: 78 * 1. Prep the document by removing script tags, css, etc. 79 * 2. Build readability's DOM tree. 80 * 3. Grab the article content from the current dom tree. 81 * 4. Replace the current DOM tree with the new one. 82 * 5. Read peacefully. 83 * 84 * @return void 85 **/ 86 init: function() { 87 /* Before we do anything, remove all scripts that are not readability. */ 88 window.onload = window.onunload = function() {}; 89 90 readability.removeScripts(document); 91 92 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */ 93 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; 94 95 /* Pull out any possible next page link first */ 96 readability.nextPageLink = readability.findNextPageLink(document.body); 97 98 /* We handle processing of nextPage from C++ set nextPageLink to null */ 99 var nextPageLink = null; 100 101 readability.prepDocument(); 102 103 /* Build readability's DOM tree */ 104 var overlay = document.createElement("DIV"); 105 var innerDiv = document.createElement("DIV"); 106 var articleTools = readability.getArticleTools(); 107 var articleTitleText = readability.getArticleTitle(); 108 var articleContent = readability.grabArticle(); 109 110 if(!articleContent) { 111 articleContent = document.createElement("DIV"); 112 articleContent.id = "readability-content"; 113 articleContent.innerHTML = [ 114 "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p>", 115 (readability.frameHack ? "<p><strong>It appears this page uses frames.</strong> Unfortunately, browser security properties often cause Readability to fail on pages that include frames." : ""), 116 "<p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>" 117 ].join(''); 118 119 nextPageLink = null; 120 } 121 122 overlay.id = "readOverlay"; 123 innerDiv.id = "readInner"; 124 125 /* Apply user-selected styling */ 126 document.body.className = readability.readStyle; 127 document.dir = readability.getSuggestedDirection(articleTitleText); 128 129 if (readability.readStyle === "style-athelas" || readability.readStyle === "style-apertura"){ 130 overlay.className = readability.readStyle + " rdbTypekit"; 131 } else { 132 overlay.className = readability.readStyle; 133 } 134 innerDiv.className = readability.readMargin + " " + readability.readSize; 135 136 if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) { 137 readability.convertLinksToFootnotes = true; 138 } 139 140 readability.distilledHTML = articleContent.innerHTML; 141 142 if(readability.frameHack) { 143 var readOverlay = document.getElementById('readOverlay'); 144 readOverlay.style.height = '100%'; 145 readOverlay.style.overflow = 'auto'; 146 } 147 148 /** 149 * If someone tries to use Readability on a site's root page, give them a warning about usage. 150 **/ 151 if((window.location.protocol + "//" + window.location.host + "/") === window.location.href) { 152 articleContent.style.display = "none"; 153 var rootWarning = document.createElement('p'); 154 rootWarning.id = "readability-warning"; 155 rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " + 156 "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue."; 157 158 innerDiv.insertBefore( rootWarning, articleContent ); 159 } 160 161 readability.postProcessContent(articleContent); 162 163 window.scrollTo(0, 0); 164 165 if (nextPageLink) { 166 /** 167 * Append any additional pages after a small timeout so that people 168 * can start reading without having to wait for this to finish processing. 169 **/ 170 window.setTimeout(function() { 171 readability.appendNextPage(nextPageLink); 172 }, 500); 173 } 174 175 /** Smooth scrolling **/ 176 document.onkeydown = function(e) { 177 var code = (window.event) ? event.keyCode : e.keyCode; 178 if (code === 16) { 179 readability.reversePageScroll = true; 180 return; 181 } 182 183 if (code === 32) { 184 readability.curScrollStep = 0; 185 var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight); 186 187 if(readability.reversePageScroll) { 188 readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); 189 } 190 else { 191 readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); 192 } 193 194 return false; 195 } 196 }; 197 198 document.onkeyup = function(e) { 199 var code = (window.event) ? event.keyCode : e.keyCode; 200 if (code === 16) { 201 readability.reversePageScroll = false; 202 return; 203 } 204 }; 205 }, 206 207 /** 208 * Run any post-process modifications to article content as necessary. 209 * 210 * @param Element 211 * @return void 212 **/ 213 postProcessContent: function(articleContent) { 214 if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) { 215 readability.addFootnotes(articleContent); 216 } 217 218 readability.fixImageFloats(articleContent); 219 }, 220 221 /** 222 * Some content ends up looking ugly if the image is too large to be floated. 223 * If the image is wider than a threshold (currently 55%), no longer float it, 224 * center it instead. 225 * 226 * @param Element 227 * @return void 228 **/ 229 fixImageFloats: function (articleContent) { 230 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55, 231 images = articleContent.getElementsByTagName('img'); 232 233 for(var i=0, il = images.length; i < il; i+=1) { 234 var image = images[i]; 235 236 if(image.offsetWidth > imageWidthThreshold) { 237 image.className += " blockImage"; 238 } 239 } 240 }, 241 242 /** 243 * Get the article tools Element that has buttons like reload, print. 244 * 245 * @return void 246 **/ 247 getArticleTools: function () { 248 var articleTools = document.createElement("DIV"); 249 250 articleTools.id = "readTools"; 251 articleTools.innerHTML = 252 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" + 253 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" + 254 "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>"; 255 256 return articleTools; 257 }, 258 259 /** 260 * retuns the suggested direction of the string 261 * 262 * @return "rtl" || "ltr" 263 **/ 264 getSuggestedDirection: function(text) { 265 function sanitizeText() { 266 return text.replace(/@\w+/, ""); 267 } 268 269 function countMatches(match) { 270 var matches = text.match(new RegExp(match, "g")); 271 return matches !== null ? matches.length : 0; 272 } 273 274 function isRTL() { 275 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); 276 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); 277 278 // if 20% of chars are Hebrew or Arbic then direction is rtl 279 return (count_heb + count_arb) * 100 / text.length > 20; 280 } 281 282 text = sanitizeText(text); 283 return isRTL() ? "rtl" : "ltr"; 284 }, 285 286 /** 287 * Get the article title as an H1. 288 * 289 * @return void 290 **/ 291 getArticleTitle: function () { 292 var curTitle = "", 293 origTitle = ""; 294 295 try { 296 curTitle = origTitle = document.title; 297 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ 298 curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); 299 } 300 } 301 catch(e) {} 302 303 if(curTitle.match(/ [\|\-] /)) 304 { 305 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); 306 307 if(curTitle.split(' ').length < 3) { 308 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); 309 } 310 } 311 else if(curTitle.indexOf(': ') !== -1) 312 { 313 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); 314 315 if(curTitle.split(' ').length < 3) { 316 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); 317 } 318 } 319 else if(curTitle.length > 150 || curTitle.length < 15) 320 { 321 var hOnes = document.getElementsByTagName('h1'); 322 if(hOnes.length === 1) 323 { 324 curTitle = readability.getInnerText(hOnes[0]); 325 } 326 } 327 328 curTitle = curTitle.replace( readability.regexps.trim, "" ); 329 330 if(curTitle.split(' ').length <= 4) { 331 curTitle = origTitle; 332 } 333 return curTitle; 334 }, 335 336 /** 337 * Prepare the HTML document for readability to scrape it. 338 * This includes things like stripping javascript, CSS, and handling terrible markup. 339 * 340 * @return void 341 **/ 342 prepDocument: function () { 343 /** 344 * In some cases a body element can't be found (if the HTML is totally hosed for example) 345 * so we create a new body node and append it to the document. 346 */ 347 if(document.body === null) 348 { 349 var body = document.createElement("body"); 350 try { 351 document.body = body; 352 } 353 catch(e) { 354 document.documentElement.appendChild(body); 355 dbg(e); 356 } 357 } 358 359 document.body.id = "readabilityBody"; 360 361 var frames = document.getElementsByTagName('frame'); 362 if(frames.length > 0) 363 { 364 var bestFrame = null; 365 var bestFrameSize = 0; /* The frame to try to run readability upon. Must be on same domain. */ 366 var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. */ 367 for(var frameIndex = 0; frameIndex < frames.length; frameIndex+=1) 368 { 369 var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight; 370 var canAccessFrame = false; 371 try { 372 var frameBody = frames[frameIndex].contentWindow.document.body; 373 canAccessFrame = true; 374 } 375 catch(eFrames) { 376 dbg(eFrames); 377 } 378 379 if(frameSize > biggestFrameSize) { 380 biggestFrameSize = frameSize; 381 readability.biggestFrame = frames[frameIndex]; 382 } 383 384 if(canAccessFrame && frameSize > bestFrameSize) 385 { 386 readability.frameHack = true; 387 388 bestFrame = frames[frameIndex]; 389 bestFrameSize = frameSize; 390 } 391 } 392 393 if(bestFrame) 394 { 395 var newBody = document.createElement('body'); 396 readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody); 397 newBody.style.overflow = 'scroll'; 398 document.body = newBody; 399 400 var frameset = document.getElementsByTagName('frameset')[0]; 401 if(frameset) { 402 frameset.parentNode.removeChild(frameset); } 403 } 404 } 405 406 /* Remove all stylesheets */ 407 for (var k=0;k < document.styleSheets.length; k+=1) { 408 if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) { 409 document.styleSheets[k].disabled = true; 410 } 411 } 412 413 /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */ 414 var styleTags = document.getElementsByTagName("style"); 415 for (var st=0;st < styleTags.length; st+=1) { 416 styleTags[st].textContent = ""; 417 } 418 419 /* Turn all double br's into p's */ 420 /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ 421 readability.replaceDoubleBrsWithPs(document.body); 422 readability.replaceFontsWithSpans(document.body); 423 }, 424 425 426 /** 427 * Prepare the article node for display. Clean out any inline styles, 428 * iframes, forms, strip extraneous <p> tags, etc. 429 * 430 * @param Element 431 * @return void 432 **/ 433 prepArticle: function (articleContent) { 434 readability.cleanStyles(articleContent); 435 readability.killBreaks(articleContent); 436 437 /* Clean out junk from the article content */ 438 readability.cleanConditionally(articleContent, "form"); 439 readability.clean(articleContent, "object"); 440 readability.clean(articleContent, "h1"); 441 442 /** 443 * If there is only one h2, they are probably using it 444 * as a header and not a subheader, so remove it since we already have a header. 445 ***/ 446 if(articleContent.getElementsByTagName('h2').length === 1) { 447 readability.clean(articleContent, "h2"); 448 } 449 readability.clean(articleContent, "iframe"); 450 451 readability.cleanHeaders(articleContent); 452 453 /* Do these last as the previous stuff may have removed junk that will affect these */ 454 readability.cleanConditionally(articleContent, "table"); 455 readability.cleanConditionally(articleContent, "ul"); 456 readability.cleanConditionally(articleContent, "div"); 457 458 /* Remove extra paragraphs */ 459 var articleParagraphs = articleContent.getElementsByTagName('p'); 460 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { 461 var imgCount = articleParagraphs[i].getElementsByTagName('img').length; 462 var embedCount = articleParagraphs[i].getElementsByTagName('embed').length; 463 var objectCount = articleParagraphs[i].getElementsByTagName('object').length; 464 465 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') { 466 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); 467 } 468 } 469 470 try { 471 readability.replaceBrsWithPs(articleContent); 472 } 473 catch (e) { 474 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e); 475 } 476 }, 477 478 /** 479 * Initialize a node with the readability object. Also checks the 480 * className/id for special names to add to its score. 481 * 482 * @param Element 483 * @return void 484 **/ 485 initializeNode: function (node) { 486 node.readability = {"contentScore": 0}; 487 488 switch(node.tagName) { 489 case 'DIV': 490 node.readability.contentScore += 5; 491 break; 492 493 case 'PRE': 494 case 'TD': 495 case 'BLOCKQUOTE': 496 node.readability.contentScore += 3; 497 break; 498 499 case 'ADDRESS': 500 case 'OL': 501 case 'UL': 502 case 'DL': 503 case 'DD': 504 case 'DT': 505 case 'LI': 506 case 'FORM': 507 node.readability.contentScore -= 3; 508 break; 509 510 case 'H1': 511 case 'H2': 512 case 'H3': 513 case 'H4': 514 case 'H5': 515 case 'H6': 516 case 'TH': 517 node.readability.contentScore -= 5; 518 break; 519 } 520 521 node.readability.contentScore += readability.getClassWeight(node); 522 }, 523 524 /*** 525 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 526 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 527 * 528 * @param page a document to run upon. Needs to be a full document, complete with body. 529 * @return Element 530 **/ 531 grabArticle: function (pageToClone) { 532 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS), 533 isPaging = (page !== null) ? true: false; 534 535 var page = null; 536 // Never work on the actual page. 537 if (isPaging) { 538 page = document.body.cloneNode(true); 539 } else { 540 page = pageToClone.cloneNode(true); 541 } 542 543 var allElements = page.getElementsByTagName('*'); 544 545 /** 546 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs 547 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) 548 * 549 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 550 * TODO: Shouldn't this be a reverse traversal? 551 **/ 552 var node = null; 553 var nodesToScore = []; 554 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { 555 /* Remove unlikely candidates */ 556 if (stripUnlikelyCandidates) { 557 var unlikelyMatchString = node.className + node.id; 558 if ( 559 ( 560 unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 && 561 unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 && 562 node.tagName !== "BODY" 563 ) 564 ) 565 { 566 dbg("Removing unlikely candidate - " + unlikelyMatchString); 567 node.parentNode.removeChild(node); 568 nodeIndex-=1; 569 continue; 570 } 571 } 572 573 if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") { 574 nodesToScore[nodesToScore.length] = node; 575 } 576 577 /* Turn all divs that don't have children block level elements into p's */ 578 if (node.tagName === "DIV") { 579 if (node.innerHTML.search(readability.regexps.divToPElements) === -1) { 580 var newNode = document.createElement('p'); 581 try { 582 readability.moveNodeInnards(node, newNode); 583 node.parentNode.replaceChild(newNode, node); 584 nodeIndex-=1; 585 586 nodesToScore[nodesToScore.length] = node; 587 } 588 catch(e) { 589 dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e); 590 } 591 } 592 else 593 { 594 /* EXPERIMENTAL */ 595 for(var i = 0, il = node.childNodes.length; i < il; i+=1) { 596 var childNode = node.childNodes[i]; 597 if(childNode.nodeType === 3) { // Node.TEXT_NODE 598 var p = document.createElement('p'); 599 var t = document.createTextNode(childNode.nodeValue); 600 p.appendChild(t); 601 p.style.display = 'inline'; 602 p.className = 'readability-styled'; 603 childNode.parentNode.replaceChild(p, childNode); 604 } 605 } 606 } 607 } 608 } 609 610 /** 611 * Loop through all paragraphs, and assign a score to them based on how content-y they look. 612 * Then add their score to their parent node. 613 * 614 * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. 615 **/ 616 var candidates = []; 617 for (var pt=0; pt < nodesToScore.length; pt+=1) { 618 var parentNode = nodesToScore[pt].parentNode; 619 var grandParentNode = parentNode ? parentNode.parentNode : null; 620 var innerText = readability.getInnerText(nodesToScore[pt]); 621 622 if(!parentNode || typeof(parentNode.tagName) === 'undefined') { 623 continue; 624 } 625 626 /* If this paragraph is less than 25 characters, don't even count it. */ 627 if(innerText.length < 25) { 628 continue; } 629 630 /* Initialize readability data for the parent. */ 631 if(typeof parentNode.readability === 'undefined') { 632 readability.initializeNode(parentNode); 633 candidates.push(parentNode); 634 } 635 636 /* Initialize readability data for the grandparent. */ 637 if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') { 638 readability.initializeNode(grandParentNode); 639 candidates.push(grandParentNode); 640 } 641 642 var contentScore = 0; 643 644 /* Add a point for the paragraph itself as a base. */ 645 contentScore+=1; 646 647 /* Add points for any commas within this paragraph */ 648 contentScore += innerText.split(',').length; 649 650 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ 651 contentScore += Math.min(Math.floor(innerText.length / 100), 3); 652 653 /* Add the score to the parent. The grandparent gets half. */ 654 parentNode.readability.contentScore += contentScore; 655 656 if(grandParentNode) { 657 grandParentNode.readability.contentScore += contentScore/2; 658 } 659 } 660 661 /** 662 * After we've calculated scores, loop through all of the possible candidate nodes we found 663 * and find the one with the highest score. 664 **/ 665 var topCandidate = null; 666 for(var c=0, cl=candidates.length; c < cl; c+=1) 667 { 668 /** 669 * Scale the final candidates score based on link density. Good content should have a 670 * relatively small link density (5% or less) and be mostly unaffected by this operation. 671 **/ 672 candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c])); 673 674 dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore); 675 676 if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) { 677 topCandidate = candidates[c]; } 678 } 679 680 /** 681 * If we still have no top candidate, just use the body as a last resort. 682 * We also have to copy the body node so it is something we can modify. 683 **/ 684 if (topCandidate === null || topCandidate.tagName === "BODY") 685 { 686 topCandidate = document.createElement("DIV"); 687 readability.replaceNodeInnards(page, topCandidate); 688 page.appendChild(topCandidate); 689 readability.initializeNode(topCandidate); 690 } 691 692 /** 693 * Now that we have the top candidate, look through its siblings for content that might also be related. 694 * Things like preambles, content split by ads that we removed, etc. 695 **/ 696 var articleContent = document.createElement("DIV"); 697 if (isPaging) { 698 articleContent.id = "readability-content"; 699 } 700 var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); 701 var siblingNodes = topCandidate.parentNode.childNodes; 702 703 704 for(var s=0, sl=siblingNodes.length; s < sl; s+=1) { 705 var siblingNode = siblingNodes[s]; 706 var append = false; 707 708 /** 709 * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList. 710 * Example of error visible here: http://www.esquire.com/features/honesty0707 711 **/ 712 if(!siblingNode) { 713 continue; 714 } 715 716 dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); 717 dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown')); 718 719 if(siblingNode === topCandidate) 720 { 721 append = true; 722 } 723 724 var contentBonus = 0; 725 /* Give a bonus if sibling nodes and top candidates have the example same classname */ 726 if(siblingNode.className === topCandidate.className && topCandidate.className !== "") { 727 contentBonus += topCandidate.readability.contentScore * 0.2; 728 } 729 730 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold) 731 { 732 append = true; 733 } 734 735 if(siblingNode.nodeName === "P") { 736 var linkDensity = readability.getLinkDensity(siblingNode); 737 var nodeContent = readability.getInnerText(siblingNode); 738 var nodeLength = nodeContent.length; 739 740 if(nodeLength > 80 && linkDensity < 0.25) 741 { 742 append = true; 743 } 744 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) 745 { 746 append = true; 747 } 748 } 749 750 if(append) { 751 dbg("Appending node: " + siblingNode); 752 753 var nodeToAppend = null; 754 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { 755 /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ 756 757 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); 758 nodeToAppend = document.createElement("DIV"); 759 try { 760 nodeToAppend.id = siblingNode.id; 761 readability.moveNodeInnards(siblingNode, nodeToAppend); 762 } 763 catch(er) { 764 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); 765 nodeToAppend = siblingNode; 766 s-=1; 767 sl-=1; 768 } 769 } else { 770 nodeToAppend = siblingNode; 771 s-=1; 772 sl-=1; 773 } 774 775 /* To ensure a node does not interfere with readability styles, remove its classnames */ 776 nodeToAppend.className = ""; 777 778 /* Append sibling and subtract from our list because it removes the node when you append to another node */ 779 articleContent.appendChild(nodeToAppend); 780 } 781 } 782 783 /** 784 * So we have all of the content that we need. Now we clean it up for presentation. 785 **/ 786 readability.distilledArticleContent = articleContent.cloneNode(true); 787 //readability.prepArticle(articleContent); 788 789 if (readability.curPageNum === 1) { 790 var newNode = document.createElement('div'); 791 newNode.id = "readability-page-1"; 792 newNode.setAttribute("class", "page"); 793 readability.moveNodeInnards(articleContent, newNode); 794 articleContent.appendChild(newNode); 795 } 796 797 /** 798 * Now that we've gone through the full algorithm, check to see if we got any meaningful content. 799 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher 800 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of 801 * finding the -right- content. 802 **/ 803 if(readability.getInnerText(articleContent, false).length < 250) { 804 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { 805 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); 806 return readability.grabArticle(document.body); 807 } 808 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { 809 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); 810 return readability.grabArticle(document.body); 811 } 812 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { 813 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); 814 return readability.grabArticle(document.body); 815 } else { 816 return null; 817 } 818 } 819 820 return articleContent; 821 }, 822 823 /** 824 * Removes script tags from the document. 825 * 826 * @param Element 827 **/ 828 removeScripts: function (doc) { 829 var scripts = doc.getElementsByTagName('script'); 830 for(var i = scripts.length-1; i >= 0; i-=1) 831 { 832 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) 833 { 834 scripts[i].nodeValue=""; 835 scripts[i].removeAttribute('src'); 836 if (scripts[i].parentNode) { 837 scripts[i].parentNode.removeChild(scripts[i]); 838 } 839 } 840 } 841 }, 842 843 /** 844 * Get the inner text of a node - cross browser compatibly. 845 * This also strips out any excess whitespace to be found. 846 * 847 * @param Element 848 * @return string 849 **/ 850 getInnerText: function (e, normalizeSpaces) { 851 var textContent = ""; 852 853 if(typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") { 854 return ""; 855 } 856 857 normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; 858 859 if (navigator.appName === "Microsoft Internet Explorer") { 860 textContent = e.innerText.replace( readability.regexps.trim, "" ); } 861 else { 862 textContent = e.textContent.replace( readability.regexps.trim, "" ); } 863 864 if(normalizeSpaces) { 865 return textContent.replace( readability.regexps.normalize, " "); } 866 else { 867 return textContent; } 868 }, 869 870 /** 871 * Get the number of times a string s appears in the node e. 872 * 873 * @param Element 874 * @param string - what to split on. Default is "," 875 * @return number (integer) 876 **/ 877 getCharCount: function (e,s) { 878 s = s || ","; 879 return readability.getInnerText(e).split(s).length-1; 880 }, 881 882 /** 883 * Remove the style attribute on every e and under. 884 * TODO: Test if getElementsByTagName(*) is faster. 885 * 886 * @param Element 887 * @return void 888 **/ 889 cleanStyles: function (e) { 890 e = e || document; 891 var cur = e.firstChild; 892 893 if(!e) { 894 return; } 895 896 // Remove any root styles, if we're able. 897 if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') { 898 e.removeAttribute('style'); } 899 900 // Go until there are no more child nodes 901 while ( cur !== null ) { 902 if ( cur.nodeType === 1 ) { 903 // Remove style attribute(s) : 904 if(cur.className !== "readability-styled") { 905 cur.removeAttribute("style"); 906 } 907 readability.cleanStyles( cur ); 908 } 909 cur = cur.nextSibling; 910 } 911 }, 912 913 /** 914 * Get the density of links as a percentage of the content 915 * This is the amount of text that is inside a link divided by the total text in the node. 916 * 917 * @param Element 918 * @return number (float) 919 **/ 920 getLinkDensity: function (e) { 921 var links = e.getElementsByTagName("a"); 922 var textLength = readability.getInnerText(e).length; 923 var linkLength = 0; 924 for(var i=0, il=links.length; i<il;i+=1) 925 { 926 linkLength += readability.getInnerText(links[i]).length; 927 } 928 929 return linkLength / textLength; 930 }, 931 932 /** 933 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. 934 * 935 * @author Dan Lacy 936 * @return string the base url 937 **/ 938 findBaseUrl: function () { 939 var noUrlParams = window.location.pathname.split("?")[0], 940 urlSlashes = noUrlParams.split("/").reverse(), 941 cleanedSegments = [], 942 possibleType = ""; 943 944 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { 945 var segment = urlSlashes[i]; 946 947 // Split off and save anything that looks like a file type. 948 if (segment.indexOf(".") !== -1) { 949 possibleType = segment.split(".")[1]; 950 951 /* If the type isn't alpha-only, it's probably not actually a file extension. */ 952 if(!possibleType.match(/[^a-zA-Z]/)) { 953 segment = segment.split(".")[0]; 954 } 955 } 956 957 /** 958 * EW-CMS specific segment replacement. Ugly. 959 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html 960 **/ 961 if(segment.indexOf(',00') !== -1) { 962 segment = segment.replace(',00', ''); 963 } 964 965 // If our first or second segment has anything looking like a page number, remove it. 966 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) { 967 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, ""); 968 } 969 970 971 var del = false; 972 973 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */ 974 if (i < 2 && segment.match(/^\d{1,2}$/)) { 975 del = true; 976 } 977 978 /* If this is the first segment and it's just "index", remove it. */ 979 if(i === 0 && segment.toLowerCase() === "index") { 980 del = true; 981 } 982 983 984 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */ 985 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { 986 del = true; 987 } 988 989 /* If it's not marked for deletion, push it to cleanedSegments. */ 990 if (!del) { 991 cleanedSegments.push(segment); 992 } 993 } 994 995 // This is our final, cleaned, base article URL. 996 return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/"); 997 }, 998 999 /** 1000 * Look for any paging links that may occur within the document. 1001 * 1002 * @param body 1003 * @return object (array) 1004 **/ 1005 findNextPageLink: function (elem) { 1006 var possiblePages = {}, 1007 allLinks = elem.getElementsByTagName('a'), 1008 articleBaseUrl = readability.findBaseUrl(); 1009 1010 /** 1011 * Loop through all links, looking for hints that they may be next-page links. 1012 * Things like having "page" in their textContent, className or id, or being a child 1013 * of a node with a page-y className or id. 1014 * 1015 * Also possible: levenshtein distance? longest common subsequence? 1016 * 1017 * After we do that, assign each page a score, and 1018 **/ 1019 for(var i = 0, il = allLinks.length; i < il; i+=1) { 1020 var link = allLinks[i], 1021 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ''); 1022 1023 /* If we've already seen this page, ignore it */ 1024 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) { 1025 continue; 1026 } 1027 1028 /* If it's on a different domain, skip it. */ 1029 if(window.location.host !== linkHref.split(/\/+/g)[1]) { 1030 continue; 1031 } 1032 1033 var linkText = readability.getInnerText(link); 1034 1035 /* If the linkText looks like it's not the next page, skip it. */ 1036 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) { 1037 continue; 1038 } 1039 1040 /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */ 1041 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); 1042 if(!linkHrefLeftover.match(/\d/)) { 1043 continue; 1044 } 1045 1046 if(!(linkHref in possiblePages)) { 1047 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; 1048 } else { 1049 possiblePages[linkHref].linkText += ' | ' + linkText; 1050 } 1051 1052 var linkObj = possiblePages[linkHref]; 1053 1054 /** 1055 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower. 1056 * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html 1057 **/ 1058 if(linkHref.indexOf(articleBaseUrl) !== 0) { 1059 linkObj.score -= 25; 1060 } 1061 1062 var linkData = linkText + ' ' + link.className + ' ' + link.id; 1063 if(linkData.match(readability.regexps.nextLink)) { 1064 linkObj.score += 50; 1065 } 1066 if(linkData.match(/pag(e|ing|inat)/i)) { 1067 linkObj.score += 25; 1068 } 1069 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, 1070 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */ 1071 if(!linkObj.linkText.match(readability.regexps.nextLink)) { 1072 linkObj.score -= 65; 1073 } 1074 } 1075 if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) { 1076 linkObj.score -= 50; 1077 } 1078 if(linkData.match(readability.regexps.prevLink)) { 1079 linkObj.score -= 200; 1080 } 1081 1082 /* If a parentNode contains page or paging or paginat */ 1083 var parentNode = link.parentNode, 1084 positiveNodeMatch = false, 1085 negativeNodeMatch = false; 1086 while(parentNode) { 1087 var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id; 1088 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) { 1089 positiveNodeMatch = true; 1090 linkObj.score += 25; 1091 } 1092 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) { 1093 /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */ 1094 if(!parentNodeClassAndId.match(readability.regexps.positive)) { 1095 linkObj.score -= 25; 1096 negativeNodeMatch = true; 1097 } 1098 } 1099 1100 parentNode = parentNode.parentNode; 1101 } 1102 1103 /** 1104 * If the URL looks like it has paging in it, add to the score. 1105 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 1106 **/ 1107 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) { 1108 linkObj.score += 25; 1109 } 1110 1111 /* If the URL contains negative values, give a slight decrease. */ 1112 if (linkHref.match(readability.regexps.extraneous)) { 1113 linkObj.score -= 15; 1114 } 1115 1116 /** 1117 * Minor punishment to anything that doesn't match our current URL. 1118 * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points. 1119 * Dan, can you show me a counterexample where this is necessary? 1120 * if (linkHref.indexOf(window.location.href) !== 0) { 1121 * linkObj.score -= 1; 1122 * } 1123 **/ 1124 1125 /** 1126 * If the link text can be parsed as a number, give it a minor bonus, with a slight 1127 * bias towards lower numbered pages. This is so that pages that might not have 'next' 1128 * in their text can still get scored, and sorted properly by score. 1129 **/ 1130 var linkTextAsNumber = parseInt(linkText, 10); 1131 if(linkTextAsNumber) { 1132 // Punish 1 since we're either already there, or it's probably before what we want anyways. 1133 if (linkTextAsNumber === 1) { 1134 linkObj.score -= 10; 1135 } 1136 else { 1137 // Todo: Describe this better 1138 linkObj.score += Math.max(0, 10 - linkTextAsNumber); 1139 } 1140 } 1141 } 1142 1143 /** 1144 * Loop thrugh all of our possible pages from above and find our top candidate for the next page URL. 1145 * Require at least a score of 50, which is a relatively high confidence that this page is the next link. 1146 **/ 1147 var topPage = null; 1148 for(var page in possiblePages) { 1149 if(possiblePages.hasOwnProperty(page)) { 1150 if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) { 1151 topPage = possiblePages[page]; 1152 } 1153 } 1154 } 1155 1156 if(topPage) { 1157 var nextHref = topPage.href.replace(/\/$/,''); 1158 1159 dbg('NEXT PAGE IS ' + nextHref); 1160 readability.parsedPages[nextHref] = true; 1161 return nextHref; 1162 } 1163 else { 1164 return null; 1165 } 1166 }, 1167 1168 createLinkDiv: function(link) { 1169 var divNode = document.createElement('div'); 1170 var aNode = document.createElement('a'); 1171 var tNode = document.createTextNode('View Next Page'); 1172 divNode.setAttribute('style', 'text-align: center'); 1173 aNode.setAttribute('href', link); 1174 aNode.appendChild(tNode); 1175 divNode.appendChild(aNode); 1176 return divNode; 1177 }, 1178 1179 xhr: function () { 1180 if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) { 1181 return new XMLHttpRequest(); 1182 } 1183 else { 1184 try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { } 1185 try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { } 1186 try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { } 1187 } 1188 1189 return false; 1190 }, 1191 1192 successfulRequest: function (request) { 1193 return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText); 1194 }, 1195 1196 ajax: function (url, options) { 1197 var request = readability.xhr(); 1198 1199 function respondToReadyState(readyState) { 1200 if (request.readyState === 4) { 1201 if (readability.successfulRequest(request)) { 1202 if (options.success) { options.success(request); } 1203 } 1204 else { 1205 if (options.error) { options.error(request); } 1206 } 1207 } 1208 } 1209 1210 if (typeof options === 'undefined') { options = {}; } 1211 1212 request.onreadystatechange = respondToReadyState; 1213 1214 request.open('get', url, true); 1215 request.setRequestHeader('Accept', 'text/html'); 1216 1217 try { 1218 request.send(options.postBody); 1219 } 1220 catch (e) { 1221 if (options.error) { options.error(); } 1222 } 1223 1224 return request; 1225 }, 1226 1227 /** 1228 * Make an AJAX request for each page and append it to the document. 1229 **/ 1230 curPageNum: 1, 1231 1232 appendNextPage: function (nextPageLink) { 1233 readability.curPageNum+=1; 1234 1235 var articlePage = document.createElement("DIV"); 1236 articlePage.id = 'readability-page-' + readability.curPageNum; 1237 articlePage.className = 'page'; 1238 articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">§</p>'; 1239 1240 document.getElementById("readability-content").appendChild(articlePage); 1241 1242 if(readability.curPageNum > readability.maxPages) { 1243 var linkDiv = readability.createLinkDiv(nextPageLink); 1244 1245 articlePage.appendChild(linkDiv); 1246 return; 1247 } 1248 1249 /** 1250 * Now that we've built the article page DOM element, get the page content 1251 * asynchronously and load the cleaned content into the div we created for it. 1252 **/ 1253 (function(pageUrl, thisPage) { 1254 readability.ajax(pageUrl, { 1255 success: function(r) { 1256 1257 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ 1258 var eTag = r.getResponseHeader('ETag'); 1259 if(eTag) { 1260 if(eTag in readability.pageETags) { 1261 dbg("Exact duplicate page found via ETag. Aborting."); 1262 articlePage.style.display = 'none'; 1263 return; 1264 } else { 1265 readability.pageETags[eTag] = 1; 1266 } 1267 } 1268 1269 // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. 1270 var page = document.createElement("DIV"); 1271 1272 /** 1273 * Do some preprocessing to our HTML to make it ready for appending. 1274 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. 1275 * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript. 1276 * • Turn all double br's into p's - was handled by prepDocument in the original view. 1277 * Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages. 1278 **/ 1279 var pageInnards = r.responseXML; 1280 readability.removeScripts(pageInnards); 1281 readability.replaceNoscriptsWithPs(pageInnards); 1282 readability.replaceDoubleBrsWithPs(pageInnards); 1283 readability.replaceFontsWithSpans(pageInnards); 1284 page.appendChild(pageInnards); 1285 1286 1287 /** 1288 * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle. 1289 **/ 1290 readability.flags = 0x1 | 0x2 | 0x4; 1291 1292 var nextPageLink = readability.findNextPageLink(page), 1293 content = readability.grabArticle(page); 1294 1295 if(!content) { 1296 dbg("No content found in page to append. Aborting."); 1297 return; 1298 } 1299 1300 /** 1301 * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page. 1302 * Compare it against all of the the previous document's we've gotten. If the previous 1303 * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate. 1304 **/ 1305 var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null; 1306 if(firstP && firstP.innerHTML.length > 100) { 1307 for(var i=1; i <= readability.curPageNum; i+=1) { 1308 var rPage = document.getElementById('readability-page-' + i); 1309 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) { 1310 dbg('Duplicate of page ' + i + ' - skipping.'); 1311 articlePage.style.display = 'none'; 1312 readability.parsedPages[pageUrl] = true; 1313 return; 1314 } 1315 } 1316 } 1317 1318 readability.removeScripts(content); 1319 1320 readability.moveNodeInnards(content, thisPage); 1321 1322 /** 1323 * After the page has rendered, post process the content. This delay is necessary because, 1324 * in webkit at least, offsetWidth is not set in time to determine image width. We have to 1325 * wait a little bit for reflow to finish before we can fix floating images. 1326 **/ 1327 window.setTimeout( 1328 function() { readability.postProcessContent(thisPage); }, 1329 500 1330 ); 1331 1332 if(nextPageLink) { 1333 readability.appendNextPage(nextPageLink); 1334 } 1335 } 1336 }); 1337 }(nextPageLink, articlePage)); 1338 }, 1339 1340 /** 1341 * Get an elements class/id weight. Uses regular expressions to tell if this 1342 * element looks good or bad. 1343 * 1344 * @param Element 1345 * @return number (Integer) 1346 **/ 1347 getClassWeight: function (e) { 1348 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { 1349 return 0; 1350 } 1351 1352 var weight = 0; 1353 1354 /* Look for a special classname */ 1355 if (typeof(e.className) === 'string' && e.className !== '') 1356 { 1357 if(e.className.search(readability.regexps.negative) !== -1) { 1358 weight -= 25; } 1359 1360 if(e.className.search(readability.regexps.positive) !== -1) { 1361 weight += 25; } 1362 } 1363 1364 /* Look for a special ID */ 1365 if (typeof(e.id) === 'string' && e.id !== '') 1366 { 1367 if(e.id.search(readability.regexps.negative) !== -1) { 1368 weight -= 25; } 1369 1370 if(e.id.search(readability.regexps.positive) !== -1) { 1371 weight += 25; } 1372 } 1373 1374 return weight; 1375 }, 1376 1377 nodeIsVisible: function (node) { 1378 return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none'; 1379 }, 1380 1381 /** 1382 * Remove extraneous break tags from a node. 1383 * 1384 * @param Element 1385 * @return void 1386 **/ 1387 killBreaks: function (e) { 1388 var allElements = e.getElementsByTagName('*'); 1389 while (i < allElements.length) { 1390 readability.deleteExtraBreaks(allElements[i]); 1391 i++; 1392 } 1393 }, 1394 1395 /** 1396 * Clean a node of all elements of type "tag". 1397 * (Unless it's a youtube/vimeo video. People love movies.) 1398 * 1399 * @param Element 1400 * @param string tag to clean 1401 * @return void 1402 **/ 1403 clean: function (e, tag) { 1404 var targetList = e.getElementsByTagName( tag ); 1405 var isEmbed = (tag === 'object' || tag === 'embed'); 1406 1407 for (var y=targetList.length-1; y >= 0; y-=1) { 1408 /* Allow youtube and vimeo videos through as people usually want to see those. */ 1409 if(isEmbed) { 1410 var attributeValues = ""; 1411 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { 1412 attributeValues += targetList[y].attributes[i].value + '|'; 1413 } 1414 1415 /* First, check the elements attributes to see if any of them contain youtube or vimeo */ 1416 if (attributeValues.search(readability.regexps.videos) !== -1) { 1417 continue; 1418 } 1419 1420 /* Then check the elements inside this element for the same. */ 1421 if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) { 1422 continue; 1423 } 1424 1425 } 1426 1427 targetList[y].parentNode.removeChild(targetList[y]); 1428 } 1429 }, 1430 1431 /** 1432 * Clean an element of all tags of type "tag" if they look fishy. 1433 * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. 1434 * 1435 * @return void 1436 **/ 1437 cleanConditionally: function (e, tag) { 1438 1439 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { 1440 return; 1441 } 1442 1443 var tagsList = e.getElementsByTagName(tag); 1444 var curTagsLength = tagsList.length; 1445 1446 /** 1447 * Gather counts for other typical elements embedded within. 1448 * Traverse backwards so we can remove nodes at the same time without effecting the traversal. 1449 * 1450 * TODO: Consider taking into account original contentScore here. 1451 **/ 1452 for (var i=curTagsLength-1; i >= 0; i-=1) { 1453 var weight = readability.getClassWeight(tagsList[i]); 1454 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; 1455 1456 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : '')); 1457 1458 if(weight+contentScore < 0) 1459 { 1460 tagsList[i].parentNode.removeChild(tagsList[i]); 1461 } 1462 else if ( readability.getCharCount(tagsList[i],',') < 10) { 1463 /** 1464 * If there are not very many commas, and the number of 1465 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. 1466 **/ 1467 var p = tagsList[i].getElementsByTagName("p").length; 1468 var img = tagsList[i].getElementsByTagName("img").length; 1469 var li = tagsList[i].getElementsByTagName("li").length-100; 1470 var input = tagsList[i].getElementsByTagName("input").length; 1471 1472 var embedCount = 0; 1473 var embeds = tagsList[i].getElementsByTagName("embed"); 1474 for(var ei=0,il=embeds.length; ei < il; ei+=1) { 1475 if (embeds[ei].src.search(readability.regexps.videos) === -1) { 1476 embedCount+=1; 1477 } 1478 } 1479 1480 var linkDensity = readability.getLinkDensity(tagsList[i]); 1481 var contentLength = readability.getInnerText(tagsList[i]).length; 1482 var toRemove = false; 1483 1484 if ( img > p ) { 1485 toRemove = true; 1486 } else if(li > p && tag !== "ul" && tag !== "ol") { 1487 toRemove = true; 1488 } else if( input > Math.floor(p/3) ) { 1489 toRemove = true; 1490 } else if(contentLength < 25 && (img === 0 || img > 2) ) { 1491 toRemove = true; 1492 } else if(weight < 25 && linkDensity > 0.2) { 1493 toRemove = true; 1494 } else if(weight >= 25 && linkDensity > 0.5) { 1495 toRemove = true; 1496 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) { 1497 toRemove = true; 1498 } 1499 1500 if(toRemove) { 1501 tagsList[i].parentNode.removeChild(tagsList[i]); 1502 } 1503 } 1504 } 1505 }, 1506 1507 /** 1508 * Clean out spurious headers from an Element. Checks things like classnames and link density. 1509 * 1510 * @param Element 1511 * @return void 1512 **/ 1513 cleanHeaders: function (e) { 1514 for (var headerIndex = 1; headerIndex < 3; headerIndex+=1) { 1515 var headers = e.getElementsByTagName('h' + headerIndex); 1516 for (var i=headers.length-1; i >=0; i-=1) { 1517 if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) { 1518 headers[i].parentNode.removeChild(headers[i]); 1519 } 1520 } 1521 } 1522 }, 1523 1524 flagIsActive: function(flag) { 1525 return (readability.flags & flag) > 0; 1526 }, 1527 1528 addFlag: function(flag) { 1529 readability.flags = readability.flags | flag; 1530 }, 1531 1532 removeFlag: function(flag) { 1533 readability.flags = readability.flags & ~flag; 1534 }, 1535 1536 // Removes the children of |src| and appends them to |dest|. 1537 moveNodeInnards: function(src, dest) { 1538 try { 1539 while (src.firstChild) { 1540 dest.appendChild(src.removeChild(src.firstChild)); 1541 } 1542 } catch (e) {} 1543 }, 1544 1545 // Returns true if the node is a whitespace text node. 1546 isWhitespaceNode: function(node) { 1547 if (node.nodeType == Node.TEXT_NODE) { 1548 if (node.data.trim().length == 0) { 1549 return true; 1550 } 1551 } 1552 return false; 1553 }, 1554 1555 // Returns true if the node is a <BR>. 1556 isBrNode: function(node) { 1557 return (node.tagName === 'BR'); 1558 }, 1559 1560 1561 // Returns the last <BR> node in a sequence of <BR> nodes that are only 1562 // separated by whitespace, or null if there are not at least two <BR> tags 1563 // in the sibling chain starting with |node|. Returns the second such <BR> 1564 // node if |restrictToTwo| is true. 1565 isMultipleBr: function(node, restrictToTwo) { 1566 var lastBr = null; 1567 if (!readability.isBrNode(node)) { 1568 return lastBr; 1569 } 1570 var curr = node.nextSibling; 1571 while (curr) { 1572 if (readability.isWhitespaceNode(curr) || readability.isBrNode(curr)) { 1573 lastBr = curr; 1574 curr = curr.nextSibling; 1575 if (restrictToTwo) { 1576 if (readability.isBrNode(lastBr)) { 1577 return lastBr; 1578 } 1579 } 1580 continue; 1581 } 1582 break; 1583 } 1584 return lastBr; 1585 }, 1586 1587 // Removes all <BR> nodes except one and whitespace in between in a series 1588 // of <BR> nodes. 1589 deleteExtraBreaks: function(node) { 1590 var lastBr = readability.isMultipleBr(node, false); 1591 var ret = false; 1592 while (lastBr && lastBr != node) { 1593 var toRemove = lastBr; 1594 lastBr = lastBr.previousSibling; 1595 toRemove.parentNode.removeChild(toRemove); 1596 ret = true; 1597 } 1598 return ret; 1599 }, 1600 1601 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a 1602 // <P> node, and makes all next siblings of that pair children of <P>, up 1603 // until the next pair of <BR> nodes is reached. 1604 replaceDoubleBrWithP: function(node) { 1605 // Check that we are starting with a BR. 1606 var second = readability.isMultipleBr(node, true); 1607 if (!second) { 1608 return; 1609 } 1610 // Make all next siblings of the second BR into children of a P. 1611 var p = document.createElement('p'); 1612 var curr = second.nextSibling; 1613 while (curr) { 1614 if (readability.isMultipleBr(curr, true)) { 1615 break; 1616 } 1617 var next = curr.nextSibling; 1618 p.appendChild(curr.parentNode.removeChild(curr)); 1619 curr = next; 1620 } 1621 var ret = curr; 1622 1623 // Remove all nodes between the first and second BR. 1624 curr = node.nextSibling; 1625 while (curr && curr != second) { 1626 var next = curr.nextSibling; 1627 curr.parentNode.removeChild(curr); 1628 curr = next; 1629 } 1630 // Remove the second BR. 1631 second.parentNode.removeChild(second); 1632 // Replace the first BR with the P. 1633 node.parentNode.replaceChild(p, node); 1634 1635 return ret; 1636 }, 1637 1638 // Returns true if the NodeList contains a double <BR>. 1639 hasDoubleBr: function(nodeList) { 1640 for (var i = 0; i < nodeList.length; nodeList++) { 1641 if (readability.isMultipleBr(nodeList[i], true)) { 1642 return true; 1643 } 1644 } 1645 return false; 1646 }, 1647 1648 // Replaces double <BR> tags with <P> tags. 1649 replaceDoubleBrsWithPs: function(node) { 1650 var allElements = node.getElementsByTagName('BR'); 1651 var node = null; 1652 while (allElements && allElements.length > 0 && 1653 readability.hasDoubleBr(allElements)) { 1654 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) { 1655 var next = node; 1656 while (next = readability.replaceDoubleBrWithP(next)); 1657 } 1658 allElements = document.body.getElementsByTagName('BR'); 1659 } 1660 }, 1661 1662 1663 // Replaces a BR and the whitespace that follows it with a P. 1664 replaceBrWithP: function(node) { 1665 if (!readability.isBrNode(node)) { 1666 return; 1667 } 1668 var p = document.createElement('p'); 1669 var curr = node.nextSibling; 1670 while (curr && !isBrNode(curr)) { 1671 var next = curr.nextSibling; 1672 if (readability.isWhitespaceNode(curr)) { 1673 curr.parentNode.removeChild(curr); 1674 } else { 1675 p.appendChild(curr.parentNode.removeChild(curr)); 1676 } 1677 curr = next; 1678 } 1679 node.parentNode.replaceChild(p, node); 1680 return curr; 1681 }, 1682 1683 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag 1684 // children of the <P>. 1685 replaceBrsWithPs: function(node) { 1686 var allElements = node.getElementsByTagName('BR'); 1687 var node = null; 1688 while (allElements && allElements.length > 0) { 1689 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) { 1690 var next = node; 1691 while (next = readability.replaceBrWithP(next)); 1692 } 1693 allElements = document.body.getElementsByTagName('BR'); 1694 } 1695 }, 1696 1697 // Replaces any tag with any other tag. 1698 replaceTagsWithTags: function(node, srcTag, destTag) { 1699 var allElements = node.getElementsByTagName(srcTag); 1700 for (var i = 0; i < allElements.length; i++) { 1701 var dest = document.createElement(destTag); 1702 readability.moveNodeInnards(allElements[i], dest); 1703 allElements[i].parentNode.replaceChild(dest, allElements[i]); 1704 } 1705 }, 1706 1707 // Replaces all <noscript> tags with <p> tags. 1708 replaceNoscriptsWithPs: function(node) { 1709 readability.replaceTagsWithTags(node, 'noscript', 'p'); 1710 }, 1711 1712 // Replaces all <font> tags with <span> tags. 1713 replaceFontsWithSpans: function(node) { 1714 readability.replaceTagsWithTags(node, 'font', 'span'); 1715 }, 1716 1717 // Returns a list of image URLs in the distilled article. 1718 getImages : function() { 1719 var images = document.getElementsByTagName('img'); 1720 var result = new Array(images.length); 1721 dbg("Number of images: " + images.length); 1722 for(i = 0; i < images.length; i++) { 1723 result[i] = images[i].src; 1724 dbg("Image: " + result[i]); 1725 } 1726 return result; 1727 }, 1728 1729 // Returns the distilled article HTML from the page(s). 1730 getDistilledArticleHTML : function() { 1731 return readability.distilledHTML; 1732 }, 1733 1734 // Returns the next page of this article. 1735 getNextPageLink : function() { 1736 return readability.nextPageLink; 1737 } 1738}; 1739