1a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)// Copyright 2014 The Chromium Authors. All rights reserved.
2a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
3a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)// found in the LICENSE file.
4a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
5a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)// Local modifications to this file are described in the README.chromium
6a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)// file.
7a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
/**
 * Debug logger.
 *
 * When a global `console` exists, logs the message prefixed with
 * "Readability: "; otherwise it is a no-op, so callers can log
 * unconditionally without checking for console support.
 *
 * @param s  value to log (string-concatenated onto the prefix)
 * @return void
 **/
var dbg = (typeof console !== 'undefined') ? function(s) {
    console.log("Readability: " + s);
} : function() {};
11a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
12a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)/*
13a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Readability. An Arc90 Lab Experiment.
14a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Website: http://lab.arc90.com/experiments/readability
15a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Source:  http://code.google.com/p/arc90labs-readability
16a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) *
17a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
18a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) *
19a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Copyright (c) 2010 Arc90 Inc
20a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) * Readability is licensed under the Apache License, Version 2.0.
21a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)**/
22a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)var readability = {
23a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    readStyle: "style-newspaper",
24a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    readSize: "size-medium",
25a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    readMargin: "margin-wide",
26a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
27a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    distilledHTML: '',
28a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    distilledArticleContent: null,
29a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    nextPageLink: '',
30a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
31a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    version:                '1.7.1',
32a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    iframeLoads:             0,
33a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    convertLinksToFootnotes: false,
34a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    reversePageScroll:       false, /* If they hold shift and hit space, scroll up */
35a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    frameHack:               false, /**
36a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                                      * The frame hack is to workaround a firefox bug where if you
37a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                                      * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
38a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                                      * So we fake a scrollbar in the wrapping div.
39a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                                     **/
40a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    biggestFrame:            false,
41a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    flags:                   0x1 | 0x2 | 0x4,   /* Start with all flags set. */
42a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
43a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /* constants */
44a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    FLAG_STRIP_UNLIKELYS:     0x1,
45a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    FLAG_WEIGHT_CLASSES:      0x2,
46a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    FLAG_CLEAN_CONDITIONALLY: 0x4,
47a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
48a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    maxPages:    30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
49a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
50a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    pageETags:   {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */
51a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
52a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
53a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * All of the regular expressions in use within readability.
54a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Defined up here so we don't instantiate them repeatedly in loops.
55a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     **/
56a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    regexps: {
57a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        unlikelyCandidates:    /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
58a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        okMaybeItsACandidate:  /and|article|body|column|main|shadow/i,
59a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        positive:              /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
60a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        negative:              /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
61a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        extraneous:            /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
62a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        divToPElements:        /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
63a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        replaceBrs:            /(<br[^>]*>[ \n\r\t]*){2,}/gi,
64a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        replaceFonts:          /<(\/?)font[^>]*>/gi,
65a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        trim:                  /^\s+|\s+$/g,
66a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        normalize:             /\s{2,}/g,
67a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        killBreaks:            /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
68a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        videos:                /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
69a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
70a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        nextLink:              /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
71a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        prevLink:              /(prev|earl|old|new|<|«)/i
72a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
73a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
74a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
75a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Runs readability.
76a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
77a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Workflow:
78a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *  1. Prep the document by removing script tags, css, etc.
79a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *  2. Build readability's DOM tree.
80a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *  3. Grab the article content from the current dom tree.
81a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *  4. Replace the current DOM tree with the new one.
82a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *  5. Read peacefully.
83a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
84a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return void
85a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     **/
86a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    init: function() {
87a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Before we do anything, remove all scripts that are not readability. */
88a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        window.onload = window.onunload = function() {};
89a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
90a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.removeScripts(document);
91a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
92a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
93a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
94a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
95a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Pull out any possible next page link first */
96a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.nextPageLink = readability.findNextPageLink(document.body);
97a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
98a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* We handle processing of nextPage from C++ set nextPageLink to null */
99a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var nextPageLink = null;
100a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
101a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.prepDocument();
102a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
103a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Build readability's DOM tree */
104a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var overlay        = document.createElement("DIV");
105a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var innerDiv       = document.createElement("DIV");
106a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var articleTools   = readability.getArticleTools();
107a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var articleTitleText   = readability.getArticleTitle();
108a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var articleContent = readability.grabArticle();
109a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
110a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(!articleContent) {
111a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            articleContent    = document.createElement("DIV");
112a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            articleContent.id = "readability-content";
113a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            articleContent.innerHTML = [
114a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p>",
115a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                (readability.frameHack ? "<p><strong>It appears this page uses frames.</strong> Unfortunately, browser security properties often cause Readability to fail on pages that include frames." : ""),
116a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                "<p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>"
117a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            ].join('');
118a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
119a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            nextPageLink = null;
120a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
121a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
122a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        overlay.id              = "readOverlay";
123a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        innerDiv.id             = "readInner";
124a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
125a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Apply user-selected styling */
126a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        document.body.className = readability.readStyle;
127a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        document.dir            = readability.getSuggestedDirection(articleTitleText);
128a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
129a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (readability.readStyle === "style-athelas" || readability.readStyle === "style-apertura"){
130a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            overlay.className = readability.readStyle + " rdbTypekit";
131a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        } else {
132a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            overlay.className = readability.readStyle;
133a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
134a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        innerDiv.className    = readability.readMargin + " " + readability.readSize;
135a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
136a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) {
137a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            readability.convertLinksToFootnotes = true;
138a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
139a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
140a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.distilledHTML = articleContent.innerHTML;
141a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
142a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(readability.frameHack) {
143a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var readOverlay = document.getElementById('readOverlay');
144a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            readOverlay.style.height = '100%';
145a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            readOverlay.style.overflow = 'auto';
146a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
147a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
148a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
149a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * If someone tries to use Readability on a site's root page, give them a warning about usage.
150a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        **/
151a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if((window.location.protocol + "//" + window.location.host + "/") === window.location.href) {
152a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            articleContent.style.display = "none";
153a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var rootWarning = document.createElement('p');
154a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                rootWarning.id = "readability-warning";
155a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +
156a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";
157a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
158a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            innerDiv.insertBefore( rootWarning, articleContent );
159a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
160a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
161a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.postProcessContent(articleContent);
162a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
163a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        window.scrollTo(0, 0);
164a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
165a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (nextPageLink) {
166a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /**
167a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * Append any additional pages after a small timeout so that people
168a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * can start reading without having to wait for this to finish processing.
169a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            **/
170a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            window.setTimeout(function() {
171a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                readability.appendNextPage(nextPageLink);
172a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }, 500);
173a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
174a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
175a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /** Smooth scrolling **/
176a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        document.onkeydown = function(e) {
177a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var code = (window.event) ? event.keyCode : e.keyCode;
178a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (code === 16) {
179a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                readability.reversePageScroll = true;
180a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                return;
181a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
182a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
183a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (code === 32) {
184a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                readability.curScrollStep = 0;
185a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight);
186a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
187a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if(readability.reversePageScroll) {
188a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);
189a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
190a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                else {
191a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);
192a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
193a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
194a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                return false;
195a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
196a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        };
197a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
198a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        document.onkeyup = function(e) {
199a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var code = (window.event) ? event.keyCode : e.keyCode;
200a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (code === 16) {
201a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                readability.reversePageScroll = false;
202a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                return;
203a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
204a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        };
205a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
206a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
207a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
208a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Run any post-process modifications to article content as necessary.
209a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
210a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param Element
211a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return void
212a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
213a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    postProcessContent: function(articleContent) {
214a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) {
215a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            readability.addFootnotes(articleContent);
216a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
217a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
218a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.fixImageFloats(articleContent);
219a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
220a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
221a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
222a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Some content ends up looking ugly if the image is too large to be floated.
223a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * If the image is wider than a threshold (currently 55%), no longer float it,
224a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * center it instead.
225a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
226a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param Element
227a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return void
228a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
229a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    fixImageFloats: function (articleContent) {
230a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55,
231a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            images              = articleContent.getElementsByTagName('img');
232a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
233a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for(var i=0, il = images.length; i < il; i+=1) {
234a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var image = images[i];
235a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
236a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(image.offsetWidth > imageWidthThreshold) {
237a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                image.className += " blockImage";
238a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
239a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
240a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
241a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
242a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
243a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Get the article tools Element that has buttons like reload, print.
244a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
     * @return Element  the newly created DIV (id "readTools") holding the tool links
246a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     **/
247a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    getArticleTools: function () {
248a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var articleTools = document.createElement("DIV");
249a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
250a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        articleTools.id        = "readTools";
251a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        articleTools.innerHTML =
252a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
253a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
254a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";
255a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
256a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return articleTools;
257a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
258a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
259a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
     * Returns the suggested direction of the string
261a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
262a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return "rtl" || "ltr"
263a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     **/
264a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    getSuggestedDirection: function(text) {
265a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        function sanitizeText() {
266a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return text.replace(/@\w+/, "");
267a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
268a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
269a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        function countMatches(match) {
270a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var matches = text.match(new RegExp(match, "g"));
271a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return matches !== null ? matches.length : 0;
272a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
273a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
274a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        function isRTL() {
275a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var count_heb =  countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
276a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var count_arb =  countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");
277a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
278a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            // if 20% of chars are Hebrew or Arbic then direction is rtl
279a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return  (count_heb + count_arb) * 100 / text.length > 20;
280a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
281a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
282a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        text  = sanitizeText(text);
283a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return isRTL() ? "rtl" : "ltr";
284a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
285a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
286a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
287a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Get the article title as an H1.
288a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
289a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return void
290a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     **/
291a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    getArticleTitle: function () {
292a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var curTitle = "",
293a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            origTitle = "";
294a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
295a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        try {
296a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            curTitle = origTitle = document.title;
297a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */
298a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
299a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
300a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
301a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        catch(e) {}
302a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
303a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(curTitle.match(/ [\|\-] /))
304a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        {
305a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
306a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
307a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(curTitle.split(' ').length < 3) {
308a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
309a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
310a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
311a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        else if(curTitle.indexOf(': ') !== -1)
312a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        {
313a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
314a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
315a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(curTitle.split(' ').length < 3) {
316a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
317a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
318a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
319a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        else if(curTitle.length > 150 || curTitle.length < 15)
320a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        {
321a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var hOnes = document.getElementsByTagName('h1');
322a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(hOnes.length === 1)
323a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            {
324a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                curTitle = readability.getInnerText(hOnes[0]);
325a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
326a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
327a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
328a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        curTitle = curTitle.replace( readability.regexps.trim, "" );
329a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
330a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(curTitle.split(' ').length <= 4) {
331a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            curTitle = origTitle;
332a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
333a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return curTitle;
334a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
335a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
336a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
337a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Prepare the HTML document for readability to scrape it.
338a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * This includes things like stripping javascript, CSS, and handling terrible markup.
339a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
340a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return void
341a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     **/
342a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    prepDocument: function () {
343a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
344a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * In some cases a body element can't be found (if the HTML is totally hosed for example)
345a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * so we create a new body node and append it to the document.
346a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         */
347a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(document.body === null)
348a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        {
349a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var body = document.createElement("body");
350a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            try {
351a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                document.body = body;
352a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
353a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            catch(e) {
354a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                document.documentElement.appendChild(body);
355a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                dbg(e);
356a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
357a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
358a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
359a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        document.body.id = "readabilityBody";
360a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
361a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var frames = document.getElementsByTagName('frame');
362a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(frames.length > 0)
363a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        {
364a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var bestFrame = null;
365a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var bestFrameSize = 0;    /* The frame to try to run readability upon. Must be on same domain. */
366a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. */
367a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            for(var frameIndex = 0; frameIndex < frames.length; frameIndex+=1)
368a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            {
369a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight;
370a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var canAccessFrame = false;
371a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                try {
372a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    var frameBody = frames[frameIndex].contentWindow.document.body;
373a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    canAccessFrame = true;
374a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
375a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                catch(eFrames) {
376a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    dbg(eFrames);
377a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
378a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
379a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if(frameSize > biggestFrameSize) {
380a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    biggestFrameSize         = frameSize;
381a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    readability.biggestFrame = frames[frameIndex];
382a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
383a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
384a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if(canAccessFrame && frameSize > bestFrameSize)
385a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                {
386a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    readability.frameHack = true;
387a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
388a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    bestFrame = frames[frameIndex];
389a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    bestFrameSize = frameSize;
390a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
391a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
392a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
393a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(bestFrame)
394a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            {
395a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var newBody = document.createElement('body');
396a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody);
397a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                newBody.style.overflow = 'scroll';
398a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                document.body = newBody;
399a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
400a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var frameset = document.getElementsByTagName('frameset')[0];
401a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if(frameset) {
402a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    frameset.parentNode.removeChild(frameset); }
403a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
404a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
405a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
406a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Remove all stylesheets */
407a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for (var k=0;k < document.styleSheets.length; k+=1) {
408a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) {
409a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                document.styleSheets[k].disabled = true;
410a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
411a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
412a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
413a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */
414a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var styleTags = document.getElementsByTagName("style");
415a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for (var st=0;st < styleTags.length; st+=1) {
416a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            styleTags[st].textContent = "";
417a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
418a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
419a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Turn all double br's into p's */
420a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
421a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.replaceDoubleBrsWithPs(document.body);
422a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.replaceFontsWithSpans(document.body);
423a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
424a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
425a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
426a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
427a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Prepare the article node for display. Clean out any inline styles,
428a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * iframes, forms, strip extraneous <p> tags, etc.
429a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
430a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param Element
431a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return void
432a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     **/
433a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    prepArticle: function (articleContent) {
434a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.cleanStyles(articleContent);
435a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.killBreaks(articleContent);
436a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
437a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Clean out junk from the article content */
438a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.cleanConditionally(articleContent, "form");
439a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.clean(articleContent, "object");
440a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.clean(articleContent, "h1");
441a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
442a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
443a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * If there is only one h2, they are probably using it
444a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * as a header and not a subheader, so remove it since we already have a header.
445a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        ***/
446a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(articleContent.getElementsByTagName('h2').length === 1) {
447a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            readability.clean(articleContent, "h2");
448a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
449a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.clean(articleContent, "iframe");
450a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
451a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.cleanHeaders(articleContent);
452a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
453a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Do these last as the previous stuff may have removed junk that will affect these */
454a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.cleanConditionally(articleContent, "table");
455a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.cleanConditionally(articleContent, "ul");
456a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.cleanConditionally(articleContent, "div");
457a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
458a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Remove extra paragraphs */
459a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var articleParagraphs = articleContent.getElementsByTagName('p');
460a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for(var i = articleParagraphs.length-1; i >= 0; i-=1) {
461a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var imgCount    = articleParagraphs[i].getElementsByTagName('img').length;
462a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var embedCount  = articleParagraphs[i].getElementsByTagName('embed').length;
463a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
464a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
465a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {
466a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
467a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
468a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
469a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
470a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        try {
471a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            readability.replaceBrsWithPs(articleContent);
472a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
473a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        catch (e) {
474a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
475a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
476a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
477a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
478a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
479a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Initialize a node with the readability object. Also checks the
480a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * className/id for special names to add to its score.
481a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
482a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param Element
483a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return void
484a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
485a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    initializeNode: function (node) {
486a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        node.readability = {"contentScore": 0};
487a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
488a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        switch(node.tagName) {
489a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'DIV':
490a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                node.readability.contentScore += 5;
491a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                break;
492a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
493a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'PRE':
494a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'TD':
495a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'BLOCKQUOTE':
496a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                node.readability.contentScore += 3;
497a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                break;
498a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
499a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'ADDRESS':
500a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'OL':
501a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'UL':
502a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'DL':
503a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'DD':
504a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'DT':
505a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'LI':
506a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'FORM':
507a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                node.readability.contentScore -= 3;
508a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                break;
509a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
510a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'H1':
511a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'H2':
512a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'H3':
513a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'H4':
514a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'H5':
515a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'H6':
516a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            case 'TH':
517a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                node.readability.contentScore -= 5;
518a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                break;
519a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
520a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
521a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        node.readability.contentScore += readability.getClassWeight(node);
522a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
523a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
524a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /***
525a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
526a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
527a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
528a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param page a document to run upon. Needs to be a full document, complete with body.
529a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return Element
530a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
531a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    grabArticle: function (pageToClone) {
532a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
533a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            isPaging = (page !== null) ? true: false;
534a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
535a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var page = null;
536a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        // Never work on the actual page.
537a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (isPaging) {
538a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            page = document.body.cloneNode(true);
539a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        } else {
540a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            page = pageToClone.cloneNode(true);
541a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
542a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
543a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var allElements = page.getElementsByTagName('*');
544a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
545a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
546a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
547a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
548a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         *
549a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
550a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * TODO: Shouldn't this be a reverse traversal?
551a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        **/
552a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var node = null;
553a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var nodesToScore = [];
554a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
555a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* Remove unlikely candidates */
556a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (stripUnlikelyCandidates) {
557a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var unlikelyMatchString = node.className + node.id;
558a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if (
559a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    (
560a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
561a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
562a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        node.tagName !== "BODY"
563a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    )
564a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                )
565a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                {
566a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    dbg("Removing unlikely candidate - " + unlikelyMatchString);
567a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    node.parentNode.removeChild(node);
568a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    nodeIndex-=1;
569a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    continue;
570a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
571a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
572a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
573a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
574a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                nodesToScore[nodesToScore.length] = node;
575a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
576a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
577a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* Turn all divs that don't have children block level elements into p's */
578a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (node.tagName === "DIV") {
579a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {
580a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    var newNode = document.createElement('p');
581a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    try {
582a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        readability.moveNodeInnards(node, newNode);
583a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        node.parentNode.replaceChild(newNode, node);
584a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        nodeIndex-=1;
585a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
586a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        nodesToScore[nodesToScore.length] = node;
587a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    }
588a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    catch(e) {
589a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
590a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    }
591a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
592a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                else
593a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                {
594a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    /* EXPERIMENTAL */
595a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    for(var i = 0, il = node.childNodes.length; i < il; i+=1) {
596a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        var childNode = node.childNodes[i];
597a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        if(childNode.nodeType === 3) { // Node.TEXT_NODE
598a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                            var p = document.createElement('p');
599a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                            var t = document.createTextNode(childNode.nodeValue);
600a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                            p.appendChild(t);
601a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                            p.style.display = 'inline';
602a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                            p.className = 'readability-styled';
603a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                            childNode.parentNode.replaceChild(p, childNode);
604a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        }
605a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    }
606a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
607a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
608a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
609a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
610a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
611a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Loop through all paragraphs, and assign a score to them based on how content-y they look.
612a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Then add their score to their parent node.
613a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         *
614a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
615a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        **/
616a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var candidates = [];
617a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for (var pt=0; pt < nodesToScore.length; pt+=1) {
618a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var parentNode      = nodesToScore[pt].parentNode;
619a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var grandParentNode = parentNode ? parentNode.parentNode : null;
620a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var innerText       = readability.getInnerText(nodesToScore[pt]);
621a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
622a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
623a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                continue;
624a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
625a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
626a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* If this paragraph is less than 25 characters, don't even count it. */
627a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(innerText.length < 25) {
628a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                continue; }
629a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
630a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* Initialize readability data for the parent. */
631a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(typeof parentNode.readability === 'undefined') {
632a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                readability.initializeNode(parentNode);
633a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                candidates.push(parentNode);
634a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
635a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
636a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* Initialize readability data for the grandparent. */
637a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
638a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                readability.initializeNode(grandParentNode);
639a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                candidates.push(grandParentNode);
640a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
641a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
642a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var contentScore = 0;
643a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
644a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* Add a point for the paragraph itself as a base. */
645a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            contentScore+=1;
646a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
647a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* Add points for any commas within this paragraph */
648a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            contentScore += innerText.split(',').length;
649a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
650a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
651a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            contentScore += Math.min(Math.floor(innerText.length / 100), 3);
652a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
653a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* Add the score to the parent. The grandparent gets half. */
654a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            parentNode.readability.contentScore += contentScore;
655a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
656a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(grandParentNode) {
657a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                grandParentNode.readability.contentScore += contentScore/2;
658a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
659a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
660a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
661a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
662a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * After we've calculated scores, loop through all of the possible candidate nodes we found
663a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * and find the one with the highest score.
664a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        **/
665a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var topCandidate = null;
666a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for(var c=0, cl=candidates.length; c < cl; c+=1)
667a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        {
668a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /**
669a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * Scale the final candidates score based on link density. Good content should have a
670a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * relatively small link density (5% or less) and be mostly unaffected by this operation.
671a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            **/
672a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
673a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
674a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
675a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
676a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
677a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                topCandidate = candidates[c]; }
678a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
679a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
680a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
681a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * If we still have no top candidate, just use the body as a last resort.
682a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * We also have to copy the body node so it is something we can modify.
683a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         **/
684a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (topCandidate === null || topCandidate.tagName === "BODY")
685a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        {
686a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            topCandidate = document.createElement("DIV");
687a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            readability.replaceNodeInnards(page, topCandidate);
688a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            page.appendChild(topCandidate);
689a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            readability.initializeNode(topCandidate);
690a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
691a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
692a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
693a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Now that we have the top candidate, look through its siblings for content that might also be related.
694a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Things like preambles, content split by ads that we removed, etc.
695a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        **/
696a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var articleContent        = document.createElement("DIV");
697a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (isPaging) {
698a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            articleContent.id     = "readability-content";
699a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
700a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
701a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var siblingNodes          = topCandidate.parentNode.childNodes;
702a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
703a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
704a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
705a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var siblingNode = siblingNodes[s];
706a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var append      = false;
707a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
708a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /**
709a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
710a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * Example of error visible here: http://www.esquire.com/features/honesty0707
711a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            **/
712a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(!siblingNode) {
713a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                continue;
714a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
715a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
716a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
717a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
718a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
719a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(siblingNode === topCandidate)
720a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            {
721a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                append = true;
722a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
723a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
724a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var contentBonus = 0;
            /* Give a bonus if sibling nodes and top candidates have the exact same classname */
726a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
727a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                contentBonus += topCandidate.readability.contentScore * 0.2;
728a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
729a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
730a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
731a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            {
732a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                append = true;
733a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
734a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
735a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(siblingNode.nodeName === "P") {
736a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var linkDensity = readability.getLinkDensity(siblingNode);
737a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var nodeContent = readability.getInnerText(siblingNode);
738a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var nodeLength  = nodeContent.length;
739a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
740a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if(nodeLength > 80 && linkDensity < 0.25)
741a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                {
742a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    append = true;
743a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
744a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
745a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                {
746a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    append = true;
747a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
748a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
749a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
750a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(append) {
751a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                dbg("Appending node: " + siblingNode);
752a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
753a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var nodeToAppend = null;
754a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
755a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
756a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
757a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
758a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    nodeToAppend = document.createElement("DIV");
759a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    try {
760a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        nodeToAppend.id = siblingNode.id;
761a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        readability.moveNodeInnards(siblingNode, nodeToAppend);
762a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    }
763a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    catch(er) {
764a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
765a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        nodeToAppend = siblingNode;
766a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        s-=1;
767a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        sl-=1;
768a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    }
769a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                } else {
770a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    nodeToAppend = siblingNode;
771a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    s-=1;
772a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    sl-=1;
773a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
774a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
775a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                /* To ensure a node does not interfere with readability styles, remove its classnames */
776a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                nodeToAppend.className = "";
777a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
778a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                /* Append sibling and subtract from our list because it removes the node when you append to another node */
779a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                articleContent.appendChild(nodeToAppend);
780a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
781a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
782a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
783a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
784a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * So we have all of the content that we need. Now we clean it up for presentation.
785a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        **/
786a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.distilledArticleContent = articleContent.cloneNode(true);
787a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        //readability.prepArticle(articleContent);
788a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
789a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (readability.curPageNum === 1) {
790a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var newNode = document.createElement('div');
791a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            newNode.id = "readability-page-1";
792a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            newNode.setAttribute("class", "page");
793a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            readability.moveNodeInnards(articleContent, newNode);
794a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            articleContent.appendChild(newNode);
795a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
796a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
797a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
798a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
799a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
800a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
801a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * finding the -right- content.
802a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        **/
803a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(readability.getInnerText(articleContent, false).length < 250) {
804a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
805a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
806a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                return readability.grabArticle(document.body);
807a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
808a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
809a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
810a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                return readability.grabArticle(document.body);
811a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
812a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
813a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
814a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                return readability.grabArticle(document.body);
815a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            } else {
816a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                return null;
817a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
818a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
819a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
820a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return articleContent;
821a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
822a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
823a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
824a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Removes script tags from the document.
825a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
826a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param Element
827a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
828a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    removeScripts: function (doc) {
829a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var scripts = doc.getElementsByTagName('script');
830a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for(var i = scripts.length-1; i >= 0; i-=1)
831a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        {
832a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))
833a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            {
834a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                scripts[i].nodeValue="";
835a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                scripts[i].removeAttribute('src');
836a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if (scripts[i].parentNode) {
837a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        scripts[i].parentNode.removeChild(scripts[i]);
838a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
839a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
840a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
841a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
842a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
843a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
844a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Get the inner text of a node - cross browser compatibly.
845a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * This also strips out any excess whitespace to be found.
846a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
847a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param Element
848a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return string
849a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
850a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    getInnerText: function (e, normalizeSpaces) {
851a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var textContent    = "";
852a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
853a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") {
854a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return "";
855a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
856a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
857a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
858a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
859a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (navigator.appName === "Microsoft Internet Explorer") {
860a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            textContent = e.innerText.replace( readability.regexps.trim, "" ); }
861a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        else {
862a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            textContent = e.textContent.replace( readability.regexps.trim, "" ); }
863a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
864a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(normalizeSpaces) {
865a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return textContent.replace( readability.regexps.normalize, " "); }
866a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        else {
867a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return textContent; }
868a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
869a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
870a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
871a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Get the number of times a string s appears in the node e.
872a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
873a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param Element
874a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param string - what to split on. Default is ","
875a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return number (integer)
876a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
877a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    getCharCount: function (e,s) {
878a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        s = s || ",";
879a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return readability.getInnerText(e).split(s).length-1;
880a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
881a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
882a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
883a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Remove the style attribute on every e and under.
884a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * TODO: Test if getElementsByTagName(*) is faster.
885a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
886a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param Element
887a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return void
888a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
889a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    cleanStyles: function (e) {
890a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        e = e || document;
891a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var cur = e.firstChild;
892a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
893a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(!e) {
894a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return; }
895a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
896a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        // Remove any root styles, if we're able.
897a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') {
898a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            e.removeAttribute('style'); }
899a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
900a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        // Go until there are no more child nodes
901a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        while ( cur !== null ) {
902a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if ( cur.nodeType === 1 ) {
903a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                // Remove style attribute(s) :
904a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if(cur.className !== "readability-styled") {
905a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    cur.removeAttribute("style");
906a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
907a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                readability.cleanStyles( cur );
908a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
909a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            cur = cur.nextSibling;
910a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
911a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
912a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
913a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
914a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Get the density of links as a percentage of the content
915a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * This is the amount of text that is inside a link divided by the total text in the node.
916a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
917a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param Element
918a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return number (float)
919a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
920a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    getLinkDensity: function (e) {
921a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var links      = e.getElementsByTagName("a");
922a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var textLength = readability.getInnerText(e).length;
923a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var linkLength = 0;
924a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for(var i=0, il=links.length; i<il;i+=1)
925a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        {
926a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            linkLength += readability.getInnerText(links[i]).length;
927a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
928a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
929a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return linkLength / textLength;
930a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
931a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
932a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
933a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
934a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
935a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @author Dan Lacy
936a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return string the base url
937a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
938a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    findBaseUrl: function () {
939a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var noUrlParams     = window.location.pathname.split("?")[0],
940a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            urlSlashes      = noUrlParams.split("/").reverse(),
941a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            cleanedSegments = [],
942a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            possibleType    = "";
943a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
944a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {
945a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var segment = urlSlashes[i];
946a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
947a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            // Split off and save anything that looks like a file type.
948a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (segment.indexOf(".") !== -1) {
949a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                possibleType = segment.split(".")[1];
950a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
951a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                /* If the type isn't alpha-only, it's probably not actually a file extension. */
952a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if(!possibleType.match(/[^a-zA-Z]/)) {
953a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    segment = segment.split(".")[0];
954a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
955a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
956a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
957a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /**
958a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * EW-CMS specific segment replacement. Ugly.
959a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
960a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            **/
961a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(segment.indexOf(',00') !== -1) {
962a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                segment = segment.replace(',00', '');
963a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
964a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
965a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            // If our first or second segment has anything looking like a page number, remove it.
966a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {
967a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
968a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
969a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
970a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
971a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var del = false;
972a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
973a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
974a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (i < 2 && segment.match(/^\d{1,2}$/)) {
975a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                del = true;
976a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
977a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
978a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* If this is the first segment and it's just "index", remove it. */
979a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(i === 0 && segment.toLowerCase() === "index") {
980a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                del = true;
981a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
982a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
983a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
984a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
985a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {
986a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                del = true;
987a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
988a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
989a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* If it's not marked for deletion, push it to cleanedSegments. */
990a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (!del) {
991a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                cleanedSegments.push(segment);
992a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
993a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
994a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
995a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        // This is our final, cleaned, base article URL.
996a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/");
997a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
998a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
999a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
1000a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Look for any paging links that may occur within the document.
1001a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
1002a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param body
1003a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return object (array)
1004a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
1005a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    findNextPageLink: function (elem) {
1006a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var possiblePages = {},
1007a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            allLinks = elem.getElementsByTagName('a'),
1008a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            articleBaseUrl = readability.findBaseUrl();
1009a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1010a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
1011a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Loop through all links, looking for hints that they may be next-page links.
1012a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Things like having "page" in their textContent, className or id, or being a child
1013a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * of a node with a page-y className or id.
1014a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         *
1015a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Also possible: levenshtein distance? longest common subsequence?
1016a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         *
1017a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * After we do that, assign each page a score, and
1018a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        **/
1019a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for(var i = 0, il = allLinks.length; i < il; i+=1) {
1020a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var link     = allLinks[i],
1021a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
1022a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1023a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* If we've already seen this page, ignore it */
1024a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) {
1025a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                continue;
1026a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1027a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1028a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* If it's on a different domain, skip it. */
1029a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(window.location.host !== linkHref.split(/\/+/g)[1]) {
1030a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                continue;
1031a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1032a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1033a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var linkText = readability.getInnerText(link);
1034a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1035a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* If the linkText looks like it's not the next page, skip it. */
1036a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {
1037a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                continue;
1038a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1039a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1040a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
1041a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
1042a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(!linkHrefLeftover.match(/\d/)) {
1043a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                continue;
1044a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1045a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1046a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(!(linkHref in possiblePages)) {
1047a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
1048a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            } else {
1049a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                possiblePages[linkHref].linkText += ' | ' + linkText;
1050a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1051a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1052a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var linkObj = possiblePages[linkHref];
1053a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1054a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /**
1055a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
1056a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
1057a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            **/
1058a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(linkHref.indexOf(articleBaseUrl) !== 0) {
1059a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                linkObj.score -= 25;
1060a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1061a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1062a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var linkData = linkText + ' ' + link.className + ' ' + link.id;
1063a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(linkData.match(readability.regexps.nextLink)) {
1064a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                linkObj.score += 50;
1065a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1066a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(linkData.match(/pag(e|ing|inat)/i)) {
1067a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                linkObj.score += 25;
1068a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1069a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,
1070a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
1071a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if(!linkObj.linkText.match(readability.regexps.nextLink)) {
1072a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    linkObj.score -= 65;
1073a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1074a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1075a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) {
1076a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                linkObj.score -= 50;
1077a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1078a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(linkData.match(readability.regexps.prevLink)) {
1079a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                linkObj.score -= 200;
1080a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1081a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1082a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* If a parentNode contains page or paging or paginat */
1083a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var parentNode = link.parentNode,
1084a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                positiveNodeMatch = false,
1085a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                negativeNodeMatch = false;
1086a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            while(parentNode) {
1087a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
1088a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
1089a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    positiveNodeMatch = true;
1090a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    linkObj.score += 25;
1091a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1092a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) {
1093a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */
1094a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    if(!parentNodeClassAndId.match(readability.regexps.positive)) {
1095a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        linkObj.score -= 25;
1096a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        negativeNodeMatch = true;
1097a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    }
1098a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1099a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1100a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                parentNode = parentNode.parentNode;
1101a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1102a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1103a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /**
1104a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * If the URL looks like it has paging in it, add to the score.
1105a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
1106a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            **/
1107a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {
1108a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                linkObj.score += 25;
1109a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1110a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1111a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* If the URL contains negative values, give a slight decrease. */
1112a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (linkHref.match(readability.regexps.extraneous)) {
1113a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                linkObj.score -= 15;
1114a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1115a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1116a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /**
1117a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * Minor punishment to anything that doesn't match our current URL.
1118a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
1119a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             *       Dan, can you show me a counterexample where this is necessary?
1120a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * if (linkHref.indexOf(window.location.href) !== 0) {
1121a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             *    linkObj.score -= 1;
1122a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * }
1123a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            **/
1124a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1125a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /**
1126a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * If the link text can be parsed as a number, give it a minor bonus, with a slight
1127a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * bias towards lower numbered pages. This is so that pages that might not have 'next'
1128a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)             * in their text can still get scored, and sorted properly by score.
1129a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            **/
1130a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var linkTextAsNumber = parseInt(linkText, 10);
1131a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(linkTextAsNumber) {
1132a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                // Punish 1 since we're either already there, or it's probably before what we want anyways.
1133a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if (linkTextAsNumber === 1) {
1134a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    linkObj.score -= 10;
1135a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1136a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                else {
1137a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    // Todo: Describe this better
1138a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    linkObj.score += Math.max(0, 10 - linkTextAsNumber);
1139a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1140a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1141a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1142a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1143a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
1144a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Loop thrugh all of our possible pages from above and find our top candidate for the next page URL.
1145a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Require at least a score of 50, which is a relatively high confidence that this page is the next link.
1146a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        **/
1147a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var topPage = null;
1148a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for(var page in possiblePages) {
1149a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(possiblePages.hasOwnProperty(page)) {
1150a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) {
1151a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    topPage = possiblePages[page];
1152a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1153a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1154a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1155a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1156a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(topPage) {
1157a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var nextHref = topPage.href.replace(/\/$/,'');
1158a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1159a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            dbg('NEXT PAGE IS ' + nextHref);
1160a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            readability.parsedPages[nextHref] = true;
1161a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return nextHref;
1162a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1163a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        else {
1164a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return null;
1165a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1166a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1167a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1168a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    createLinkDiv: function(link) {
1169a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var divNode = document.createElement('div');
1170a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var aNode = document.createElement('a');
1171a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var tNode = document.createTextNode('View Next Page');
1172a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        divNode.setAttribute('style', 'text-align: center');
1173a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        aNode.setAttribute('href', link);
1174a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        aNode.appendChild(tNode);
1175a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        divNode.appendChild(aNode);
1176a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return divNode;
1177a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1178a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1179a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    xhr: function () {
1180a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) {
1181a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return new XMLHttpRequest();
1182a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1183a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        else {
1184a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { }
1185a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { }
1186a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { }
1187a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1188a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1189a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return false;
1190a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1191a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1192a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    successfulRequest: function (request) {
1193a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText);
1194a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1195a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1196a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    ajax: function (url, options) {
1197a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var request = readability.xhr();
1198a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1199a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        function respondToReadyState(readyState) {
1200a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (request.readyState === 4) {
1201a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if (readability.successfulRequest(request)) {
1202a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    if (options.success) { options.success(request); }
1203a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1204a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                else {
1205a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    if (options.error) { options.error(request); }
1206a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1207a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1208a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1209a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1210a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (typeof options === 'undefined') { options = {}; }
1211a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1212a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        request.onreadystatechange = respondToReadyState;
1213a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1214a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        request.open('get', url, true);
1215a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        request.setRequestHeader('Accept', 'text/html');
1216a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1217a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        try {
1218a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            request.send(options.postBody);
1219a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1220a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        catch (e) {
1221a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (options.error) { options.error(); }
1222a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1223a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1224a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return request;
1225a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1226a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1227a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
1228a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Make an AJAX request for each page and append it to the document.
1229a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
1230a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    curPageNum: 1,
1231a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1232a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    appendNextPage: function (nextPageLink) {
1233a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.curPageNum+=1;
1234a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1235a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var articlePage       = document.createElement("DIV");
1236a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        articlePage.id        = 'readability-page-' + readability.curPageNum;
1237a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        articlePage.className = 'page';
1238a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">&sect;</p>';
1239a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1240a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        document.getElementById("readability-content").appendChild(articlePage);
1241a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1242a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(readability.curPageNum > readability.maxPages) {
1243a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var linkDiv = readability.createLinkDiv(nextPageLink);
1244a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1245a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            articlePage.appendChild(linkDiv);
1246a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return;
1247a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1248a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1249a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
1250a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Now that we've built the article page DOM element, get the page content
1251a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * asynchronously and load the cleaned content into the div we created for it.
1252a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        **/
1253a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        (function(pageUrl, thisPage) {
1254a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            readability.ajax(pageUrl, {
1255a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                success: function(r) {
1256a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1257a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */
1258a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    var eTag = r.getResponseHeader('ETag');
1259a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    if(eTag) {
1260a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        if(eTag in readability.pageETags) {
1261a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                            dbg("Exact duplicate page found via ETag. Aborting.");
1262a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                            articlePage.style.display = 'none';
1263a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                            return;
1264a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        } else {
1265a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                            readability.pageETags[eTag] = 1;
1266a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        }
1267a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    }
1268a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1269a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
1270a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    var page = document.createElement("DIV");
1271a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1272a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    /**
1273a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                     * Do some preprocessing to our HTML to make it ready for appending.
1274a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                     * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.
1275a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                     * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript.
1276a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                     * • Turn all double br's into p's - was handled by prepDocument in the original view.
1277a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                     *   Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages.
1278a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    **/
1279a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    var pageInnards = r.responseXML;
1280a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    readability.removeScripts(pageInnards);
1281a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    readability.replaceNoscriptsWithPs(pageInnards);
1282a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    readability.replaceDoubleBrsWithPs(pageInnards);
1283a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    readability.replaceFontsWithSpans(pageInnards);
1284a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    page.appendChild(pageInnards);
1285a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1286a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1287a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    /**
1288a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                     * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle.
1289a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    **/
1290a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    readability.flags = 0x1 | 0x2 | 0x4;
1291a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1292a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    var nextPageLink = readability.findNextPageLink(page),
1293a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        content      =  readability.grabArticle(page);
1294a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1295a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    if(!content) {
1296a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        dbg("No content found in page to append. Aborting.");
1297a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        return;
1298a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    }
1299a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1300a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    /**
1301a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                     * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
1302a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                     * Compare it against all of the the previous document's we've gotten. If the previous
1303a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                     * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
1304a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    **/
1305a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
1306a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    if(firstP && firstP.innerHTML.length > 100) {
1307a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        for(var i=1; i <= readability.curPageNum; i+=1) {
1308a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                            var rPage = document.getElementById('readability-page-' + i);
1309a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                            if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
1310a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                                dbg('Duplicate of page ' + i + ' - skipping.');
1311a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                                articlePage.style.display = 'none';
1312a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                                readability.parsedPages[pageUrl] = true;
1313a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                                return;
1314a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                            }
1315a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        }
1316a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    }
1317a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1318a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    readability.removeScripts(content);
1319a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1320a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    readability.moveNodeInnards(content, thisPage);
1321a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1322a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    /**
1323a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                     * After the page has rendered, post process the content. This delay is necessary because,
1324a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                     * in webkit at least, offsetWidth is not set in time to determine image width. We have to
1325a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                     * wait a little bit for reflow to finish before we can fix floating images.
1326a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    **/
1327a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    window.setTimeout(
1328a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        function() { readability.postProcessContent(thisPage); },
1329a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        500
1330a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    );
1331a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1332a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    if(nextPageLink) {
1333a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        readability.appendNextPage(nextPageLink);
1334a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    }
1335a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1336a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            });
1337a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }(nextPageLink, articlePage));
1338a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1339a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1340a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
1341a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Get an elements class/id weight. Uses regular expressions to tell if this
1342a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * element looks good or bad.
1343a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
1344a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param Element
1345a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return number (Integer)
1346a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
1347a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    getClassWeight: function (e) {
1348a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
1349a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return 0;
1350a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1351a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1352a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var weight = 0;
1353a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1354a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Look for a special classname */
1355a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (typeof(e.className) === 'string' && e.className !== '')
1356a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        {
1357a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(e.className.search(readability.regexps.negative) !== -1) {
1358a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                weight -= 25; }
1359a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1360a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(e.className.search(readability.regexps.positive) !== -1) {
1361a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                weight += 25; }
1362a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1363a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1364a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /* Look for a special ID */
1365a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (typeof(e.id) === 'string' && e.id !== '')
1366a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        {
1367a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(e.id.search(readability.regexps.negative) !== -1) {
1368a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                weight -= 25; }
1369a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1370a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(e.id.search(readability.regexps.positive) !== -1) {
1371a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                weight += 25; }
1372a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1373a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1374a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return weight;
1375a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1376a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1377a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    nodeIsVisible: function (node) {
1378a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none';
1379a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1380a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1381a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
1382a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Remove extraneous break tags from a node.
1383a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
1384a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param Element
1385a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return void
1386a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     **/
1387a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    killBreaks: function (e) {
1388a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var allElements = e.getElementsByTagName('*');
1389a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        while (i < allElements.length) {
1390a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            readability.deleteExtraBreaks(allElements[i]);
1391a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            i++;
1392a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1393a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1394a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1395a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
1396a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Clean a node of all elements of type "tag".
1397a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * (Unless it's a youtube/vimeo video. People love movies.)
1398a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
1399a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param Element
1400a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param string tag to clean
1401a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return void
1402a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     **/
1403a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    clean: function (e, tag) {
1404a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var targetList = e.getElementsByTagName( tag );
1405a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var isEmbed    = (tag === 'object' || tag === 'embed');
1406a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1407a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for (var y=targetList.length-1; y >= 0; y-=1) {
1408a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            /* Allow youtube and vimeo videos through as people usually want to see those. */
1409a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(isEmbed) {
1410a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var attributeValues = "";
1411a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {
1412a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    attributeValues += targetList[y].attributes[i].value + '|';
1413a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1414a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1415a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                /* First, check the elements attributes to see if any of them contain youtube or vimeo */
1416a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if (attributeValues.search(readability.regexps.videos) !== -1) {
1417a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    continue;
1418a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1419a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1420a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                /* Then check the elements inside this element for the same. */
1421a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {
1422a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    continue;
1423a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1424a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1425a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1426a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1427a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            targetList[y].parentNode.removeChild(targetList[y]);
1428a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1429a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1430a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1431a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
1432a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Clean an element of all tags of type "tag" if they look fishy.
1433a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
1434a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
1435a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return void
1436a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     **/
1437a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    cleanConditionally: function (e, tag) {
1438a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1439a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
1440a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return;
1441a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1442a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1443a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var tagsList      = e.getElementsByTagName(tag);
1444a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var curTagsLength = tagsList.length;
1445a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1446a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        /**
1447a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Gather counts for other typical elements embedded within.
1448a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
1449a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         *
1450a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         * TODO: Consider taking into account original contentScore here.
1451a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        **/
1452a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for (var i=curTagsLength-1; i >= 0; i-=1) {
1453a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var weight = readability.getClassWeight(tagsList[i]);
1454a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;
1455a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1456a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
1457a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1458a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if(weight+contentScore < 0)
1459a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            {
1460a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                tagsList[i].parentNode.removeChild(tagsList[i]);
1461a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1462a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            else if ( readability.getCharCount(tagsList[i],',') < 10) {
1463a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                /**
1464a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                 * If there are not very many commas, and the number of
1465a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
1466a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                **/
1467a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var p      = tagsList[i].getElementsByTagName("p").length;
1468a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var img    = tagsList[i].getElementsByTagName("img").length;
1469a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var li     = tagsList[i].getElementsByTagName("li").length-100;
1470a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var input  = tagsList[i].getElementsByTagName("input").length;
1471a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1472a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var embedCount = 0;
1473a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var embeds     = tagsList[i].getElementsByTagName("embed");
1474a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                for(var ei=0,il=embeds.length; ei < il; ei+=1) {
1475a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    if (embeds[ei].src.search(readability.regexps.videos) === -1) {
1476a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                      embedCount+=1;
1477a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    }
1478a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1479a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1480a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var linkDensity   = readability.getLinkDensity(tagsList[i]);
1481a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var contentLength = readability.getInnerText(tagsList[i]).length;
1482a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var toRemove      = false;
1483a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1484a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if ( img > p ) {
1485a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    toRemove = true;
1486a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                } else if(li > p && tag !== "ul" && tag !== "ol") {
1487a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    toRemove = true;
1488a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                } else if( input > Math.floor(p/3) ) {
1489a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    toRemove = true;
1490a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                } else if(contentLength < 25 && (img === 0 || img > 2) ) {
1491a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    toRemove = true;
1492a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                } else if(weight < 25 && linkDensity > 0.2) {
1493a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    toRemove = true;
1494a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                } else if(weight >= 25 && linkDensity > 0.5) {
1495a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    toRemove = true;
1496a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) {
1497a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    toRemove = true;
1498a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1499a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1500a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if(toRemove) {
1501a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    tagsList[i].parentNode.removeChild(tagsList[i]);
1502a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1503a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1504a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1505a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1506a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1507a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    /**
1508a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * Clean out spurious headers from an Element. Checks things like classnames and link density.
1509a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     *
1510a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @param Element
1511a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)     * @return void
1512a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    **/
1513a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    cleanHeaders: function (e) {
1514a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for (var headerIndex = 1; headerIndex < 3; headerIndex+=1) {
1515a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var headers = e.getElementsByTagName('h' + headerIndex);
1516a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            for (var i=headers.length-1; i >=0; i-=1) {
1517a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) {
1518a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    headers[i].parentNode.removeChild(headers[i]);
1519a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1520a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1521a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1522a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1523a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1524a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    flagIsActive: function(flag) {
1525a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return (readability.flags & flag) > 0;
1526a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1527a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1528a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    addFlag: function(flag) {
1529a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.flags = readability.flags | flag;
1530a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1531a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1532a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    removeFlag: function(flag) {
1533a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.flags = readability.flags & ~flag;
1534a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1535a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1536a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Removes the children of |src| and appends them to |dest|.
1537a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    moveNodeInnards: function(src, dest) {
1538a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        try {
1539a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            while (src.firstChild) {
1540a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                dest.appendChild(src.removeChild(src.firstChild));
1541a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1542a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        } catch (e) {}
1543a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1544a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1545a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Returns true if the node is a whitespace text node.
1546a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    isWhitespaceNode: function(node) {
1547a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (node.nodeType == Node.TEXT_NODE) {
1548a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (node.data.trim().length == 0) {
1549a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)               return true;
1550a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1551a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1552a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return false;
1553a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1554a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1555a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Returns true if the node is a <BR>.
1556a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    isBrNode: function(node) {
1557a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return (node.tagName === 'BR');
1558a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1559a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1560a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1561a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Returns the last <BR> node in a sequence of <BR> nodes that are only
1562a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // separated by whitespace, or null if there are not at least two <BR> tags
1563a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // in the sibling chain starting with |node|. Returns the second such <BR>
1564a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // node if |restrictToTwo| is true.
1565a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    isMultipleBr: function(node, restrictToTwo) {
1566a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var lastBr = null;
1567a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (!readability.isBrNode(node)) {
1568a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return lastBr;
1569a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1570a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var curr = node.nextSibling;
1571a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        while (curr) {
1572a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (readability.isWhitespaceNode(curr) || readability.isBrNode(curr)) {
1573a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                lastBr = curr;
1574a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                curr = curr.nextSibling;
1575a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                if (restrictToTwo) {
1576a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    if (readability.isBrNode(lastBr)) {
1577a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                        return lastBr;
1578a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                    }
1579a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                }
1580a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                continue;
1581a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1582a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            break;
1583a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1584a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return lastBr;
1585a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1586a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1587a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Removes all <BR> nodes except one and whitespace in between in a series
1588a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // of <BR> nodes.
1589a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    deleteExtraBreaks: function(node) {
1590a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var lastBr = readability.isMultipleBr(node, false);
1591a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var ret = false;
1592a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        while (lastBr && lastBr != node) {
1593a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var toRemove = lastBr;
1594a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            lastBr = lastBr.previousSibling;
1595a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            toRemove.parentNode.removeChild(toRemove);
1596a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            ret = true;
1597a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1598a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return ret;
1599a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1600a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1601a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a
1602a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // <P> node, and makes all next siblings of that pair children of <P>, up
1603a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // until the next pair of <BR> nodes is reached.
1604a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    replaceDoubleBrWithP: function(node) {
1605a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        // Check that we are starting with a BR.
1606a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var second = readability.isMultipleBr(node, true);
1607a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (!second) {
1608a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return;
1609a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1610a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        // Make all next siblings of the second BR into children of a P.
1611a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var p = document.createElement('p');
1612a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var curr = second.nextSibling;
1613a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        while (curr) {
1614a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (readability.isMultipleBr(curr, true)) {
1615a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                break;
1616a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1617a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var next = curr.nextSibling;
1618a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            p.appendChild(curr.parentNode.removeChild(curr));
1619a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            curr = next;
1620a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1621a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var ret = curr;
1622a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1623a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        // Remove all nodes between the first and second BR.
1624a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        curr = node.nextSibling;
1625a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        while (curr && curr != second) {
1626a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var next = curr.nextSibling;
1627a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            curr.parentNode.removeChild(curr);
1628a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            curr = next;
1629a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1630a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        // Remove the second BR.
1631a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        second.parentNode.removeChild(second);
1632a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        // Replace the first BR with the P.
1633a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        node.parentNode.replaceChild(p, node);
1634a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1635a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return ret;
1636a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1637a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1638a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Returns true if the NodeList contains a double <BR>.
1639a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    hasDoubleBr: function(nodeList) {
1640a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for (var i = 0; i < nodeList.length; nodeList++) {
1641a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (readability.isMultipleBr(nodeList[i], true)) {
1642a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                return true;
1643a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1644a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1645a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return false;
1646a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1647a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1648a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Replaces double <BR> tags with <P> tags.
1649a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    replaceDoubleBrsWithPs: function(node) {
1650a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var allElements = node.getElementsByTagName('BR');
1651a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var node = null;
1652a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        while (allElements && allElements.length > 0 &&
1653a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)               readability.hasDoubleBr(allElements)) {
1654a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
1655a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var next = node;
1656a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                while (next = readability.replaceDoubleBrWithP(next));
1657a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1658a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            allElements = document.body.getElementsByTagName('BR');
1659a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1660a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1661a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1662a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1663a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Replaces a BR and the whitespace that follows it with a P.
1664a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    replaceBrWithP: function(node) {
1665a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        if (!readability.isBrNode(node)) {
1666a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            return;
1667a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1668a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var p = document.createElement('p');
1669a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var curr = node.nextSibling;
1670a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        while (curr && !isBrNode(curr)) {
1671a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var next = curr.nextSibling;
1672a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            if (readability.isWhitespaceNode(curr)) {
1673a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                curr.parentNode.removeChild(curr);
1674a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            } else {
1675a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                p.appendChild(curr.parentNode.removeChild(curr));
1676a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1677a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            curr = next;
1678a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1679a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        node.parentNode.replaceChild(p, node);
1680a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return curr;
1681a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1682a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1683a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag
1684a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // children of the <P>.
1685a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    replaceBrsWithPs: function(node) {
1686a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var allElements = node.getElementsByTagName('BR');
1687a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var node = null;
1688a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        while (allElements && allElements.length > 0) {
1689a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
1690a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                var next = node;
1691a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)                while (next = readability.replaceBrWithP(next));
1692a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            }
1693a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            allElements = document.body.getElementsByTagName('BR');
1694a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1695a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1696a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1697a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Replaces any tag with any other tag.
1698a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    replaceTagsWithTags: function(node, srcTag, destTag) {
1699a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var allElements = node.getElementsByTagName(srcTag);
1700a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for (var i = 0; i < allElements.length; i++) {
1701a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            var dest = document.createElement(destTag);
1702a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            readability.moveNodeInnards(allElements[i], dest);
1703a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            allElements[i].parentNode.replaceChild(dest, allElements[i]);
1704a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1705a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1706a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1707a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Replaces all <noscript> tags with <p> tags.
1708a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    replaceNoscriptsWithPs: function(node) {
1709a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.replaceTagsWithTags(node, 'noscript', 'p');
1710a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1711a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1712a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Replaces all <font> tags with <span> tags.
1713a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    replaceFontsWithSpans: function(node) {
1714a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        readability.replaceTagsWithTags(node, 'font', 'span');
1715a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1716a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1717a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Returns a list of image URLs in the distilled article.
1718a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    getImages : function() {
1719a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var images = document.getElementsByTagName('img');
1720a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        var result = new Array(images.length);
1721a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        dbg("Number of images: " + images.length);
1722a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        for(i = 0; i < images.length; i++) {
1723a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            result[i] = images[i].src;
1724a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)            dbg("Image: " + result[i]);
1725a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        }
1726a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return result;
1727a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1728a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1729a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Returns the distilled article HTML from the page(s).
1730a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    getDistilledArticleHTML : function() {
1731a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return readability.distilledHTML;
1732a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    },
1733a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1734a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    // Returns the next page of this article.
1735a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    getNextPageLink : function() {
1736a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        return readability.nextPageLink;
1737a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    }
1738a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)};
1739