1// Copyright 2014 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Local modifications to this file are described in the README.chromium
6// file.
7
/*
 * Debug logger. Defaults to a no-op; when a console is available it is
 * swapped for a function that writes the message with a "Readability: " prefix.
 */
var dbg = function() {};
if (typeof console !== 'undefined') {
    dbg = function(s) {
        console.log("Readability: " + s);
    };
}
11
12/*
13 * Readability. An Arc90 Lab Experiment.
14 * Website: http://lab.arc90.com/experiments/readability
15 * Source:  http://code.google.com/p/arc90labs-readability
16 *
17 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
18 *
19 * Copyright (c) 2010 Arc90 Inc
20 * Readability is licensed under the Apache License, Version 2.0.
21**/
22var readability = {
    /* Presentation classes: init() copies readStyle onto body/overlay and readMargin + readSize onto the inner div. */
    readStyle: "style-newspaper",
    readSize: "size-medium",
    readMargin: "margin-wide",

    /* Extraction results. init() stores the extracted article's innerHTML in distilledHTML and the result of findNextPageLink() in nextPageLink. */
    distilledHTML: '',
    distilledArticleContent: null, /* NOTE(review): not written anywhere in the visible part of this file - confirm usage. */
    nextPageLink: '',

    version:                '1.7.1',
    iframeLoads:             0,
    convertLinksToFootnotes: false, /* init() turns this on when the global readConvertLinksToFootnotes is true. */
    reversePageScroll:       false, /* If they hold shift and hit space, scroll up */
    frameHack:               false, /**
                                      * The frame hack is to workaround a firefox bug where if you
                                      * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
                                      * So we fake a scrollbar in the wrapping div.
                                     **/
    biggestFrame:            false, /* Set by prepDocument() to the largest <frame> element found, if any. */
    flags:                   0x1 | 0x2 | 0x4,   /* Start with all flags set. (The three FLAG_* bits below OR-ed together.) */

    /* constants */
    FLAG_STRIP_UNLIKELYS:     0x1,
    FLAG_WEIGHT_CLASSES:      0x2,
    FLAG_CLEAN_CONDITIONALLY: 0x4,

    maxPages:    30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
    parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
    pageETags:   {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */
51
52    /**
53     * All of the regular expressions in use within readability.
54     * Defined up here so we don't instantiate them repeatedly in loops.
55     **/
    regexps: {
        /* Matched against className + id in grabArticle(): a hit on unlikelyCandidates removes the node unless okMaybeItsACandidate also hits. */
        unlikelyCandidates:    /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
        okMaybeItsACandidate:  /and|article|body|column|main|shadow/i,
        /* Name fragments that suggest content (positive) or boilerplate (negative); presumably consumed by the class-weighting helper - not visible here. */
        positive:              /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
        negative:              /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
        extraneous:            /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
        /* Opening tags that grabArticle() treats as block-level: a DIV whose innerHTML contains none of them is converted to a P. */
        divToPElements:        /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
        /* Two or more consecutive <br> tags, optionally separated by whitespace. */
        replaceBrs:            /(<br[^>]*>[ \n\r\t]*){2,}/gi,
        /* Opening or closing <font> tag; group 1 captures the optional leading slash. */
        replaceFonts:          /<(\/?)font[^>]*>/gi,
        /* Leading or trailing whitespace. */
        trim:                  /^\s+|\s+$/g,
        /* Runs of two or more whitespace characters. */
        normalize:             /\s{2,}/g,
        /* One or more <br> tags, each followed by optional whitespace or &nbsp entities. */
        killBreaks:            /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
        /* http:// URLs on youtube.com or vimeo.com (https is not matched by this pattern). */
        videos:                /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
        /* Link text that is only a short bracketed token, empty, "edit" or "citation needed". */
        skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
        nextLink:              /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
        /* Text hinting at a previous/older/newer link rather than a next-page link. */
        prevLink:              /(prev|earl|old|new|<|«)/i
    },
73
74    /**
75     * Runs readability.
76     *
77     * Workflow:
78     *  1. Prep the document by removing script tags, css, etc.
79     *  2. Build readability's DOM tree.
80     *  3. Grab the article content from the current dom tree.
81     *  4. Replace the current DOM tree with the new one.
82     *  5. Read peacefully.
83     *
84     * @return void
85     **/
86    init: function() {
87        /* Before we do anything, remove all scripts that are not readability. */
88        window.onload = window.onunload = function() {};
89
90        readability.removeScripts(document);
91
92        /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
93        readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
94
95        /* Pull out any possible next page link first */
96        readability.nextPageLink = readability.findNextPageLink(document.body);
97
98        /* We handle processing of nextPage from C++ set nextPageLink to null */
99        var nextPageLink = null;
100
101        readability.prepDocument();
102
103        /* Build readability's DOM tree */
104        var overlay        = document.createElement("DIV");
105        var innerDiv       = document.createElement("DIV");
106        var articleTools   = readability.getArticleTools();
107        var articleTitleText   = readability.getArticleTitle();
108        var articleContent = readability.grabArticle();
109
110        if(!articleContent) {
111            articleContent    = document.createElement("DIV");
112            articleContent.id = "readability-content";
113            articleContent.innerHTML = [
114                "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p>",
115                (readability.frameHack ? "<p><strong>It appears this page uses frames.</strong> Unfortunately, browser security properties often cause Readability to fail on pages that include frames." : ""),
116                "<p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>"
117            ].join('');
118
119            nextPageLink = null;
120        }
121
122        overlay.id              = "readOverlay";
123        innerDiv.id             = "readInner";
124
125        /* Apply user-selected styling */
126        document.body.className = readability.readStyle;
127        document.dir            = readability.getSuggestedDirection(articleTitleText);
128
129        if (readability.readStyle === "style-athelas" || readability.readStyle === "style-apertura"){
130            overlay.className = readability.readStyle + " rdbTypekit";
131        } else {
132            overlay.className = readability.readStyle;
133        }
134        innerDiv.className    = readability.readMargin + " " + readability.readSize;
135
136        if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) {
137            readability.convertLinksToFootnotes = true;
138        }
139
140        readability.distilledHTML = articleContent.innerHTML;
141
142        if(readability.frameHack) {
143            var readOverlay = document.getElementById('readOverlay');
144            readOverlay.style.height = '100%';
145            readOverlay.style.overflow = 'auto';
146        }
147
148        /**
149         * If someone tries to use Readability on a site's root page, give them a warning about usage.
150        **/
151        if((window.location.protocol + "//" + window.location.host + "/") === window.location.href) {
152            articleContent.style.display = "none";
153            var rootWarning = document.createElement('p');
154                rootWarning.id = "readability-warning";
155                rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +
156                "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";
157
158            innerDiv.insertBefore( rootWarning, articleContent );
159        }
160
161        readability.postProcessContent(articleContent);
162
163        window.scrollTo(0, 0);
164
165        if (nextPageLink) {
166            /**
167             * Append any additional pages after a small timeout so that people
168             * can start reading without having to wait for this to finish processing.
169            **/
170            window.setTimeout(function() {
171                readability.appendNextPage(nextPageLink);
172            }, 500);
173        }
174
175        /** Smooth scrolling **/
176        document.onkeydown = function(e) {
177            var code = (window.event) ? event.keyCode : e.keyCode;
178            if (code === 16) {
179                readability.reversePageScroll = true;
180                return;
181            }
182
183            if (code === 32) {
184                readability.curScrollStep = 0;
185                var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight);
186
187                if(readability.reversePageScroll) {
188                    readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);
189                }
190                else {
191                    readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);
192                }
193
194                return false;
195            }
196        };
197
198        document.onkeyup = function(e) {
199            var code = (window.event) ? event.keyCode : e.keyCode;
200            if (code === 16) {
201                readability.reversePageScroll = false;
202                return;
203            }
204        };
205    },
206
207    /**
208     * Run any post-process modifications to article content as necessary.
209     *
210     * @param Element
211     * @return void
212    **/
213    postProcessContent: function(articleContent) {
214        if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) {
215            readability.addFootnotes(articleContent);
216        }
217
218        readability.fixImageFloats(articleContent);
219    },
220
221    /**
222     * Some content ends up looking ugly if the image is too large to be floated.
223     * If the image is wider than a threshold (currently 55%), no longer float it,
224     * center it instead.
225     *
226     * @param Element
227     * @return void
228    **/
229    fixImageFloats: function (articleContent) {
230        var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55,
231            images              = articleContent.getElementsByTagName('img');
232
233        for(var i=0, il = images.length; i < il; i+=1) {
234            var image = images[i];
235
236            if(image.offsetWidth > imageWidthThreshold) {
237                image.className += " blockImage";
238            }
239        }
240    },
241
    /**
     * Get the article tools Element that has buttons like reload, print.
     *
     * @return Element
     **/
247    getArticleTools: function () {
248        var articleTools = document.createElement("DIV");
249
250        articleTools.id        = "readTools";
251        articleTools.innerHTML =
252            "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
253            "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
254            "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";
255
256        return articleTools;
257    },
258
    /**
     * Returns the suggested text direction for the given string.
     *
     * @param string text
     * @return "rtl" || "ltr"
     **/
264    getSuggestedDirection: function(text) {
265        function sanitizeText() {
266            return text.replace(/@\w+/, "");
267        }
268
269        function countMatches(match) {
270            var matches = text.match(new RegExp(match, "g"));
271            return matches !== null ? matches.length : 0;
272        }
273
274        function isRTL() {
275            var count_heb =  countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
276            var count_arb =  countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");
277
278            // if 20% of chars are Hebrew or Arbic then direction is rtl
279            return  (count_heb + count_arb) * 100 / text.length > 20;
280        }
281
282        text  = sanitizeText(text);
283        return isRTL() ? "rtl" : "ltr";
284    },
285
    /**
     * Get the article title as a plain text string (not an H1 element).
     *
     * @return string
     **/
291    getArticleTitle: function () {
292        var curTitle = "",
293            origTitle = "";
294
295        try {
296            curTitle = origTitle = document.title;
297            if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */
298                curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
299            }
300        }
301        catch(e) {}
302
303        if(curTitle.match(/ [\|\-] /))
304        {
305            curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
306
307            if(curTitle.split(' ').length < 3) {
308                curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
309            }
310        }
311        else if(curTitle.indexOf(': ') !== -1)
312        {
313            curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
314
315            if(curTitle.split(' ').length < 3) {
316                curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
317            }
318        }
319        else if(curTitle.length > 150 || curTitle.length < 15)
320        {
321            var hOnes = document.getElementsByTagName('h1');
322            if(hOnes.length === 1)
323            {
324                curTitle = readability.getInnerText(hOnes[0]);
325            }
326        }
327
328        curTitle = curTitle.replace( readability.regexps.trim, "" );
329
330        if(curTitle.split(' ').length <= 4) {
331            curTitle = origTitle;
332        }
333        return curTitle;
334    },
335
336    /**
337     * Prepare the HTML document for readability to scrape it.
338     * This includes things like stripping javascript, CSS, and handling terrible markup.
339     *
340     * @return void
341     **/
    prepDocument: function () {
        /*
         * Normalizes the live document before extraction: guarantees a body,
         * pulls content out of the largest accessible <frame>, disables the
         * page's styling, then rewrites double <br>s and <font> tags.
         * Side effects: sets readability.biggestFrame and readability.frameHack.
         */

        /**
         * In some cases a body element can't be found (if the HTML is totally hosed for example)
         * so we create a new body node and append it to the document.
         */
        if(document.body === null)
        {
            var body = document.createElement("body");
            try {
                document.body = body;
            }
            catch(e) {
                /* Assigning document.body failed; attach the new body to the root element instead. */
                document.documentElement.appendChild(body);
                dbg(e);
            }
        }

        document.body.id = "readabilityBody";

        var frames = document.getElementsByTagName('frame');
        if(frames.length > 0)
        {
            var bestFrame = null;
            var bestFrameSize = 0;    /* The frame to try to run readability upon. Must be on same domain. */
            var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. */
            for(var frameIndex = 0; frameIndex < frames.length; frameIndex+=1)
            {
                /* "Size" is width + height, used only to rank frames against each other. */
                var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight;
                var canAccessFrame = false;
                try {
                    /* Probe only: frameBody is never read. If the access throws, the frame is treated as inaccessible. */
                    var frameBody = frames[frameIndex].contentWindow.document.body;
                    canAccessFrame = true;
                }
                catch(eFrames) {
                    dbg(eFrames);
                }

                if(frameSize > biggestFrameSize) {
                    biggestFrameSize         = frameSize;
                    readability.biggestFrame = frames[frameIndex];
                }

                if(canAccessFrame && frameSize > bestFrameSize)
                {
                    readability.frameHack = true;

                    bestFrame = frames[frameIndex];
                    bestFrameSize = frameSize;
                }
            }

            if(bestFrame)
            {
                /* Move the chosen frame's body content into a fresh body, install it, and drop the frameset. */
                var newBody = document.createElement('body');
                readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody);
                newBody.style.overflow = 'scroll';
                document.body = newBody;

                var frameset = document.getElementsByTagName('frameset')[0];
                if(frameset) {
                    frameset.parentNode.removeChild(frameset); }
            }
        }

        /* Disable every linked stylesheet whose href does not contain "readability" (sheets with a null href are left alone). */
        for (var k=0;k < document.styleSheets.length; k+=1) {
            if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) {
                document.styleSheets[k].disabled = true;
            }
        }

        /* Empty the text of every <style> element in the document (the elements themselves are kept). */
        var styleTags = document.getElementsByTagName("style");
        for (var st=0;st < styleTags.length; st+=1) {
            styleTags[st].textContent = "";
        }

        /* Turn all double br's into p's */
        /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
        readability.replaceDoubleBrsWithPs(document.body);
        readability.replaceFontsWithSpans(document.body);
    },
424
425
426    /**
427     * Prepare the article node for display. Clean out any inline styles,
428     * iframes, forms, strip extraneous <p> tags, etc.
429     *
430     * @param Element
431     * @return void
432     **/
433    prepArticle: function (articleContent) {
434        readability.cleanStyles(articleContent);
435        readability.killBreaks(articleContent);
436
437        /* Clean out junk from the article content */
438        readability.cleanConditionally(articleContent, "form");
439        readability.clean(articleContent, "object");
440        readability.clean(articleContent, "h1");
441
442        /**
443         * If there is only one h2, they are probably using it
444         * as a header and not a subheader, so remove it since we already have a header.
445        ***/
446        if(articleContent.getElementsByTagName('h2').length === 1) {
447            readability.clean(articleContent, "h2");
448        }
449        readability.clean(articleContent, "iframe");
450
451        readability.cleanHeaders(articleContent);
452
453        /* Do these last as the previous stuff may have removed junk that will affect these */
454        readability.cleanConditionally(articleContent, "table");
455        readability.cleanConditionally(articleContent, "ul");
456        readability.cleanConditionally(articleContent, "div");
457
458        /* Remove extra paragraphs */
459        var articleParagraphs = articleContent.getElementsByTagName('p');
460        for(var i = articleParagraphs.length-1; i >= 0; i-=1) {
461            var imgCount    = articleParagraphs[i].getElementsByTagName('img').length;
462            var embedCount  = articleParagraphs[i].getElementsByTagName('embed').length;
463            var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
464
465            if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {
466                articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
467            }
468        }
469
470        try {
471            readability.replaceBrsWithPs(articleContent);
472        }
473        catch (e) {
474            dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
475        }
476    },
477
478    /**
479     * Initialize a node with the readability object. Also checks the
480     * className/id for special names to add to its score.
481     *
482     * @param Element
483     * @return void
484    **/
485    initializeNode: function (node) {
486        node.readability = {"contentScore": 0};
487
488        switch(node.tagName) {
489            case 'DIV':
490                node.readability.contentScore += 5;
491                break;
492
493            case 'PRE':
494            case 'TD':
495            case 'BLOCKQUOTE':
496                node.readability.contentScore += 3;
497                break;
498
499            case 'ADDRESS':
500            case 'OL':
501            case 'UL':
502            case 'DL':
503            case 'DD':
504            case 'DT':
505            case 'LI':
506            case 'FORM':
507                node.readability.contentScore -= 3;
508                break;
509
510            case 'H1':
511            case 'H2':
512            case 'H3':
513            case 'H4':
514            case 'H5':
515            case 'H6':
516            case 'TH':
517                node.readability.contentScore -= 5;
518                break;
519        }
520
521        node.readability.contentScore += readability.getClassWeight(node);
522    },
523
524    /***
525     * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
526     *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
527     *
528     * @param page a document to run upon. Needs to be a full document, complete with body.
529     * @return Element
530    **/
531    grabArticle: function (pageToClone) {
532        var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
533            isPaging = (page !== null) ? true: false;
534
535        var page = null;
536        // Never work on the actual page.
537        if (isPaging) {
538            page = document.body.cloneNode(true);
539        } else {
540            page = pageToClone.cloneNode(true);
541        }
542
543        var allElements = page.getElementsByTagName('*');
544
545        /**
546         * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
547         * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
548         *
549         * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
550         * TODO: Shouldn't this be a reverse traversal?
551        **/
552        var node = null;
553        var nodesToScore = [];
554        for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
555            /* Remove unlikely candidates */
556            if (stripUnlikelyCandidates) {
557                var unlikelyMatchString = node.className + node.id;
558                if (
559                    (
560                        unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
561                        unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
562                        node.tagName !== "BODY"
563                    )
564                )
565                {
566                    dbg("Removing unlikely candidate - " + unlikelyMatchString);
567                    node.parentNode.removeChild(node);
568                    nodeIndex-=1;
569                    continue;
570                }
571            }
572
573            if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
574                nodesToScore[nodesToScore.length] = node;
575            }
576
577            /* Turn all divs that don't have children block level elements into p's */
578            if (node.tagName === "DIV") {
579                if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {
580                    var newNode = document.createElement('p');
581                    try {
582                        readability.moveNodeInnards(node, newNode);
583                        node.parentNode.replaceChild(newNode, node);
584                        nodeIndex-=1;
585
586                        nodesToScore[nodesToScore.length] = node;
587                    }
588                    catch(e) {
589                        dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
590                    }
591                }
592                else
593                {
594                    /* EXPERIMENTAL */
595                    for(var i = 0, il = node.childNodes.length; i < il; i+=1) {
596                        var childNode = node.childNodes[i];
597                        if(childNode.nodeType === 3) { // Node.TEXT_NODE
598                            var p = document.createElement('p');
599                            var t = document.createTextNode(childNode.nodeValue);
600                            p.appendChild(t);
601                            p.style.display = 'inline';
602                            p.className = 'readability-styled';
603                            childNode.parentNode.replaceChild(p, childNode);
604                        }
605                    }
606                }
607            }
608        }
609
610        /**
611         * Loop through all paragraphs, and assign a score to them based on how content-y they look.
612         * Then add their score to their parent node.
613         *
614         * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
615        **/
616        var candidates = [];
617        for (var pt=0; pt < nodesToScore.length; pt+=1) {
618            var parentNode      = nodesToScore[pt].parentNode;
619            var grandParentNode = parentNode ? parentNode.parentNode : null;
620            var innerText       = readability.getInnerText(nodesToScore[pt]);
621
622            if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
623                continue;
624            }
625
626            /* If this paragraph is less than 25 characters, don't even count it. */
627            if(innerText.length < 25) {
628                continue; }
629
630            /* Initialize readability data for the parent. */
631            if(typeof parentNode.readability === 'undefined') {
632                readability.initializeNode(parentNode);
633                candidates.push(parentNode);
634            }
635
636            /* Initialize readability data for the grandparent. */
637            if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
638                readability.initializeNode(grandParentNode);
639                candidates.push(grandParentNode);
640            }
641
642            var contentScore = 0;
643
644            /* Add a point for the paragraph itself as a base. */
645            contentScore+=1;
646
647            /* Add points for any commas within this paragraph */
648            contentScore += innerText.split(',').length;
649
650            /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
651            contentScore += Math.min(Math.floor(innerText.length / 100), 3);
652
653            /* Add the score to the parent. The grandparent gets half. */
654            parentNode.readability.contentScore += contentScore;
655
656            if(grandParentNode) {
657                grandParentNode.readability.contentScore += contentScore/2;
658            }
659        }
660
661        /**
662         * After we've calculated scores, loop through all of the possible candidate nodes we found
663         * and find the one with the highest score.
664        **/
665        var topCandidate = null;
666        for(var c=0, cl=candidates.length; c < cl; c+=1)
667        {
668            /**
669             * Scale the final candidates score based on link density. Good content should have a
670             * relatively small link density (5% or less) and be mostly unaffected by this operation.
671            **/
672            candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
673
674            dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
675
676            if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
677                topCandidate = candidates[c]; }
678        }
679
680        /**
681         * If we still have no top candidate, just use the body as a last resort.
682         * We also have to copy the body node so it is something we can modify.
683         **/
684        if (topCandidate === null || topCandidate.tagName === "BODY")
685        {
686            topCandidate = document.createElement("DIV");
687            readability.replaceNodeInnards(page, topCandidate);
688            page.appendChild(topCandidate);
689            readability.initializeNode(topCandidate);
690        }
691
692        /**
693         * Now that we have the top candidate, look through its siblings for content that might also be related.
694         * Things like preambles, content split by ads that we removed, etc.
695        **/
696        var articleContent        = document.createElement("DIV");
697        if (isPaging) {
698            articleContent.id     = "readability-content";
699        }
700        var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
701        var siblingNodes          = topCandidate.parentNode.childNodes;
702
703
704        for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
705            var siblingNode = siblingNodes[s];
706            var append      = false;
707
708            /**
709             * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
710             * Example of error visible here: http://www.esquire.com/features/honesty0707
711            **/
712            if(!siblingNode) {
713                continue;
714            }
715
716            dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
717            dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
718
719            if(siblingNode === topCandidate)
720            {
721                append = true;
722            }
723
724            var contentBonus = 0;
725            /* Give a bonus if sibling nodes and top candidates have the example same classname */
726            if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
727                contentBonus += topCandidate.readability.contentScore * 0.2;
728            }
729
730            if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
731            {
732                append = true;
733            }
734
735            if(siblingNode.nodeName === "P") {
736                var linkDensity = readability.getLinkDensity(siblingNode);
737                var nodeContent = readability.getInnerText(siblingNode);
738                var nodeLength  = nodeContent.length;
739
740                if(nodeLength > 80 && linkDensity < 0.25)
741                {
742                    append = true;
743                }
744                else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
745                {
746                    append = true;
747                }
748            }
749
750            if(append) {
751                dbg("Appending node: " + siblingNode);
752
753                var nodeToAppend = null;
754                if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
755                    /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
756
757                    dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
758                    nodeToAppend = document.createElement("DIV");
759                    try {
760                        nodeToAppend.id = siblingNode.id;
761                        readability.moveNodeInnards(siblingNode, nodeToAppend);
762                    }
763                    catch(er) {
764                        dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
765                        nodeToAppend = siblingNode;
766                        s-=1;
767                        sl-=1;
768                    }
769                } else {
770                    nodeToAppend = siblingNode;
771                    s-=1;
772                    sl-=1;
773                }
774
775                /* To ensure a node does not interfere with readability styles, remove its classnames */
776                nodeToAppend.className = "";
777
778                /* Append sibling and subtract from our list because it removes the node when you append to another node */
779                articleContent.appendChild(nodeToAppend);
780            }
781        }
782
783        /**
784         * So we have all of the content that we need. Now we clean it up for presentation.
785        **/
786        readability.distilledArticleContent = articleContent.cloneNode(true);
787        //readability.prepArticle(articleContent);
788
789        if (readability.curPageNum === 1) {
790            var newNode = document.createElement('div');
791            newNode.id = "readability-page-1";
792            newNode.setAttribute("class", "page");
793            readability.moveNodeInnards(articleContent, newNode);
794            articleContent.appendChild(newNode);
795        }
796
797        /**
798         * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
799         * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
800         * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
801         * finding the -right- content.
802        **/
803        if(readability.getInnerText(articleContent, false).length < 250) {
804            if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
805                readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
806                return readability.grabArticle(document.body);
807            }
808            else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
809                readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
810                return readability.grabArticle(document.body);
811            }
812            else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
813                readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
814                return readability.grabArticle(document.body);
815            } else {
816                return null;
817            }
818        }
819
820        return articleContent;
821    },
822
823    /**
824     * Removes script tags from the document.
825     *
826     * @param Element
827    **/
828    removeScripts: function (doc) {
829        var scripts = doc.getElementsByTagName('script');
830        for(var i = scripts.length-1; i >= 0; i-=1)
831        {
832            if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))
833            {
834                scripts[i].nodeValue="";
835                scripts[i].removeAttribute('src');
836                if (scripts[i].parentNode) {
837                        scripts[i].parentNode.removeChild(scripts[i]);
838                }
839            }
840        }
841    },
842
843    /**
844     * Get the inner text of a node - cross browser compatibly.
845     * This also strips out any excess whitespace to be found.
846     *
847     * @param Element
848     * @return string
849    **/
850    getInnerText: function (e, normalizeSpaces) {
851        var textContent    = "";
852
853        if(typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") {
854            return "";
855        }
856
857        normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
858
859        if (navigator.appName === "Microsoft Internet Explorer") {
860            textContent = e.innerText.replace( readability.regexps.trim, "" ); }
861        else {
862            textContent = e.textContent.replace( readability.regexps.trim, "" ); }
863
864        if(normalizeSpaces) {
865            return textContent.replace( readability.regexps.normalize, " "); }
866        else {
867            return textContent; }
868    },
869
870    /**
871     * Get the number of times a string s appears in the node e.
872     *
873     * @param Element
874     * @param string - what to split on. Default is ","
875     * @return number (integer)
876    **/
877    getCharCount: function (e,s) {
878        s = s || ",";
879        return readability.getInnerText(e).split(s).length-1;
880    },
881
882    /**
883     * Remove the style attribute on every e and under.
884     * TODO: Test if getElementsByTagName(*) is faster.
885     *
886     * @param Element
887     * @return void
888    **/
889    cleanStyles: function (e) {
890        e = e || document;
891        var cur = e.firstChild;
892
893        if(!e) {
894            return; }
895
896        // Remove any root styles, if we're able.
897        if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') {
898            e.removeAttribute('style'); }
899
900        // Go until there are no more child nodes
901        while ( cur !== null ) {
902            if ( cur.nodeType === 1 ) {
903                // Remove style attribute(s) :
904                if(cur.className !== "readability-styled") {
905                    cur.removeAttribute("style");
906                }
907                readability.cleanStyles( cur );
908            }
909            cur = cur.nextSibling;
910        }
911    },
912
913    /**
914     * Get the density of links as a percentage of the content
915     * This is the amount of text that is inside a link divided by the total text in the node.
916     *
917     * @param Element
918     * @return number (float)
919    **/
920    getLinkDensity: function (e) {
921        var links      = e.getElementsByTagName("a");
922        var textLength = readability.getInnerText(e).length;
923        var linkLength = 0;
924        for(var i=0, il=links.length; i<il;i+=1)
925        {
926            linkLength += readability.getInnerText(links[i]).length;
927        }
928
929        return linkLength / textLength;
930    },
931
932    /**
933     * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
934     *
935     * @author Dan Lacy
936     * @return string the base url
937    **/
938    findBaseUrl: function () {
939        var noUrlParams     = window.location.pathname.split("?")[0],
940            urlSlashes      = noUrlParams.split("/").reverse(),
941            cleanedSegments = [],
942            possibleType    = "";
943
944        for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {
945            var segment = urlSlashes[i];
946
947            // Split off and save anything that looks like a file type.
948            if (segment.indexOf(".") !== -1) {
949                possibleType = segment.split(".")[1];
950
951                /* If the type isn't alpha-only, it's probably not actually a file extension. */
952                if(!possibleType.match(/[^a-zA-Z]/)) {
953                    segment = segment.split(".")[0];
954                }
955            }
956
957            /**
958             * EW-CMS specific segment replacement. Ugly.
959             * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
960            **/
961            if(segment.indexOf(',00') !== -1) {
962                segment = segment.replace(',00', '');
963            }
964
965            // If our first or second segment has anything looking like a page number, remove it.
966            if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {
967                segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
968            }
969
970
971            var del = false;
972
973            /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
974            if (i < 2 && segment.match(/^\d{1,2}$/)) {
975                del = true;
976            }
977
978            /* If this is the first segment and it's just "index", remove it. */
979            if(i === 0 && segment.toLowerCase() === "index") {
980                del = true;
981            }
982
983
984            /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
985            if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {
986                del = true;
987            }
988
989            /* If it's not marked for deletion, push it to cleanedSegments. */
990            if (!del) {
991                cleanedSegments.push(segment);
992            }
993        }
994
995        // This is our final, cleaned, base article URL.
996        return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/");
997    },
998
999    /**
1000     * Look for any paging links that may occur within the document.
1001     *
1002     * @param body
1003     * @return object (array)
1004    **/
    findNextPageLink: function (elem) {
        /*
         * Scores every <a> under elem as a possible "next page" link and returns the
         * href (fragment and trailing slash removed) of the best one, or null when no
         * candidate reaches a score of 50. The chosen href is also recorded in
         * readability.parsedPages.
         */
        var possiblePages = {},
            allLinks = elem.getElementsByTagName('a'),
            articleBaseUrl = readability.findBaseUrl();

        /**
         * Loop through all links, looking for hints that they may be next-page links.
         * Things like having "page" in their textContent, className or id, or being a child
         * of a node with a page-y className or id.
         *
         * Also possible: levenshtein distance? longest common subsequence?
         *
         * Each distinct href accumulates a score in possiblePages; the best one is picked after the loop.
        **/
        for(var i = 0, il = allLinks.length; i < il; i+=1) {
            var link     = allLinks[i],
                linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');

            /* Skip empty links, links to the current page or base URL, and pages we've already seen. */
            if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) {
                continue;
            }

            /* If it's on a different domain, skip it. Splitting on runs of "/" puts the host at index 1. */
            if(window.location.host !== linkHref.split(/\/+/g)[1]) {
                continue;
            }

            var linkText = readability.getInnerText(link);

            /* If the linkText looks like it's not the next page (extraneous wording, or longer than 25 characters), skip it. */
            if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {
                continue;
            }

            /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
            var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
            if(!linkHrefLeftover.match(/\d/)) {
                continue;
            }

            /* First sighting of this href creates its entry; later sightings append their text to linkText. */
            if(!(linkHref in possiblePages)) {
                possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
            } else {
                possiblePages[linkHref].linkText += ' | ' + linkText;
            }

            var linkObj = possiblePages[linkHref];

            /**
             * If the URL does not start with articleBaseUrl, penalize this link. It could still be the link, but the odds are lower.
             * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
            **/
            if(linkHref.indexOf(articleBaseUrl) !== 0) {
                linkObj.score -= 25;
            }

            /* Score on the link's own text, class name and id. */
            var linkData = linkText + ' ' + link.className + ' ' + link.id;
            if(linkData.match(readability.regexps.nextLink)) {
                linkObj.score += 50;
            }
            if(linkData.match(/pag(e|ing|inat)/i)) {
                linkObj.score += 25;
            }
            if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text.
                /* If the accumulated link text already matched "next", last is probably fine. If it didn't, then it's bad. Penalize. */
                if(!linkObj.linkText.match(readability.regexps.nextLink)) {
                    linkObj.score -= 65;
                }
            }
            if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) {
                linkObj.score -= 50;
            }
            if(linkData.match(readability.regexps.prevLink)) {
                linkObj.score -= 200;
            }

            /* Walk up the ancestors: one bonus if any has a page-y class/id, one penalty if any has a purely negative class/id. */
            var parentNode = link.parentNode,
                positiveNodeMatch = false,
                negativeNodeMatch = false;
            while(parentNode) {
                var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
                if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
                    positiveNodeMatch = true;
                    linkObj.score += 25;
                }
                if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) {
                    /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */
                    if(!parentNodeClassAndId.match(readability.regexps.positive)) {
                        linkObj.score -= 25;
                        negativeNodeMatch = true;
                    }
                }

                parentNode = parentNode.parentNode;
            }

            /**
             * If the URL looks like it has paging in it, add to the score.
             * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
            **/
            if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {
                linkObj.score += 25;
            }

            /* If the URL matches the extraneous pattern, give a slight decrease. */
            if (linkHref.match(readability.regexps.extraneous)) {
                linkObj.score -= 15;
            }

            /**
             * Minor punishment to anything that doesn't match our current URL.
             * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
             *       Dan, can you show me a counterexample where this is necessary?
             * if (linkHref.indexOf(window.location.href) !== 0) {
             *    linkObj.score -= 1;
             * }
            **/

            /**
             * If the link text can be parsed as a number, give it a minor bonus, with a slight
             * bias towards lower numbered pages. This is so that pages that might not have 'next'
             * in their text can still get scored, and sorted properly by score.
            **/
            var linkTextAsNumber = parseInt(linkText, 10);
            if(linkTextAsNumber) {
                // Punish 1 since we're either already there, or it's probably before what we want anyways.
                if (linkTextAsNumber === 1) {
                    linkObj.score -= 10;
                }
                else {
                    // Page 2 earns +8, page 3 +7, and so on down to +0 for page 10 and above.
                    linkObj.score += Math.max(0, 10 - linkTextAsNumber);
                }
            }
        }

        /**
         * Loop through all of our possible pages from above and find our top candidate for the next page URL.
         * Require at least a score of 50, which is a relatively high confidence that this page is the next link.
         * On equal scores the candidate encountered first is kept.
        **/
        var topPage = null;
        for(var page in possiblePages) {
            if(possiblePages.hasOwnProperty(page)) {
                if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) {
                    topPage = possiblePages[page];
                }
            }
        }

        if(topPage) {
            var nextHref = topPage.href.replace(/\/$/,'');

            dbg('NEXT PAGE IS ' + nextHref);
            /* Remember the page so later calls skip links pointing at it. */
            readability.parsedPages[nextHref] = true;
            return nextHref;
        }
        else {
            return null;
        }
    },
1167
1168    createLinkDiv: function(link) {
1169        var divNode = document.createElement('div');
1170        var aNode = document.createElement('a');
1171        var tNode = document.createTextNode('View Next Page');
1172        divNode.setAttribute('style', 'text-align: center');
1173        aNode.setAttribute('href', link);
1174        aNode.appendChild(tNode);
1175        divNode.appendChild(aNode);
1176        return divNode;
1177    },
1178
1179    xhr: function () {
1180        if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) {
1181            return new XMLHttpRequest();
1182        }
1183        else {
1184            try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { }
1185            try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { }
1186            try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { }
1187        }
1188
1189        return false;
1190    },
1191
1192    successfulRequest: function (request) {
1193        return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText);
1194    },
1195
1196    ajax: function (url, options) {
1197        var request = readability.xhr();
1198
1199        function respondToReadyState(readyState) {
1200            if (request.readyState === 4) {
1201                if (readability.successfulRequest(request)) {
1202                    if (options.success) { options.success(request); }
1203                }
1204                else {
1205                    if (options.error) { options.error(request); }
1206                }
1207            }
1208        }
1209
1210        if (typeof options === 'undefined') { options = {}; }
1211
1212        request.onreadystatechange = respondToReadyState;
1213
1214        request.open('get', url, true);
1215        request.setRequestHeader('Accept', 'text/html');
1216
1217        try {
1218            request.send(options.postBody);
1219        }
1220        catch (e) {
1221            if (options.error) { options.error(); }
1222        }
1223
1224        return request;
1225    },
1226
1227    /**
1228     * Make an AJAX request for each page and append it to the document.
1229    **/
    /* Number of the article page currently being built. Starts at 1; appendNextPage increments it on every call. */
    curPageNum: 1,
1231
1232    appendNextPage: function (nextPageLink) {
1233        readability.curPageNum+=1;
1234
1235        var articlePage       = document.createElement("DIV");
1236        articlePage.id        = 'readability-page-' + readability.curPageNum;
1237        articlePage.className = 'page';
1238        articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">&sect;</p>';
1239
1240        document.getElementById("readability-content").appendChild(articlePage);
1241
1242        if(readability.curPageNum > readability.maxPages) {
1243            var linkDiv = readability.createLinkDiv(nextPageLink);
1244
1245            articlePage.appendChild(linkDiv);
1246            return;
1247        }
1248
1249        /**
1250         * Now that we've built the article page DOM element, get the page content
1251         * asynchronously and load the cleaned content into the div we created for it.
1252        **/
1253        (function(pageUrl, thisPage) {
1254            readability.ajax(pageUrl, {
1255                success: function(r) {
1256
1257                    /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */
1258                    var eTag = r.getResponseHeader('ETag');
1259                    if(eTag) {
1260                        if(eTag in readability.pageETags) {
1261                            dbg("Exact duplicate page found via ETag. Aborting.");
1262                            articlePage.style.display = 'none';
1263                            return;
1264                        } else {
1265                            readability.pageETags[eTag] = 1;
1266                        }
1267                    }
1268
1269                    // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
1270                    var page = document.createElement("DIV");
1271
1272                    /**
1273                     * Do some preprocessing to our HTML to make it ready for appending.
1274                     * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.
1275                     * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript.
1276                     * • Turn all double br's into p's - was handled by prepDocument in the original view.
1277                     *   Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages.
1278                    **/
1279                    var pageInnards = r.responseXML;
1280                    readability.removeScripts(pageInnards);
1281                    readability.replaceNoscriptsWithPs(pageInnards);
1282                    readability.replaceDoubleBrsWithPs(pageInnards);
1283                    readability.replaceFontsWithSpans(pageInnards);
1284                    page.appendChild(pageInnards);
1285
1286
1287                    /**
1288                     * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle.
1289                    **/
1290                    readability.flags = 0x1 | 0x2 | 0x4;
1291
1292                    var nextPageLink = readability.findNextPageLink(page),
1293                        content      =  readability.grabArticle(page);
1294
1295                    if(!content) {
1296                        dbg("No content found in page to append. Aborting.");
1297                        return;
1298                    }
1299
1300                    /**
1301                     * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
1302                     * Compare it against all of the the previous document's we've gotten. If the previous
1303                     * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
1304                    **/
1305                    var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
1306                    if(firstP && firstP.innerHTML.length > 100) {
1307                        for(var i=1; i <= readability.curPageNum; i+=1) {
1308                            var rPage = document.getElementById('readability-page-' + i);
1309                            if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
1310                                dbg('Duplicate of page ' + i + ' - skipping.');
1311                                articlePage.style.display = 'none';
1312                                readability.parsedPages[pageUrl] = true;
1313                                return;
1314                            }
1315                        }
1316                    }
1317
1318                    readability.removeScripts(content);
1319
1320                    readability.moveNodeInnards(content, thisPage);
1321
1322                    /**
1323                     * After the page has rendered, post process the content. This delay is necessary because,
1324                     * in webkit at least, offsetWidth is not set in time to determine image width. We have to
1325                     * wait a little bit for reflow to finish before we can fix floating images.
1326                    **/
1327                    window.setTimeout(
1328                        function() { readability.postProcessContent(thisPage); },
1329                        500
1330                    );
1331
1332                    if(nextPageLink) {
1333                        readability.appendNextPage(nextPageLink);
1334                    }
1335                }
1336            });
1337        }(nextPageLink, articlePage));
1338    },
1339
1340    /**
1341     * Get an elements class/id weight. Uses regular expressions to tell if this
1342     * element looks good or bad.
1343     *
1344     * @param Element
1345     * @return number (Integer)
1346    **/
1347    getClassWeight: function (e) {
1348        if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
1349            return 0;
1350        }
1351
1352        var weight = 0;
1353
1354        /* Look for a special classname */
1355        if (typeof(e.className) === 'string' && e.className !== '')
1356        {
1357            if(e.className.search(readability.regexps.negative) !== -1) {
1358                weight -= 25; }
1359
1360            if(e.className.search(readability.regexps.positive) !== -1) {
1361                weight += 25; }
1362        }
1363
1364        /* Look for a special ID */
1365        if (typeof(e.id) === 'string' && e.id !== '')
1366        {
1367            if(e.id.search(readability.regexps.negative) !== -1) {
1368                weight -= 25; }
1369
1370            if(e.id.search(readability.regexps.positive) !== -1) {
1371                weight += 25; }
1372        }
1373
1374        return weight;
1375    },
1376
1377    nodeIsVisible: function (node) {
1378        return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none';
1379    },
1380
1381    /**
1382     * Remove extraneous break tags from a node.
1383     *
1384     * @param Element
1385     * @return void
1386     **/
1387    killBreaks: function (e) {
1388        var allElements = e.getElementsByTagName('*');
1389        while (i < allElements.length) {
1390            readability.deleteExtraBreaks(allElements[i]);
1391            i++;
1392        }
1393    },
1394
1395    /**
1396     * Clean a node of all elements of type "tag".
1397     * (Unless it's a youtube/vimeo video. People love movies.)
1398     *
1399     * @param Element
1400     * @param string tag to clean
1401     * @return void
1402     **/
1403    clean: function (e, tag) {
1404        var targetList = e.getElementsByTagName( tag );
1405        var isEmbed    = (tag === 'object' || tag === 'embed');
1406
1407        for (var y=targetList.length-1; y >= 0; y-=1) {
1408            /* Allow youtube and vimeo videos through as people usually want to see those. */
1409            if(isEmbed) {
1410                var attributeValues = "";
1411                for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {
1412                    attributeValues += targetList[y].attributes[i].value + '|';
1413                }
1414
1415                /* First, check the elements attributes to see if any of them contain youtube or vimeo */
1416                if (attributeValues.search(readability.regexps.videos) !== -1) {
1417                    continue;
1418                }
1419
1420                /* Then check the elements inside this element for the same. */
1421                if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {
1422                    continue;
1423                }
1424
1425            }
1426
1427            targetList[y].parentNode.removeChild(targetList[y]);
1428        }
1429    },
1430
1431    /**
1432     * Clean an element of all tags of type "tag" if they look fishy.
1433     * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
1434     *
1435     * @return void
1436     **/
1437    cleanConditionally: function (e, tag) {
1438
1439        if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
1440            return;
1441        }
1442
1443        var tagsList      = e.getElementsByTagName(tag);
1444        var curTagsLength = tagsList.length;
1445
1446        /**
1447         * Gather counts for other typical elements embedded within.
1448         * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
1449         *
1450         * TODO: Consider taking into account original contentScore here.
1451        **/
1452        for (var i=curTagsLength-1; i >= 0; i-=1) {
1453            var weight = readability.getClassWeight(tagsList[i]);
1454            var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;
1455
1456            dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
1457
1458            if(weight+contentScore < 0)
1459            {
1460                tagsList[i].parentNode.removeChild(tagsList[i]);
1461            }
1462            else if ( readability.getCharCount(tagsList[i],',') < 10) {
1463                /**
1464                 * If there are not very many commas, and the number of
1465                 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
1466                **/
1467                var p      = tagsList[i].getElementsByTagName("p").length;
1468                var img    = tagsList[i].getElementsByTagName("img").length;
1469                var li     = tagsList[i].getElementsByTagName("li").length-100;
1470                var input  = tagsList[i].getElementsByTagName("input").length;
1471
1472                var embedCount = 0;
1473                var embeds     = tagsList[i].getElementsByTagName("embed");
1474                for(var ei=0,il=embeds.length; ei < il; ei+=1) {
1475                    if (embeds[ei].src.search(readability.regexps.videos) === -1) {
1476                      embedCount+=1;
1477                    }
1478                }
1479
1480                var linkDensity   = readability.getLinkDensity(tagsList[i]);
1481                var contentLength = readability.getInnerText(tagsList[i]).length;
1482                var toRemove      = false;
1483
1484                if ( img > p ) {
1485                    toRemove = true;
1486                } else if(li > p && tag !== "ul" && tag !== "ol") {
1487                    toRemove = true;
1488                } else if( input > Math.floor(p/3) ) {
1489                    toRemove = true;
1490                } else if(contentLength < 25 && (img === 0 || img > 2) ) {
1491                    toRemove = true;
1492                } else if(weight < 25 && linkDensity > 0.2) {
1493                    toRemove = true;
1494                } else if(weight >= 25 && linkDensity > 0.5) {
1495                    toRemove = true;
1496                } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) {
1497                    toRemove = true;
1498                }
1499
1500                if(toRemove) {
1501                    tagsList[i].parentNode.removeChild(tagsList[i]);
1502                }
1503            }
1504        }
1505    },
1506
1507    /**
1508     * Clean out spurious headers from an Element. Checks things like classnames and link density.
1509     *
1510     * @param Element
1511     * @return void
1512    **/
1513    cleanHeaders: function (e) {
1514        for (var headerIndex = 1; headerIndex < 3; headerIndex+=1) {
1515            var headers = e.getElementsByTagName('h' + headerIndex);
1516            for (var i=headers.length-1; i >=0; i-=1) {
1517                if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) {
1518                    headers[i].parentNode.removeChild(headers[i]);
1519                }
1520            }
1521        }
1522    },
1523
1524    flagIsActive: function(flag) {
1525        return (readability.flags & flag) > 0;
1526    },
1527
1528    addFlag: function(flag) {
1529        readability.flags = readability.flags | flag;
1530    },
1531
1532    removeFlag: function(flag) {
1533        readability.flags = readability.flags & ~flag;
1534    },
1535
1536    // Removes the children of |src| and appends them to |dest|.
1537    moveNodeInnards: function(src, dest) {
1538        try {
1539            while (src.firstChild) {
1540                dest.appendChild(src.removeChild(src.firstChild));
1541            }
1542        } catch (e) {}
1543    },
1544
1545    // Returns true if the node is a whitespace text node.
1546    isWhitespaceNode: function(node) {
1547        if (node.nodeType == Node.TEXT_NODE) {
1548            if (node.data.trim().length == 0) {
1549               return true;
1550            }
1551        }
1552        return false;
1553    },
1554
1555    // Returns true if the node is a <BR>.
1556    isBrNode: function(node) {
1557        return (node.tagName === 'BR');
1558    },
1559
1560
1561    // Returns the last <BR> node in a sequence of <BR> nodes that are only
1562    // separated by whitespace, or null if there are not at least two <BR> tags
1563    // in the sibling chain starting with |node|. Returns the second such <BR>
1564    // node if |restrictToTwo| is true.
1565    isMultipleBr: function(node, restrictToTwo) {
1566        var lastBr = null;
1567        if (!readability.isBrNode(node)) {
1568            return lastBr;
1569        }
1570        var curr = node.nextSibling;
1571        while (curr) {
1572            if (readability.isWhitespaceNode(curr) || readability.isBrNode(curr)) {
1573                lastBr = curr;
1574                curr = curr.nextSibling;
1575                if (restrictToTwo) {
1576                    if (readability.isBrNode(lastBr)) {
1577                        return lastBr;
1578                    }
1579                }
1580                continue;
1581            }
1582            break;
1583        }
1584        return lastBr;
1585    },
1586
1587    // Removes all <BR> nodes except one and whitespace in between in a series
1588    // of <BR> nodes.
1589    deleteExtraBreaks: function(node) {
1590        var lastBr = readability.isMultipleBr(node, false);
1591        var ret = false;
1592        while (lastBr && lastBr != node) {
1593            var toRemove = lastBr;
1594            lastBr = lastBr.previousSibling;
1595            toRemove.parentNode.removeChild(toRemove);
1596            ret = true;
1597        }
1598        return ret;
1599    },
1600
1601    // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a
1602    // <P> node, and makes all next siblings of that pair children of <P>, up
1603    // until the next pair of <BR> nodes is reached.
1604    replaceDoubleBrWithP: function(node) {
1605        // Check that we are starting with a BR.
1606        var second = readability.isMultipleBr(node, true);
1607        if (!second) {
1608            return;
1609        }
1610        // Make all next siblings of the second BR into children of a P.
1611        var p = document.createElement('p');
1612        var curr = second.nextSibling;
1613        while (curr) {
1614            if (readability.isMultipleBr(curr, true)) {
1615                break;
1616            }
1617            var next = curr.nextSibling;
1618            p.appendChild(curr.parentNode.removeChild(curr));
1619            curr = next;
1620        }
1621        var ret = curr;
1622
1623        // Remove all nodes between the first and second BR.
1624        curr = node.nextSibling;
1625        while (curr && curr != second) {
1626            var next = curr.nextSibling;
1627            curr.parentNode.removeChild(curr);
1628            curr = next;
1629        }
1630        // Remove the second BR.
1631        second.parentNode.removeChild(second);
1632        // Replace the first BR with the P.
1633        node.parentNode.replaceChild(p, node);
1634
1635        return ret;
1636    },
1637
1638    // Returns true if the NodeList contains a double <BR>.
1639    hasDoubleBr: function(nodeList) {
1640        for (var i = 0; i < nodeList.length; nodeList++) {
1641            if (readability.isMultipleBr(nodeList[i], true)) {
1642                return true;
1643            }
1644        }
1645        return false;
1646    },
1647
1648    // Replaces double <BR> tags with <P> tags.
1649    replaceDoubleBrsWithPs: function(node) {
1650        var allElements = node.getElementsByTagName('BR');
1651        var node = null;
1652        while (allElements && allElements.length > 0 &&
1653               readability.hasDoubleBr(allElements)) {
1654            for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
1655                var next = node;
1656                while (next = readability.replaceDoubleBrWithP(next));
1657            }
1658            allElements = document.body.getElementsByTagName('BR');
1659        }
1660    },
1661
1662
1663    // Replaces a BR and the whitespace that follows it with a P.
1664    replaceBrWithP: function(node) {
1665        if (!readability.isBrNode(node)) {
1666            return;
1667        }
1668        var p = document.createElement('p');
1669        var curr = node.nextSibling;
1670        while (curr && !isBrNode(curr)) {
1671            var next = curr.nextSibling;
1672            if (readability.isWhitespaceNode(curr)) {
1673                curr.parentNode.removeChild(curr);
1674            } else {
1675                p.appendChild(curr.parentNode.removeChild(curr));
1676            }
1677            curr = next;
1678        }
1679        node.parentNode.replaceChild(p, node);
1680        return curr;
1681    },
1682
1683    // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag
1684    // children of the <P>.
1685    replaceBrsWithPs: function(node) {
1686        var allElements = node.getElementsByTagName('BR');
1687        var node = null;
1688        while (allElements && allElements.length > 0) {
1689            for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
1690                var next = node;
1691                while (next = readability.replaceBrWithP(next));
1692            }
1693            allElements = document.body.getElementsByTagName('BR');
1694        }
1695    },
1696
1697    // Replaces any tag with any other tag.
1698    replaceTagsWithTags: function(node, srcTag, destTag) {
1699        var allElements = node.getElementsByTagName(srcTag);
1700        for (var i = 0; i < allElements.length; i++) {
1701            var dest = document.createElement(destTag);
1702            readability.moveNodeInnards(allElements[i], dest);
1703            allElements[i].parentNode.replaceChild(dest, allElements[i]);
1704        }
1705    },
1706
1707    // Replaces all <noscript> tags with <p> tags.
1708    replaceNoscriptsWithPs: function(node) {
1709        readability.replaceTagsWithTags(node, 'noscript', 'p');
1710    },
1711
1712    // Replaces all <font> tags with <span> tags.
1713    replaceFontsWithSpans: function(node) {
1714        readability.replaceTagsWithTags(node, 'font', 'span');
1715    },
1716
1717    // Returns a list of image URLs in the distilled article.
1718    getImages : function() {
1719        var images = document.getElementsByTagName('img');
1720        var result = new Array(images.length);
1721        dbg("Number of images: " + images.length);
1722        for(i = 0; i < images.length; i++) {
1723            result[i] = images[i].src;
1724            dbg("Image: " + result[i]);
1725        }
1726        return result;
1727    },
1728
1729    // Returns the distilled article HTML from the page(s).
1730    getDistilledArticleHTML : function() {
1731        return readability.distilledHTML;
1732    },
1733
1734    // Returns the next page of this article.
1735    getNextPageLink : function() {
1736        return readability.nextPageLink;
1737    }
1738};
1739