// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Local modifications to this file are described in the README.chromium file.

var dbg = (typeof console !== 'undefined') ?
    function(s) { console.log("Readability: " + s); } :
    function() {};

/*
 * Readability. An Arc90 Lab Experiment.
 * Website: http://lab.arc90.com/experiments/readability
 * Source:  http://code.google.com/p/arc90labs-readability
 *
 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
 *
 * Copyright (c) 2010 Arc90 Inc
 * Readability is licensed under the Apache License, Version 2.0.
**/
var readability = {
    readStyle:                "style-newspaper",
    readSize:                 "size-medium",
    readMargin:               "margin-wide",
    distilledHTML:            '',
    distilledArticleContent:  null,
    nextPageLink:             '',
    version:                  '1.7.1',
    iframeLoads:              0,
    convertLinksToFootnotes:  false,
    reversePageScroll:        false, /* If they hold shift and hit space, scroll up */
    frameHack:                false, /**
                                      * The frame hack is to work around a firefox bug where if you
                                      * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
                                      * So we fake a scrollbar in the wrapping div.
                                      **/
    biggestFrame:             false,
    flags:                    0x1 | 0x2 | 0x4, /* Start with all flags set. */

    /* constants */
    FLAG_STRIP_UNLIKELYS:     0x1,
    FLAG_WEIGHT_CLASSES:      0x2,
    FLAG_CLEAN_CONDITIONALLY: 0x4,

    maxPages:    30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
    parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
    pageETags:   {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */

    /**
     * All of the regular expressions in use within readability.
     * Defined up here so we don't instantiate them repeatedly in loops.
     **/
    regexps: {
        unlikelyCandidates:   /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
        okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
        positive:             /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
        negative:             /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
        extraneous:           /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
        divToPElements:       /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
        replaceBrs:           /(<br[^>]*>[ \n\r\t]*){2,}/gi,
        replaceFonts:         /<(\/?)font[^>]*>/gi,
        trim:                 /^\s+|\s+$/g,
        normalize:            /\s{2,}/g,
        killBreaks:           /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
        videos:               /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
        skipFootnoteLink:     /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
        nextLink:             /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
        prevLink:             /(prev|earl|old|new|<|«)/i
    },

    /**
     * Runs readability.
     *
     * Workflow:
     *  1. Prep the document by removing script tags, css, etc.
     *  2. Build readability's DOM tree.
     *  3. Grab the article content from the current dom tree.
     *  4. Replace the current DOM tree with the new one.
     *  5. Read peacefully.
     *
     * @return void
     **/
    init: function() {
        /* Before we do anything, remove all scripts that are not readability.
*/ window.onload = window.onunload = function() {}; readability.removeScripts(document); /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */ readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; /* Pull out any possible next page link first */ readability.nextPageLink = readability.findNextPageLink(document.body); /* We handle processing of nextPage from C++ set nextPageLink to null */ var nextPageLink = null; readability.prepDocument(); /* Build readability's DOM tree */ var overlay = document.createElement("DIV"); var innerDiv = document.createElement("DIV"); var articleTools = readability.getArticleTools(); var articleTitleText = readability.getArticleTitle(); var articleContent = readability.grabArticle(); if(!articleContent) { articleContent = document.createElement("DIV"); articleContent.id = "readability-content"; articleContent.innerHTML = [ "
<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please let us know by submitting an issue.</p>",
            (readability.frameHack ? "<p>It appears this page uses frames. Unfortunately, browser security properties often cause Readability to fail on pages that include frames.</p>" : ""),
            "<p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>
" ].join(''); nextPageLink = null; } overlay.id = "readOverlay"; innerDiv.id = "readInner"; /* Apply user-selected styling */ document.body.className = readability.readStyle; document.dir = readability.getSuggestedDirection(articleTitleText); if (readability.readStyle === "style-athelas" || readability.readStyle === "style-apertura"){ overlay.className = readability.readStyle + " rdbTypekit"; } else { overlay.className = readability.readStyle; } innerDiv.className = readability.readMargin + " " + readability.readSize; if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) { readability.convertLinksToFootnotes = true; } readability.distilledHTML = articleContent.innerHTML; if(readability.frameHack) { var readOverlay = document.getElementById('readOverlay'); readOverlay.style.height = '100%'; readOverlay.style.overflow = 'auto'; } /** * If someone tries to use Readability on a site's root page, give them a warning about usage. **/ if((window.location.protocol + "//" + window.location.host + "/") === window.location.href) { articleContent.style.display = "none"; var rootWarning = document.createElement('p'); rootWarning.id = "readability-warning"; rootWarning.innerHTML = "Readability was intended for use on individual articles and not home pages. " + "If you'd like to try rendering this page anyway, click here to continue."; innerDiv.insertBefore( rootWarning, articleContent ); } readability.postProcessContent(articleContent); window.scrollTo(0, 0); if (nextPageLink) { /** * Append any additional pages after a small timeout so that people * can start reading without having to wait for this to finish processing. **/ window.setTimeout(function() { readability.appendNextPage(nextPageLink); }, 500); } /** Smooth scrolling **/ document.onkeydown = function(e) { var code = (window.event) ? event.keyCode : e.keyCode; if (code === 16) { readability.reversePageScroll = true; return; } if (code === 32) { readability.curScrollStep = 0; var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight); if(readability.reversePageScroll) { readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); } else { readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); } return false; } }; document.onkeyup = function(e) { var code = (window.event) ? event.keyCode : e.keyCode; if (code === 16) { readability.reversePageScroll = false; return; } }; }, /** * Run any post-process modifications to article content as necessary. * * @param Element * @return void **/ postProcessContent: function(articleContent) { if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) { readability.addFootnotes(articleContent); } readability.fixImageFloats(articleContent); }, /** * Some content ends up looking ugly if the image is too large to be floated. * If the image is wider than a threshold (currently 55%), no longer float it, * center it instead. 
* * @param Element * @return void **/ fixImageFloats: function (articleContent) { var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55, images = articleContent.getElementsByTagName('img'); for(var i=0, il = images.length; i < il; i+=1) { var image = images[i]; if(image.offsetWidth > imageWidthThreshold) { image.className += " blockImage"; } } }, /** * Get the article tools Element that has buttons like reload, print. * * @return void **/ getArticleTools: function () { var articleTools = document.createElement("DIV"); articleTools.id = "readTools"; articleTools.innerHTML = "Reload Original Page" + "Print Page" + "Email Page"; return articleTools; }, /** * retuns the suggested direction of the string * * @return "rtl" || "ltr" **/ getSuggestedDirection: function(text) { function sanitizeText() { return text.replace(/@\w+/, ""); } function countMatches(match) { var matches = text.match(new RegExp(match, "g")); return matches !== null ? matches.length : 0; } function isRTL() { var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); // if 20% of chars are Hebrew or Arbic then direction is rtl return (count_heb + count_arb) * 100 / text.length > 20; } text = sanitizeText(text); return isRTL() ? "rtl" : "ltr"; }, /** * Get the article title as an H1. * * @return void **/ getArticleTitle: function () { var curTitle = "", origTitle = ""; try { curTitle = origTitle = document.title; if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); } } catch(e) {} if(curTitle.match(/ [\|\-] /)) { curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); if(curTitle.split(' ').length < 3) { curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); } } else if(curTitle.indexOf(': ') !== -1) { curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); if(curTitle.split(' ').length < 3) { curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); } } else if(curTitle.length > 150 || curTitle.length < 15) { var hOnes = document.getElementsByTagName('h1'); if(hOnes.length === 1) { curTitle = readability.getInnerText(hOnes[0]); } } curTitle = curTitle.replace( readability.regexps.trim, "" ); if(curTitle.split(' ').length <= 4) { curTitle = origTitle; } return curTitle; }, /** * Prepare the HTML document for readability to scrape it. * This includes things like stripping javascript, CSS, and handling terrible markup. * * @return void **/ prepDocument: function () { /** * In some cases a body element can't be found (if the HTML is totally hosed for example) * so we create a new body node and append it to the document. */ if(document.body === null) { var body = document.createElement("body"); try { document.body = body; } catch(e) { document.documentElement.appendChild(body); dbg(e); } } document.body.id = "readabilityBody"; var frames = document.getElementsByTagName('frame'); if(frames.length > 0) { var bestFrame = null; var bestFrameSize = 0; /* The frame to try to run readability upon. Must be on same domain. */ var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. 
*/ for(var frameIndex = 0; frameIndex < frames.length; frameIndex+=1) { var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight; var canAccessFrame = false; try { var frameBody = frames[frameIndex].contentWindow.document.body; canAccessFrame = true; } catch(eFrames) { dbg(eFrames); } if(frameSize > biggestFrameSize) { biggestFrameSize = frameSize; readability.biggestFrame = frames[frameIndex]; } if(canAccessFrame && frameSize > bestFrameSize) { readability.frameHack = true; bestFrame = frames[frameIndex]; bestFrameSize = frameSize; } } if(bestFrame) { var newBody = document.createElement('body'); readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody); newBody.style.overflow = 'scroll'; document.body = newBody; var frameset = document.getElementsByTagName('frameset')[0]; if(frameset) { frameset.parentNode.removeChild(frameset); } } } /* Remove all stylesheets */ for (var k=0;k < document.styleSheets.length; k+=1) { if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) { document.styleSheets[k].disabled = true; } } /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */ var styleTags = document.getElementsByTagName("style"); for (var st=0;st < styleTags.length; st+=1) { styleTags[st].textContent = ""; } /* Turn all double br's into p's */ /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ readability.replaceDoubleBrsWithPs(document.body); readability.replaceFontsWithSpans(document.body); }, /** * Prepare the article node for display. Clean out any inline styles, * iframes, forms, strip extraneous
<p>
tags, etc. * * @param Element * @return void **/ prepArticle: function (articleContent) { readability.cleanStyles(articleContent); readability.killBreaks(articleContent); /* Clean out junk from the article content */ readability.cleanConditionally(articleContent, "form"); readability.clean(articleContent, "object"); readability.clean(articleContent, "h1"); /** * If there is only one h2, they are probably using it * as a header and not a subheader, so remove it since we already have a header. ***/ if(articleContent.getElementsByTagName('h2').length === 1) { readability.clean(articleContent, "h2"); } readability.clean(articleContent, "iframe"); readability.cleanHeaders(articleContent); /* Do these last as the previous stuff may have removed junk that will affect these */ readability.cleanConditionally(articleContent, "table"); readability.cleanConditionally(articleContent, "ul"); readability.cleanConditionally(articleContent, "div"); /* Remove extra paragraphs */ var articleParagraphs = articleContent.getElementsByTagName('p'); for(var i = articleParagraphs.length-1; i >= 0; i-=1) { var imgCount = articleParagraphs[i].getElementsByTagName('img').length; var embedCount = articleParagraphs[i].getElementsByTagName('embed').length; var objectCount = articleParagraphs[i].getElementsByTagName('object').length; if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') { articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); } } try { readability.replaceBrsWithPs(articleContent); } catch (e) { dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e); } }, /** * Initialize a node with the readability object. Also checks the * className/id for special names to add to its score. * * @param Element * @return void **/ initializeNode: function (node) { node.readability = {"contentScore": 0}; switch(node.tagName) { case 'DIV': node.readability.contentScore += 5; break; case 'PRE': case 'TD': case 'BLOCKQUOTE': node.readability.contentScore += 3; break; case 'ADDRESS': case 'OL': case 'UL': case 'DL': case 'DD': case 'DT': case 'LI': case 'FORM': node.readability.contentScore -= 3; break; case 'H1': case 'H2': case 'H3': case 'H4': case 'H5': case 'H6': case 'TH': node.readability.contentScore -= 5; break; } node.readability.contentScore += readability.getClassWeight(node); }, /*** * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. * * @param page a document to run upon. Needs to be a full document, complete with body. * @return Element **/ grabArticle: function (pageToClone) { var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS), isPaging = (page !== null) ? true: false; var page = null; // Never work on the actual page. if (isPaging) { page = document.body.cloneNode(true); } else { page = pageToClone.cloneNode(true); } var allElements = page.getElementsByTagName('*'); /** * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) * * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 * TODO: Shouldn't this be a reverse traversal? 
**/ var node = null; var nodesToScore = []; for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { /* Remove unlikely candidates */ if (stripUnlikelyCandidates) { var unlikelyMatchString = node.className + node.id; if ( ( unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 && unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 && node.tagName !== "BODY" ) ) { dbg("Removing unlikely candidate - " + unlikelyMatchString); node.parentNode.removeChild(node); nodeIndex-=1; continue; } } if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") { nodesToScore[nodesToScore.length] = node; } /* Turn all divs that don't have children block level elements into p's */ if (node.tagName === "DIV") { if (node.innerHTML.search(readability.regexps.divToPElements) === -1) { var newNode = document.createElement('p'); try { readability.moveNodeInnards(node, newNode); node.parentNode.replaceChild(newNode, node); nodeIndex-=1; nodesToScore[nodesToScore.length] = node; } catch(e) { dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e); } } else { /* EXPERIMENTAL */ for(var i = 0, il = node.childNodes.length; i < il; i+=1) { var childNode = node.childNodes[i]; if(childNode.nodeType === 3) { // Node.TEXT_NODE var p = document.createElement('p'); var t = document.createTextNode(childNode.nodeValue); p.appendChild(t); p.style.display = 'inline'; p.className = 'readability-styled'; childNode.parentNode.replaceChild(p, childNode); } } } } } /** * Loop through all paragraphs, and assign a score to them based on how content-y they look. * Then add their score to their parent node. * * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. **/ var candidates = []; for (var pt=0; pt < nodesToScore.length; pt+=1) { var parentNode = nodesToScore[pt].parentNode; var grandParentNode = parentNode ? parentNode.parentNode : null; var innerText = readability.getInnerText(nodesToScore[pt]); if(!parentNode || typeof(parentNode.tagName) === 'undefined') { continue; } /* If this paragraph is less than 25 characters, don't even count it. */ if(innerText.length < 25) { continue; } /* Initialize readability data for the parent. */ if(typeof parentNode.readability === 'undefined') { readability.initializeNode(parentNode); candidates.push(parentNode); } /* Initialize readability data for the grandparent. */ if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') { readability.initializeNode(grandParentNode); candidates.push(grandParentNode); } var contentScore = 0; /* Add a point for the paragraph itself as a base. */ contentScore+=1; /* Add points for any commas within this paragraph */ contentScore += innerText.split(',').length; /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ contentScore += Math.min(Math.floor(innerText.length / 100), 3); /* Add the score to the parent. The grandparent gets half. */ parentNode.readability.contentScore += contentScore; if(grandParentNode) { grandParentNode.readability.contentScore += contentScore/2; } } /** * After we've calculated scores, loop through all of the possible candidate nodes we found * and find the one with the highest score. **/ var topCandidate = null; for(var c=0, cl=candidates.length; c < cl; c+=1) { /** * Scale the final candidates score based on link density. 
Good content should have a * relatively small link density (5% or less) and be mostly unaffected by this operation. **/ candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c])); dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore); if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) { topCandidate = candidates[c]; } } /** * If we still have no top candidate, just use the body as a last resort. * We also have to copy the body node so it is something we can modify. **/ if (topCandidate === null || topCandidate.tagName === "BODY") { topCandidate = document.createElement("DIV"); readability.replaceNodeInnards(page, topCandidate); page.appendChild(topCandidate); readability.initializeNode(topCandidate); } /** * Now that we have the top candidate, look through its siblings for content that might also be related. * Things like preambles, content split by ads that we removed, etc. **/ var articleContent = document.createElement("DIV"); if (isPaging) { articleContent.id = "readability-content"; } var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); var siblingNodes = topCandidate.parentNode.childNodes; for(var s=0, sl=siblingNodes.length; s < sl; s+=1) { var siblingNode = siblingNodes[s]; var append = false; /** * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList. * Example of error visible here: http://www.esquire.com/features/honesty0707 **/ if(!siblingNode) { continue; } dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown')); if(siblingNode === topCandidate) { append = true; } var contentBonus = 0; /* Give a bonus if sibling nodes and top candidates have the example same classname */ if(siblingNode.className === topCandidate.className && topCandidate.className !== "") { contentBonus += topCandidate.readability.contentScore * 0.2; } if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold) { append = true; } if(siblingNode.nodeName === "P") { var linkDensity = readability.getLinkDensity(siblingNode); var nodeContent = readability.getInnerText(siblingNode); var nodeLength = nodeContent.length; if(nodeLength > 80 && linkDensity < 0.25) { append = true; } else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) { append = true; } } if(append) { dbg("Appending node: " + siblingNode); var nodeToAppend = null; if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. 
*/ dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); nodeToAppend = document.createElement("DIV"); try { nodeToAppend.id = siblingNode.id; readability.moveNodeInnards(siblingNode, nodeToAppend); } catch(er) { dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); nodeToAppend = siblingNode; s-=1; sl-=1; } } else { nodeToAppend = siblingNode; s-=1; sl-=1; } /* To ensure a node does not interfere with readability styles, remove its classnames */ nodeToAppend.className = ""; /* Append sibling and subtract from our list because it removes the node when you append to another node */ articleContent.appendChild(nodeToAppend); } } /** * So we have all of the content that we need. Now we clean it up for presentation. **/ readability.distilledArticleContent = articleContent.cloneNode(true); //readability.prepArticle(articleContent); if (readability.curPageNum === 1) { var newNode = document.createElement('div'); newNode.id = "readability-page-1"; newNode.setAttribute("class", "page"); readability.moveNodeInnards(articleContent, newNode); articleContent.appendChild(newNode); } /** * Now that we've gone through the full algorithm, check to see if we got any meaningful content. * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher * likelihood of finding the content, and the sieve approach gives us a higher likelihood of * finding the -right- content. **/ if(readability.getInnerText(articleContent, false).length < 250) { if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); return readability.grabArticle(document.body); } else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); return readability.grabArticle(document.body); } else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); return readability.grabArticle(document.body); } else { return null; } } return articleContent; }, /** * Removes script tags from the document. * * @param Element **/ removeScripts: function (doc) { var scripts = doc.getElementsByTagName('script'); for(var i = scripts.length-1; i >= 0; i-=1) { if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) { scripts[i].nodeValue=""; scripts[i].removeAttribute('src'); if (scripts[i].parentNode) { scripts[i].parentNode.removeChild(scripts[i]); } } } }, /** * Get the inner text of a node - cross browser compatibly. * This also strips out any excess whitespace to be found. * * @param Element * @return string **/ getInnerText: function (e, normalizeSpaces) { var textContent = ""; if(typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") { return ""; } normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; if (navigator.appName === "Microsoft Internet Explorer") { textContent = e.innerText.replace( readability.regexps.trim, "" ); } else { textContent = e.textContent.replace( readability.regexps.trim, "" ); } if(normalizeSpaces) { return textContent.replace( readability.regexps.normalize, " "); } else { return textContent; } }, /** * Get the number of times a string s appears in the node e. * * @param Element * @param string - what to split on. 
Default is ","
     * @return number (integer)
     **/
    getCharCount: function (e, s) {
        s = s || ",";
        return readability.getInnerText(e).split(s).length - 1;
    },

    /**
     * Remove the style attribute on every e and under.
     * TODO: Test if getElementsByTagName(*) is faster.
     *
     * @param Element
     * @return void
     **/
    cleanStyles: function (e) {
        e = e || document;
        var cur = e.firstChild;

        if(!e) {
            return;
        }

        // Remove any root styles, if we're able.
        if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') {
            e.removeAttribute('style');
        }

        // Go until there are no more child nodes
        while ( cur !== null ) {
            if ( cur.nodeType === 1 ) {
                // Remove style attribute(s) :
                if(cur.className !== "readability-styled") {
                    cur.removeAttribute("style");
                }
                readability.cleanStyles( cur );
            }
            cur = cur.nextSibling;
        }
    },

    /**
     * Get the density of links as a percentage of the content.
     * This is the amount of text that is inside a link divided by the total text in the node.
     *
     * @param Element
     * @return number (float)
     **/
    getLinkDensity: function (e) {
        var links = e.getElementsByTagName("a");
        var textLength = readability.getInnerText(e).length;
        var linkLength = 0;
        for(var i=0, il=links.length; i < il; i+=1) {
            linkLength += readability.getInnerText(links[i]).length;
        }
        return linkLength / textLength;
    },

    /**
     * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
     **/
    findBaseUrl: function () {
        var noUrlParams     = window.location.pathname.split("?")[0],
            urlSlashes      = noUrlParams.split("/").reverse(),
            cleanedSegments = [],
            possibleType    = "";

        for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {
            var segment = urlSlashes[i];

            // Split off and save anything that looks like a file type.
            if (segment.indexOf(".") !== -1) {
                possibleType = segment.split(".")[1];

                /* If the type isn't alpha-only, it's probably not actually a file extension. */
                if(!possibleType.match(/[^a-zA-Z]/)) {
                    segment = segment.split(".")[0];
                }
            }

            /**
             * EW-CMS specific segment replacement. Ugly.
             * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
             **/
            if(segment.indexOf(',00') !== -1) {
                segment = segment.replace(',00', '');
            }

            // If our first or second segment has anything looking like a page number, remove it.
            if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {
                segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
            }

            var del = false;

            /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
            if (i < 2 && segment.match(/^\d{1,2}$/)) {
                del = true;
            }

            /* If this is the first segment and it's just "index", remove it. */
            if(i === 0 && segment.toLowerCase() === "index") {
                del = true;
            }

            /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
            if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {
                del = true;
            }

            /* If it's not marked for deletion, push it to cleanedSegments. */
            if (!del) {
                cleanedSegments.push(segment);
            }
        }

        // This is our final, cleaned, base article URL.
        return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/");
    },

    /**
     * Look for any paging links that may occur within the document.
     *
     * @param body
     * @return the URL of the most likely next page, or null
     **/
    findNextPageLink: function (elem) {
        var possiblePages = {},
            allLinks = elem.getElementsByTagName('a'),
            articleBaseUrl = readability.findBaseUrl();

        /**
         * Loop through all links, looking for hints that they may be next-page links.
         * Things like having "page" in their textContent, className or id, or being a child
         * of a node with a page-y className or id. After we do that, assign each page a score.
         **/
        for(var i = 0, il = allLinks.length; i < il; i+=1) {
            var link = allLinks[i],
                linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');

            /* If we've already seen this page, ignore it. */
            if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) {
                continue;
            }

            /* If it's on a different domain, skip it. */
            if(window.location.host !== linkHref.split(/\/+/g)[1]) {
                continue;
            }

            var linkText = readability.getInnerText(link);

            /* If the linkText looks like it's not the next page, skip it. */
            if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {
                continue;
            }

            /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
            var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
            if(!linkHrefLeftover.match(/\d/)) {
                continue;
            }

            if(!(linkHref in possiblePages)) {
                possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
            } else {
                possiblePages[linkHref].linkText += ' | ' + linkText;
            }

            var linkObj = possiblePages[linkHref];

            /**
             * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
             * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
             **/
            if(linkHref.indexOf(articleBaseUrl) !== 0) {
                linkObj.score -= 25;
            }

            var linkData = linkText + ' ' + link.className + ' ' + link.id;
            if(linkData.match(readability.regexps.nextLink)) {
                linkObj.score += 50;
            }
            if(linkData.match(/pag(e|ing|inat)/i)) {
                linkObj.score += 25;
            }
            if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,
                /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
                if(!linkObj.linkText.match(readability.regexps.nextLink)) {
                    linkObj.score -= 65;
                }
            }
            if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) {
                linkObj.score -= 50;
            }
            if(linkData.match(readability.regexps.prevLink)) {
                linkObj.score -= 200;
            }

            /* If a parentNode contains page or paging or paginat */
            var parentNode = link.parentNode,
                positiveNodeMatch = false,
                negativeNodeMatch = false;
            while(parentNode) {
                var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
                if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
                    positiveNodeMatch = true;
                    linkObj.score += 25;
                }
                if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) {
                    /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */
                    if(!parentNodeClassAndId.match(readability.regexps.positive)) {
                        linkObj.score -= 25;
                        negativeNodeMatch = true;
                    }
                }
                parentNode = parentNode.parentNode;
            }

            /**
             * If the URL looks like it has paging in it, add to the score.
* Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 **/ if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) { linkObj.score += 25; } /* If the URL contains negative values, give a slight decrease. */ if (linkHref.match(readability.regexps.extraneous)) { linkObj.score -= 15; } /** * Minor punishment to anything that doesn't match our current URL. * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points. * Dan, can you show me a counterexample where this is necessary? * if (linkHref.indexOf(window.location.href) !== 0) { * linkObj.score -= 1; * } **/ /** * If the link text can be parsed as a number, give it a minor bonus, with a slight * bias towards lower numbered pages. This is so that pages that might not have 'next' * in their text can still get scored, and sorted properly by score. **/ var linkTextAsNumber = parseInt(linkText, 10); if(linkTextAsNumber) { // Punish 1 since we're either already there, or it's probably before what we want anyways. if (linkTextAsNumber === 1) { linkObj.score -= 10; } else { // Todo: Describe this better linkObj.score += Math.max(0, 10 - linkTextAsNumber); } } } /** * Loop thrugh all of our possible pages from above and find our top candidate for the next page URL. * Require at least a score of 50, which is a relatively high confidence that this page is the next link. **/ var topPage = null; for(var page in possiblePages) { if(possiblePages.hasOwnProperty(page)) { if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) { topPage = possiblePages[page]; } } } if(topPage) { var nextHref = topPage.href.replace(/\/$/,''); dbg('NEXT PAGE IS ' + nextHref); readability.parsedPages[nextHref] = true; return nextHref; } else { return null; } }, createLinkDiv: function(link) { var divNode = document.createElement('div'); var aNode = document.createElement('a'); var tNode = document.createTextNode('View Next Page'); divNode.setAttribute('style', 'text-align: center'); aNode.setAttribute('href', link); aNode.appendChild(tNode); divNode.appendChild(aNode); return divNode; }, xhr: function () { if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) { return new XMLHttpRequest(); } else { try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { } try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { } try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { } } return false; }, successfulRequest: function (request) { return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText); }, ajax: function (url, options) { var request = readability.xhr(); function respondToReadyState(readyState) { if (request.readyState === 4) { if (readability.successfulRequest(request)) { if (options.success) { options.success(request); } } else { if (options.error) { options.error(request); } } } } if (typeof options === 'undefined') { options = {}; } request.onreadystatechange = respondToReadyState; request.open('get', url, true); request.setRequestHeader('Accept', 'text/html'); try { request.send(options.postBody); } catch (e) { if (options.error) { options.error(); } } return request; }, /** * Make an AJAX request for each page and append it to the document. 
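     * Each fetched page is scrubbed (scripts removed; noscript, font and double-br markup
     * rewritten), run through grabArticle, and compared against stored ETags and the first
     * paragraph of previously appended pages so duplicates are skipped. If the new page has
     * its own next-page link, appendNextPage recurses until maxPages is reached.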
**/ curPageNum: 1, appendNextPage: function (nextPageLink) { readability.curPageNum+=1; var articlePage = document.createElement("DIV"); articlePage.id = 'readability-page-' + readability.curPageNum; articlePage.className = 'page'; articlePage.innerHTML = '
<p class="page-separator" title="Page ' + readability.curPageNum + '">&sect;</p>
'; document.getElementById("readability-content").appendChild(articlePage); if(readability.curPageNum > readability.maxPages) { var linkDiv = readability.createLinkDiv(nextPageLink); articlePage.appendChild(linkDiv); return; } /** * Now that we've built the article page DOM element, get the page content * asynchronously and load the cleaned content into the div we created for it. **/ (function(pageUrl, thisPage) { readability.ajax(pageUrl, { success: function(r) { /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ var eTag = r.getResponseHeader('ETag'); if(eTag) { if(eTag in readability.pageETags) { dbg("Exact duplicate page found via ETag. Aborting."); articlePage.style.display = 'none'; return; } else { readability.pageETags[eTag] = 1; } } // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. var page = document.createElement("DIV"); /** * Do some preprocessing to our HTML to make it ready for appending. * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript. * • Turn all double br's into p's - was handled by prepDocument in the original view. * Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages. **/ var pageInnards = r.responseXML; readability.removeScripts(pageInnards); readability.replaceNoscriptsWithPs(pageInnards); readability.replaceDoubleBrsWithPs(pageInnards); readability.replaceFontsWithSpans(pageInnards); page.appendChild(pageInnards); /** * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle. **/ readability.flags = 0x1 | 0x2 | 0x4; var nextPageLink = readability.findNextPageLink(page), content = readability.grabArticle(page); if(!content) { dbg("No content found in page to append. Aborting."); return; } /** * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page. * Compare it against all of the the previous document's we've gotten. If the previous * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate. **/ var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null; if(firstP && firstP.innerHTML.length > 100) { for(var i=1; i <= readability.curPageNum; i+=1) { var rPage = document.getElementById('readability-page-' + i); if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) { dbg('Duplicate of page ' + i + ' - skipping.'); articlePage.style.display = 'none'; readability.parsedPages[pageUrl] = true; return; } } } readability.removeScripts(content); readability.moveNodeInnards(content, thisPage); /** * After the page has rendered, post process the content. This delay is necessary because, * in webkit at least, offsetWidth is not set in time to determine image width. We have to * wait a little bit for reflow to finish before we can fix floating images. **/ window.setTimeout( function() { readability.postProcessContent(thisPage); }, 500 ); if(nextPageLink) { readability.appendNextPage(nextPageLink); } } }); }(nextPageLink, articlePage)); }, /** * Get an elements class/id weight. Uses regular expressions to tell if this * element looks good or bad. 
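     * For example, a className containing "article" or "content" matches the positive regexp
     * and adds 25 points, while one containing "comment" or "sidebar" matches the negative
     * regexp and subtracts 25. Returns 0 when FLAG_WEIGHT_CLASSES is not set.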
* * @param Element * @return number (Integer) **/ getClassWeight: function (e) { if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { return 0; } var weight = 0; /* Look for a special classname */ if (typeof(e.className) === 'string' && e.className !== '') { if(e.className.search(readability.regexps.negative) !== -1) { weight -= 25; } if(e.className.search(readability.regexps.positive) !== -1) { weight += 25; } } /* Look for a special ID */ if (typeof(e.id) === 'string' && e.id !== '') { if(e.id.search(readability.regexps.negative) !== -1) { weight -= 25; } if(e.id.search(readability.regexps.positive) !== -1) { weight += 25; } } return weight; }, nodeIsVisible: function (node) { return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none'; }, /** * Remove extraneous break tags from a node. * * @param Element * @return void **/ killBreaks: function (e) { var allElements = e.getElementsByTagName('*'); while (i < allElements.length) { readability.deleteExtraBreaks(allElements[i]); i++; } }, /** * Clean a node of all elements of type "tag". * (Unless it's a youtube/vimeo video. People love movies.) * * @param Element * @param string tag to clean * @return void **/ clean: function (e, tag) { var targetList = e.getElementsByTagName( tag ); var isEmbed = (tag === 'object' || tag === 'embed'); for (var y=targetList.length-1; y >= 0; y-=1) { /* Allow youtube and vimeo videos through as people usually want to see those. */ if(isEmbed) { var attributeValues = ""; for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { attributeValues += targetList[y].attributes[i].value + '|'; } /* First, check the elements attributes to see if any of them contain youtube or vimeo */ if (attributeValues.search(readability.regexps.videos) !== -1) { continue; } /* Then check the elements inside this element for the same. */ if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) { continue; } } targetList[y].parentNode.removeChild(targetList[y]); } }, /** * Clean an element of all tags of type "tag" if they look fishy. * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. * * @return void **/ cleanConditionally: function (e, tag) { if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { return; } var tagsList = e.getElementsByTagName(tag); var curTagsLength = tagsList.length; /** * Gather counts for other typical elements embedded within. * Traverse backwards so we can remove nodes at the same time without effecting the traversal. * * TODO: Consider taking into account original contentScore here. **/ for (var i=curTagsLength-1; i >= 0; i-=1) { var weight = readability.getClassWeight(tagsList[i]); var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : '')); if(weight+contentScore < 0) { tagsList[i].parentNode.removeChild(tagsList[i]); } else if ( readability.getCharCount(tagsList[i],',') < 10) { /** * If there are not very many commas, and the number of * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. 
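         * Concretely: more images than paragraphs, far more list items than paragraphs
         * (outside of actual lists), more inputs than a third of the paragraph count, very
         * short content with zero or more than two images, a link density that is high for
         * the element's class weight, or non-video embeds with little surrounding text.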
**/ var p = tagsList[i].getElementsByTagName("p").length; var img = tagsList[i].getElementsByTagName("img").length; var li = tagsList[i].getElementsByTagName("li").length-100; var input = tagsList[i].getElementsByTagName("input").length; var embedCount = 0; var embeds = tagsList[i].getElementsByTagName("embed"); for(var ei=0,il=embeds.length; ei < il; ei+=1) { if (embeds[ei].src.search(readability.regexps.videos) === -1) { embedCount+=1; } } var linkDensity = readability.getLinkDensity(tagsList[i]); var contentLength = readability.getInnerText(tagsList[i]).length; var toRemove = false; if ( img > p ) { toRemove = true; } else if(li > p && tag !== "ul" && tag !== "ol") { toRemove = true; } else if( input > Math.floor(p/3) ) { toRemove = true; } else if(contentLength < 25 && (img === 0 || img > 2) ) { toRemove = true; } else if(weight < 25 && linkDensity > 0.2) { toRemove = true; } else if(weight >= 25 && linkDensity > 0.5) { toRemove = true; } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) { toRemove = true; } if(toRemove) { tagsList[i].parentNode.removeChild(tagsList[i]); } } } }, /** * Clean out spurious headers from an Element. Checks things like classnames and link density. * * @param Element * @return void **/ cleanHeaders: function (e) { for (var headerIndex = 1; headerIndex < 3; headerIndex+=1) { var headers = e.getElementsByTagName('h' + headerIndex); for (var i=headers.length-1; i >=0; i-=1) { if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) { headers[i].parentNode.removeChild(headers[i]); } } } }, flagIsActive: function(flag) { return (readability.flags & flag) > 0; }, addFlag: function(flag) { readability.flags = readability.flags | flag; }, removeFlag: function(flag) { readability.flags = readability.flags & ~flag; }, // Removes the children of |src| and appends them to |dest|. moveNodeInnards: function(src, dest) { try { while (src.firstChild) { dest.appendChild(src.removeChild(src.firstChild)); } } catch (e) {} }, // Returns true if the node is a whitespace text node. isWhitespaceNode: function(node) { if (node.nodeType == Node.TEXT_NODE) { if (node.data.trim().length == 0) { return true; } } return false; }, // Returns true if the node is a
    // <br>.
    isBrNode: function(node) {
        return (node.tagName === 'BR');
    },

    // Returns the last <br> node in a sequence of <br> nodes that are only
    // separated by whitespace, or null if there are not at least two <br> tags
    // in the sibling chain starting with |node|. Returns the second such <br>
    // node if |restrictToTwo| is true.
    isMultipleBr: function(node, restrictToTwo) {
        var lastBr = null;
        if (!readability.isBrNode(node)) {
            return lastBr;
        }
        var curr = node.nextSibling;
        while (curr) {
            if (readability.isWhitespaceNode(curr) || readability.isBrNode(curr)) {
                lastBr = curr;
                curr = curr.nextSibling;
                if (restrictToTwo) {
                    if (readability.isBrNode(lastBr)) {
                        return lastBr;
                    }
                }
                continue;
            }
            break;
        }
        return lastBr;
    },

    // Removes all <br> nodes except one, and the whitespace in between, in a
    // series of <br> nodes.
    deleteExtraBreaks: function(node) {
        var lastBr = readability.isMultipleBr(node, false);
        var ret = false;
        while (lastBr && lastBr != node) {
            var toRemove = lastBr;
            lastBr = lastBr.previousSibling;
            toRemove.parentNode.removeChild(toRemove);
            ret = true;
        }
        return ret;
    },

    // Replaces a pair of <br> nodes (possibly separated by whitespace) with a
    // <p> node, and makes all next siblings of that pair children of the <p>,
    // up until the next pair of <br> nodes is reached.
    replaceDoubleBrWithP: function(node) {
        // Check that we are starting with a BR.
        var second = readability.isMultipleBr(node, true);
        if (!second) {
            return;
        }
        // Make all next siblings of the second BR into children of a P.
        var p = document.createElement('p');
        var curr = second.nextSibling;
        while (curr) {
            if (readability.isMultipleBr(curr, true)) {
                break;
            }
            var next = curr.nextSibling;
            p.appendChild(curr.parentNode.removeChild(curr));
            curr = next;
        }
        var ret = curr;
        // Remove all nodes between the first and second BR.
        curr = node.nextSibling;
        while (curr && curr != second) {
            var next = curr.nextSibling;
            curr.parentNode.removeChild(curr);
            curr = next;
        }
        // Remove the second BR.
        second.parentNode.removeChild(second);
        // Replace the first BR with the P.
        node.parentNode.replaceChild(p, node);
        return ret;
    },

    // Returns true if the NodeList contains a double <br>.
    hasDoubleBr: function(nodeList) {
        for (var i = 0; i < nodeList.length; i++) {
            if (readability.isMultipleBr(nodeList[i], true)) {
                return true;
            }
        }
        return false;
    },

    // Replaces double <br> tags with <p> tags.
    replaceDoubleBrsWithPs: function(node) {
        var allElements = node.getElementsByTagName('BR');
        var node = null;
        while (allElements && allElements.length > 0 && readability.hasDoubleBr(allElements)) {
            for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
                var next = node;
                while (next = readability.replaceDoubleBrWithP(next));
            }
            allElements = document.body.getElementsByTagName('BR');
        }
    },

    // Replaces a BR and the whitespace that follows it with a P.
    replaceBrWithP: function(node) {
        if (!readability.isBrNode(node)) {
            return;
        }
        var p = document.createElement('p');
        var curr = node.nextSibling;
        while (curr && !readability.isBrNode(curr)) {
            var next = curr.nextSibling;
            if (readability.isWhitespaceNode(curr)) {
                curr.parentNode.removeChild(curr);
            } else {
                p.appendChild(curr.parentNode.removeChild(curr));
            }
            curr = next;
        }
        node.parentNode.replaceChild(p, node);
        return curr;
    },

    // Replaces all <br> tags with <p> tags. Makes all next siblings of a <br>
    // tag children of the <p>.
    replaceBrsWithPs: function(node) {
        var allElements = node.getElementsByTagName('BR');
        var node = null;
        while (allElements && allElements.length > 0) {
            for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
                var next = node;
                while (next = readability.replaceBrWithP(next));
            }
            allElements = document.body.getElementsByTagName('BR');
        }
    },

    // Replaces any tag with any other tag.
    replaceTagsWithTags: function(node, srcTag, destTag) {
        var allElements = node.getElementsByTagName(srcTag);
        for (var i = 0; i < allElements.length; i++) {
            var dest = document.createElement(destTag);
            readability.moveNodeInnards(allElements[i], dest);
            allElements[i].parentNode.replaceChild(dest, allElements[i]);
        }
    },

    // Replaces all <noscript> tags with <p> tags.
    replaceNoscriptsWithPs: function(node) {
        readability.replaceTagsWithTags(node, 'noscript', 'p');
    },

    // Replaces all <font> tags with <span> tags.
    replaceFontsWithSpans: function(node) {
        readability.replaceTagsWithTags(node, 'font', 'span');
    },

    // Returns a list of image URLs in the distilled article.
    getImages : function() {
        var images = document.getElementsByTagName('img');
        var result = new Array(images.length);
        dbg("Number of images: " + images.length);
        for(var i = 0; i < images.length; i++) {
            result[i] = images[i].src;
            dbg("Image: " + result[i]);
        }
        return result;
    },

    // Returns the distilled article HTML from the page(s).
    getDistilledArticleHTML : function() {
        return readability.distilledHTML;
    },

    // Returns the next page of this article.
    getNextPageLink : function() {
        return readability.nextPageLink;
    }
};
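
// A minimal sketch of how an embedder might drive this file once it has been injected into
// the page being distilled. The call sequence below only uses entry points defined above;
// how the script is injected and how the results are read back is assumed, not specified here.
//
//   readability.init();                                 // runs extraction, fills distilledHTML
//   var html   = readability.getDistilledArticleHTML(); // distilled article markup
//   var images = readability.getImages();               // image URLs found in the page
//   var next   = readability.getNextPageLink();         // next-page URL, if one was detected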