// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Local modifications to this file are described in the README.chromium
// file.
/**
 * Debug logger. When a console object exists, writes |s| to it with a
 * "Readability: " prefix; otherwise every call is a no-op.
 */
var dbg;
if (typeof console === 'undefined') {
    dbg = function() {};
} else {
    dbg = function(s) {
        console.log("Readability: " + s);
    };
}
/*
* Readability. An Arc90 Lab Experiment.
* Website: http://lab.arc90.com/experiments/readability
* Source: http://code.google.com/p/arc90labs-readability
*
* "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
*
* Copyright (c) 2010 Arc90 Inc
* Readability is licensed under the Apache License, Version 2.0.
**/
var readability = {
/* Class names applied in init(): readStyle goes on document.body and the overlay, */
/* readMargin and readSize go on the inner wrapper. */
readStyle: "style-newspaper",
readSize: "size-medium",
readMargin: "margin-wide",
/* innerHTML of the extracted article, captured by init(). */
distilledHTML: '',
/* Deep clone of the extracted article element, stored by grabArticle(). */
distilledArticleContent: null,
/* Result of findNextPageLink(document.body), stored by init(). */
nextPageLink: '',
version: '1.7.1',
/* NOTE(review): not referenced in the visible part of this file - confirm it is still used. */
iframeLoads: 0,
/* Turned on by init() when the global readConvertLinksToFootnotes is true; read by postProcessContent(). */
convertLinksToFootnotes: false,
reversePageScroll: false, /* If they hold shift and hit space, scroll up */
frameHack: false, /**
* The frame hack is to workaround a firefox bug where if you
* pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
* So we fake a scrollbar in the wrapping div.
* Set to true by prepDocument() when it picks an accessible frame.
**/
/* The largest frame found by prepDocument(), whether or not it is accessible. */
biggestFrame: false,
flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */
/* constants */
/* Bit values for |flags|; grabArticle() clears them one at a time when it retries. */
FLAG_STRIP_UNLIKELYS: 0x1,
FLAG_WEIGHT_CLASSES: 0x2,
FLAG_CLEAN_CONDITIONALLY: 0x4,
maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */
/**
* All of the regular expressions in use within readability.
* Defined up here so we don't instantiate them repeatedly in loops.
**/
regexps: {
unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
replaceBrs: /(
]*>[ \n\r\t]*){2,}/gi,
replaceFonts: /<(\/?)font[^>]*>/gi,
trim: /^\s+|\s+$/g,
normalize: /\s{2,}/g,
killBreaks: /(
(\s| ?)*){1,}/g,
videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
prevLink: /(prev|earl|old|new|<|«)/i
},
/**
* Runs readability.
*
* Workflow:
* 1. Prep the document by removing script tags, css, etc.
* 2. Build readability's DOM tree.
* 3. Grab the article content from the current dom tree.
* 4. Replace the current DOM tree with the new one.
* 5. Read peacefully.
*
* @return void
**/
init: function() {
/* Before we do anything, remove all scripts that are not readability. */
window.onload = window.onunload = function() {};
readability.removeScripts(document);
/* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
/* Pull out any possible next page link first */
readability.nextPageLink = readability.findNextPageLink(document.body);
/* We handle processing of nextPage from C++ set nextPageLink to null */
var nextPageLink = null;
readability.prepDocument();
/* Build readability's DOM tree */
var overlay = document.createElement("DIV");
var innerDiv = document.createElement("DIV");
var articleTools = readability.getArticleTools();
var articleTitleText = readability.getArticleTitle();
var articleContent = readability.grabArticle();
if(!articleContent) {
articleContent = document.createElement("DIV");
articleContent.id = "readability-content";
articleContent.innerHTML = [
"
Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please let us know by submitting an issue.
", (readability.frameHack ? "It appears this page uses frames. Unfortunately, browser security properties often cause Readability to fail on pages that include frames." : ""), "
Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.
" ].join(''); nextPageLink = null; } overlay.id = "readOverlay"; innerDiv.id = "readInner"; /* Apply user-selected styling */ document.body.className = readability.readStyle; document.dir = readability.getSuggestedDirection(articleTitleText); if (readability.readStyle === "style-athelas" || readability.readStyle === "style-apertura"){ overlay.className = readability.readStyle + " rdbTypekit"; } else { overlay.className = readability.readStyle; } innerDiv.className = readability.readMargin + " " + readability.readSize; if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) { readability.convertLinksToFootnotes = true; } readability.distilledHTML = articleContent.innerHTML; if(readability.frameHack) { var readOverlay = document.getElementById('readOverlay'); readOverlay.style.height = '100%'; readOverlay.style.overflow = 'auto'; } /** * If someone tries to use Readability on a site's root page, give them a warning about usage. **/ if((window.location.protocol + "//" + window.location.host + "/") === window.location.href) { articleContent.style.display = "none"; var rootWarning = document.createElement('p'); rootWarning.id = "readability-warning"; rootWarning.innerHTML = "Readability was intended for use on individual articles and not home pages. " + "If you'd like to try rendering this page anyway, click here to continue."; innerDiv.insertBefore( rootWarning, articleContent ); } readability.postProcessContent(articleContent); window.scrollTo(0, 0); if (nextPageLink) { /** * Append any additional pages after a small timeout so that people * can start reading without having to wait for this to finish processing. **/ window.setTimeout(function() { readability.appendNextPage(nextPageLink); }, 500); } /** Smooth scrolling **/ document.onkeydown = function(e) { var code = (window.event) ? 
event.keyCode : e.keyCode; if (code === 16) { readability.reversePageScroll = true; return; } if (code === 32) { readability.curScrollStep = 0; var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight); if(readability.reversePageScroll) { readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); } else { readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); } return false; } }; document.onkeyup = function(e) { var code = (window.event) ? event.keyCode : e.keyCode; if (code === 16) { readability.reversePageScroll = false; return; } }; }, /** * Run any post-process modifications to article content as necessary. * * @param Element * @return void **/ postProcessContent: function(articleContent) { if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) { readability.addFootnotes(articleContent); } readability.fixImageFloats(articleContent); }, /** * Some content ends up looking ugly if the image is too large to be floated. * If the image is wider than a threshold (currently 55%), no longer float it, * center it instead. * * @param Element * @return void **/ fixImageFloats: function (articleContent) { var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55, images = articleContent.getElementsByTagName('img'); for(var i=0, il = images.length; i < il; i+=1) { var image = images[i]; if(image.offsetWidth > imageWidthThreshold) { image.className += " blockImage"; } } }, /** * Get the article tools Element that has buttons like reload, print. 
* * @return void **/ getArticleTools: function () { var articleTools = document.createElement("DIV"); articleTools.id = "readTools"; articleTools.innerHTML = "Reload Original Page" + "Print Page" + "Email Page"; return articleTools; }, /** * retuns the suggested direction of the string * * @return "rtl" || "ltr" **/ getSuggestedDirection: function(text) { function sanitizeText() { return text.replace(/@\w+/, ""); } function countMatches(match) { var matches = text.match(new RegExp(match, "g")); return matches !== null ? matches.length : 0; } function isRTL() { var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); // if 20% of chars are Hebrew or Arbic then direction is rtl return (count_heb + count_arb) * 100 / text.length > 20; } text = sanitizeText(text); return isRTL() ? "rtl" : "ltr"; }, /** * Get the article title as an H1. * * @return void **/ getArticleTitle: function () { var curTitle = "", origTitle = ""; try { curTitle = origTitle = document.title; if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); } } catch(e) {} if(curTitle.match(/ [\|\-] /)) { curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); if(curTitle.split(' ').length < 3) { curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); } } else if(curTitle.indexOf(': ') !== -1) { curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); if(curTitle.split(' ').length < 3) { curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); } } else if(curTitle.length > 150 || curTitle.length < 15) { var hOnes = document.getElementsByTagName('h1'); if(hOnes.length === 1) { curTitle = readability.getInnerText(hOnes[0]); } } curTitle = curTitle.replace( readability.regexps.trim, "" ); if(curTitle.split(' ').length <= 4) { curTitle = origTitle; } return curTitle; }, /** * Prepare the HTML document for readability to 
scrape it. * This includes things like stripping javascript, CSS, and handling terrible markup. * * @return void **/ prepDocument: function () { /** * In some cases a body element can't be found (if the HTML is totally hosed for example) * so we create a new body node and append it to the document. */ if(document.body === null) { var body = document.createElement("body"); try { document.body = body; } catch(e) { document.documentElement.appendChild(body); dbg(e); } } document.body.id = "readabilityBody"; var frames = document.getElementsByTagName('frame'); if(frames.length > 0) { var bestFrame = null; var bestFrameSize = 0; /* The frame to try to run readability upon. Must be on same domain. */ var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. */ for(var frameIndex = 0; frameIndex < frames.length; frameIndex+=1) { var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight; var canAccessFrame = false; try { var frameBody = frames[frameIndex].contentWindow.document.body; canAccessFrame = true; } catch(eFrames) { dbg(eFrames); } if(frameSize > biggestFrameSize) { biggestFrameSize = frameSize; readability.biggestFrame = frames[frameIndex]; } if(canAccessFrame && frameSize > bestFrameSize) { readability.frameHack = true; bestFrame = frames[frameIndex]; bestFrameSize = frameSize; } } if(bestFrame) { var newBody = document.createElement('body'); readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody); newBody.style.overflow = 'scroll'; document.body = newBody; var frameset = document.getElementsByTagName('frameset')[0]; if(frameset) { frameset.parentNode.removeChild(frameset); } } } /* Remove all stylesheets */ for (var k=0;k < document.styleSheets.length; k+=1) { if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) { document.styleSheets[k].disabled = true; } } /* Remove all style tags in head (not doing this on IE) - TODO: Why not? 
*/ var styleTags = document.getElementsByTagName("style"); for (var st=0;st < styleTags.length; st+=1) { styleTags[st].textContent = ""; } /* Turn all double br's into p's */ /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ readability.replaceDoubleBrsWithPs(document.body); readability.replaceFontsWithSpans(document.body); }, /** * Prepare the article node for display. Clean out any inline styles, * iframes, forms, strip extraneous tags, etc.
*
* @param Element
* @return void
**/
prepArticle: function (articleContent) {
readability.cleanStyles(articleContent);
readability.killBreaks(articleContent);
/* Clean out junk from the article content */
readability.cleanConditionally(articleContent, "form");
readability.clean(articleContent, "object");
readability.clean(articleContent, "h1");
/**
* If there is only one h2, they are probably using it
* as a header and not a subheader, so remove it since we already have a header.
***/
if(articleContent.getElementsByTagName('h2').length === 1) {
readability.clean(articleContent, "h2");
}
readability.clean(articleContent, "iframe");
readability.cleanHeaders(articleContent);
/* Do these last as the previous stuff may have removed junk that will affect these */
readability.cleanConditionally(articleContent, "table");
readability.cleanConditionally(articleContent, "ul");
readability.cleanConditionally(articleContent, "div");
/* Remove extra paragraphs */
var articleParagraphs = articleContent.getElementsByTagName('p');
for(var i = articleParagraphs.length-1; i >= 0; i-=1) {
var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {
articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
}
}
try {
readability.replaceBrsWithPs(articleContent);
}
catch (e) {
dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
}
},
/**
* Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score.
*
* @param Element
* @return void
**/
initializeNode: function (node) {
node.readability = {"contentScore": 0};
switch(node.tagName) {
case 'DIV':
node.readability.contentScore += 5;
break;
case 'PRE':
case 'TD':
case 'BLOCKQUOTE':
node.readability.contentScore += 3;
break;
case 'ADDRESS':
case 'OL':
case 'UL':
case 'DL':
case 'DD':
case 'DT':
case 'LI':
case 'FORM':
node.readability.contentScore -= 3;
break;
case 'H1':
case 'H2':
case 'H3':
case 'H4':
case 'H5':
case 'H6':
case 'TH':
node.readability.contentScore -= 5;
break;
}
node.readability.contentScore += readability.getClassWeight(node);
},
/***
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
*
* @param page a document to run upon. Needs to be a full document, complete with body.
* @return Element
**/
grabArticle: function (pageToClone) {
var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
isPaging = (page !== null) ? true: false;
var page = null;
// Never work on the actual page.
if (isPaging) {
page = document.body.cloneNode(true);
} else {
page = pageToClone.cloneNode(true);
}
var allElements = page.getElementsByTagName('*');
/**
* First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
* into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
*
* Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
* TODO: Shouldn't this be a reverse traversal?
**/
var node = null;
var nodesToScore = [];
for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
/* Remove unlikely candidates */
if (stripUnlikelyCandidates) {
var unlikelyMatchString = node.className + node.id;
if (
(
unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
node.tagName !== "BODY"
)
)
{
dbg("Removing unlikely candidate - " + unlikelyMatchString);
node.parentNode.removeChild(node);
nodeIndex-=1;
continue;
}
}
if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
nodesToScore[nodesToScore.length] = node;
}
/* Turn all divs that don't have children block level elements into p's */
if (node.tagName === "DIV") {
if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {
var newNode = document.createElement('p');
try {
readability.moveNodeInnards(node, newNode);
node.parentNode.replaceChild(newNode, node);
nodeIndex-=1;
nodesToScore[nodesToScore.length] = node;
}
catch(e) {
dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
}
}
else
{
/* EXPERIMENTAL */
for(var i = 0, il = node.childNodes.length; i < il; i+=1) {
var childNode = node.childNodes[i];
if(childNode.nodeType === 3) { // Node.TEXT_NODE
var p = document.createElement('p');
var t = document.createTextNode(childNode.nodeValue);
p.appendChild(t);
p.style.display = 'inline';
p.className = 'readability-styled';
childNode.parentNode.replaceChild(p, childNode);
}
}
}
}
}
/**
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node.
*
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
**/
var candidates = [];
for (var pt=0; pt < nodesToScore.length; pt+=1) {
var parentNode = nodesToScore[pt].parentNode;
var grandParentNode = parentNode ? parentNode.parentNode : null;
var innerText = readability.getInnerText(nodesToScore[pt]);
if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
continue;
}
/* If this paragraph is less than 25 characters, don't even count it. */
if(innerText.length < 25) {
continue; }
/* Initialize readability data for the parent. */
if(typeof parentNode.readability === 'undefined') {
readability.initializeNode(parentNode);
candidates.push(parentNode);
}
/* Initialize readability data for the grandparent. */
if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
readability.initializeNode(grandParentNode);
candidates.push(grandParentNode);
}
var contentScore = 0;
/* Add a point for the paragraph itself as a base. */
contentScore+=1;
/* Add points for any commas within this paragraph */
contentScore += innerText.split(',').length;
/* For every 100 characters in this paragraph, add another point. Up to 3 points. */
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
/* Add the score to the parent. The grandparent gets half. */
parentNode.readability.contentScore += contentScore;
if(grandParentNode) {
grandParentNode.readability.contentScore += contentScore/2;
}
}
/**
* After we've calculated scores, loop through all of the possible candidate nodes we found
* and find the one with the highest score.
**/
var topCandidate = null;
for(var c=0, cl=candidates.length; c < cl; c+=1)
{
/**
* Scale the final candidates score based on link density. Good content should have a
* relatively small link density (5% or less) and be mostly unaffected by this operation.
**/
candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
topCandidate = candidates[c]; }
}
/**
* If we still have no top candidate, just use the body as a last resort.
* We also have to copy the body node so it is something we can modify.
**/
if (topCandidate === null || topCandidate.tagName === "BODY")
{
topCandidate = document.createElement("DIV");
readability.replaceNodeInnards(page, topCandidate);
page.appendChild(topCandidate);
readability.initializeNode(topCandidate);
}
/**
* Now that we have the top candidate, look through its siblings for content that might also be related.
* Things like preambles, content split by ads that we removed, etc.
**/
var articleContent = document.createElement("DIV");
if (isPaging) {
articleContent.id = "readability-content";
}
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblingNodes = topCandidate.parentNode.childNodes;
for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
var siblingNode = siblingNodes[s];
var append = false;
/**
* Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
* Example of error visible here: http://www.esquire.com/features/honesty0707
**/
if(!siblingNode) {
continue;
}
dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
if(siblingNode === topCandidate)
{
append = true;
}
var contentBonus = 0;
/* Give a bonus if sibling nodes and top candidates have the example same classname */
if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
contentBonus += topCandidate.readability.contentScore * 0.2;
}
if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
{
append = true;
}
if(siblingNode.nodeName === "P") {
var linkDensity = readability.getLinkDensity(siblingNode);
var nodeContent = readability.getInnerText(siblingNode);
var nodeLength = nodeContent.length;
if(nodeLength > 80 && linkDensity < 0.25)
{
append = true;
}
else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
{
append = true;
}
}
if(append) {
dbg("Appending node: " + siblingNode);
var nodeToAppend = null;
if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
/* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
nodeToAppend = document.createElement("DIV");
try {
nodeToAppend.id = siblingNode.id;
readability.moveNodeInnards(siblingNode, nodeToAppend);
}
catch(er) {
dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
nodeToAppend = siblingNode;
s-=1;
sl-=1;
}
} else {
nodeToAppend = siblingNode;
s-=1;
sl-=1;
}
/* To ensure a node does not interfere with readability styles, remove its classnames */
nodeToAppend.className = "";
/* Append sibling and subtract from our list because it removes the node when you append to another node */
articleContent.appendChild(nodeToAppend);
}
}
/**
* So we have all of the content that we need. Now we clean it up for presentation.
**/
readability.distilledArticleContent = articleContent.cloneNode(true);
//readability.prepArticle(articleContent);
if (readability.curPageNum === 1) {
var newNode = document.createElement('div');
newNode.id = "readability-page-1";
newNode.setAttribute("class", "page");
readability.moveNodeInnards(articleContent, newNode);
articleContent.appendChild(newNode);
}
/**
* Now that we've gone through the full algorithm, check to see if we got any meaningful content.
* If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
* likelihood of finding the content, and the sieve approach gives us a higher likelihood of
* finding the -right- content.
**/
if(readability.getInnerText(articleContent, false).length < 250) {
if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
return readability.grabArticle(document.body);
}
else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
return readability.grabArticle(document.body);
}
else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
return readability.grabArticle(document.body);
} else {
return null;
}
}
return articleContent;
},
/**
* Removes script tags from the document.
*
* @param Element
**/
removeScripts: function (doc) {
var scripts = doc.getElementsByTagName('script');
for(var i = scripts.length-1; i >= 0; i-=1)
{
if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))
{
scripts[i].nodeValue="";
scripts[i].removeAttribute('src');
if (scripts[i].parentNode) {
scripts[i].parentNode.removeChild(scripts[i]);
}
}
}
},
/**
* Get the inner text of a node - cross browser compatibly.
* This also strips out any excess whitespace to be found.
*
* @param Element
* @return string
**/
getInnerText: function (e, normalizeSpaces) {
var textContent = "";
if(typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") {
return "";
}
normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
if (navigator.appName === "Microsoft Internet Explorer") {
textContent = e.innerText.replace( readability.regexps.trim, "" ); }
else {
textContent = e.textContent.replace( readability.regexps.trim, "" ); }
if(normalizeSpaces) {
return textContent.replace( readability.regexps.normalize, " "); }
else {
return textContent; }
},
/**
* Get the number of times a string s appears in the node e.
*
* @param Element
* @param string - what to split on. Default is ","
* @return number (integer)
**/
getCharCount: function (e,s) {
s = s || ",";
return readability.getInnerText(e).split(s).length-1;
},
/**
* Remove the style attribute on every e and under.
* TODO: Test if getElementsByTagName(*) is faster.
*
* @param Element
* @return void
**/
cleanStyles: function (e) {
e = e || document;
var cur = e.firstChild;
if(!e) {
return; }
// Remove any root styles, if we're able.
if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') {
e.removeAttribute('style'); }
// Go until there are no more child nodes
while ( cur !== null ) {
if ( cur.nodeType === 1 ) {
// Remove style attribute(s) :
if(cur.className !== "readability-styled") {
cur.removeAttribute("style");
}
readability.cleanStyles( cur );
}
cur = cur.nextSibling;
}
},
/**
* Get the density of links as a percentage of the content
* This is the amount of text that is inside a link divided by the total text in the node.
*
* @param Element
* @return number (float)
**/
getLinkDensity: function (e) {
var links = e.getElementsByTagName("a");
var textLength = readability.getInnerText(e).length;
var linkLength = 0;
for(var i=0, il=links.length; i § node, and makes all next siblings of that pair children of , up
// until the next pair of tags.
replaceDoubleBrsWithPs: function(node) {
var allElements = node.getElementsByTagName('BR');
var node = null;
while (allElements && allElements.length > 0 &&
readability.hasDoubleBr(allElements)) {
for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
var next = node;
while (next = readability.replaceDoubleBrWithP(next));
}
allElements = document.body.getElementsByTagName('BR');
}
},
// Replaces a BR and the whitespace that follows it with a P.
replaceBrWithP: function(node) {
if (!readability.isBrNode(node)) {
return;
}
var p = document.createElement('p');
var curr = node.nextSibling;
while (curr && !isBrNode(curr)) {
var next = curr.nextSibling;
if (readability.isWhitespaceNode(curr)) {
curr.parentNode.removeChild(curr);
} else {
p.appendChild(curr.parentNode.removeChild(curr));
}
curr = next;
}
node.parentNode.replaceChild(p, node);
return curr;
},
// Replaces all tags. Makes all next siblings of a .
replaceBrsWithPs: function(node) {
var allElements = node.getElementsByTagName('BR');
var node = null;
while (allElements && allElements.length > 0) {
for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
var next = node;
while (next = readability.replaceBrWithP(next));
}
allElements = document.body.getElementsByTagName('BR');
}
},
// Replaces any tag with any other tag.
replaceTagsWithTags: function(node, srcTag, destTag) {
var allElements = node.getElementsByTagName(srcTag);
for (var i = 0; i < allElements.length; i++) {
var dest = document.createElement(destTag);
readability.moveNodeInnards(allElements[i], dest);
allElements[i].parentNode.replaceChild(dest, allElements[i]);
}
},
// Replaces all
.
isBrNode: function(node) {
return (node.tagName === 'BR');
},
// Returns the last
node in a sequence of
nodes that are only
// separated by whitespace, or null if there are not at least two
tags
// in the sibling chain starting with |node|. Returns the second such
// node if |restrictToTwo| is true.
isMultipleBr: function(node, restrictToTwo) {
var lastBr = null;
if (!readability.isBrNode(node)) {
return lastBr;
}
var curr = node.nextSibling;
while (curr) {
if (readability.isWhitespaceNode(curr) || readability.isBrNode(curr)) {
lastBr = curr;
curr = curr.nextSibling;
if (restrictToTwo) {
if (readability.isBrNode(lastBr)) {
return lastBr;
}
}
continue;
}
break;
}
return lastBr;
},
// Removes all
nodes except one and whitespace in between in a series
// of
nodes.
deleteExtraBreaks: function(node) {
var lastBr = readability.isMultipleBr(node, false);
var ret = false;
while (lastBr && lastBr != node) {
var toRemove = lastBr;
lastBr = lastBr.previousSibling;
toRemove.parentNode.removeChild(toRemove);
ret = true;
}
return ret;
},
// Replaces a pair of
nodes (possibly separated by whitespace), with a
//
nodes is reached.
replaceDoubleBrWithP: function(node) {
// Check that we are starting with a BR.
var second = readability.isMultipleBr(node, true);
if (!second) {
return;
}
// Make all next siblings of the second BR into children of a P.
var p = document.createElement('p');
var curr = second.nextSibling;
while (curr) {
if (readability.isMultipleBr(curr, true)) {
break;
}
var next = curr.nextSibling;
p.appendChild(curr.parentNode.removeChild(curr));
curr = next;
}
var ret = curr;
// Remove all nodes between the first and second BR.
curr = node.nextSibling;
while (curr && curr != second) {
var next = curr.nextSibling;
curr.parentNode.removeChild(curr);
curr = next;
}
// Remove the second BR.
second.parentNode.removeChild(second);
// Replace the first BR with the P.
node.parentNode.replaceChild(p, node);
return ret;
},
// Returns true if the NodeList contains a double
.
hasDoubleBr: function(nodeList) {
for (var i = 0; i < nodeList.length; nodeList++) {
if (readability.isMultipleBr(nodeList[i], true)) {
return true;
}
}
return false;
},
// Replaces double
tags with
tags with
tag
// children of the