#595: pull Readability to tip

This commit is contained in:
Cameron Kaiser 2020-03-14 21:45:24 -07:00
parent ce38568dfb
commit 2777050abd
3 changed files with 43 additions and 24 deletions

View File

@@ -308,6 +308,7 @@
} }
} }
getElems(this); getElems(this);
elems._isLiveNodeList = true;
return elems; return elems;
} }

View File

@@ -29,9 +29,11 @@ var REGEXPS = {
}; };
function isNodeVisible(node) { function isNodeVisible(node) {
// Have to null-check node.style to deal with SVG and MathML nodes. // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes.
return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden") return (!node.style || node.style.display != "none")
&& (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true"); && !node.hasAttribute("hidden")
//check for "fallback-image" so that wikimedia math images are displayed
&& (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1));
} }
/** /**

View File

@@ -36,6 +36,7 @@ function Readability(doc, options) {
options = options || {}; options = options || {};
this._doc = doc; this._doc = doc;
this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
this._articleTitle = null; this._articleTitle = null;
this._articleByline = null; this._articleByline = null;
this._articleDir = null; this._articleDir = null;
@@ -181,6 +182,10 @@ Readability.prototype = {
* @return void * @return void
*/ */
_removeNodes: function(nodeList, filterFn) { _removeNodes: function(nodeList, filterFn) {
// Avoid ever operating on live node lists.
if (this._docJSDOMParser && nodeList._isLiveNodeList) {
throw new Error("Do not pass live node lists to _removeNodes");
}
for (var i = nodeList.length - 1; i >= 0; i--) { for (var i = nodeList.length - 1; i >= 0; i--) {
var node = nodeList[i]; var node = nodeList[i];
var parentNode = node.parentNode; var parentNode = node.parentNode;
@@ -200,6 +205,10 @@ Readability.prototype = {
* @return void * @return void
*/ */
_replaceNodeTags: function(nodeList, newTagName) { _replaceNodeTags: function(nodeList, newTagName) {
// Avoid ever operating on live node lists.
if (this._docJSDOMParser && nodeList._isLiveNodeList) {
throw new Error("Do not pass live node lists to _replaceNodeTags");
}
for (var i = nodeList.length - 1; i >= 0; i--) { for (var i = nodeList.length - 1; i >= 0; i--) {
var node = nodeList[i]; var node = nodeList[i];
this._setNodeTag(node, newTagName); this._setNodeTag(node, newTagName);
@@ -332,11 +341,21 @@ Readability.prototype = {
this._forEachNode(links, function(link) { this._forEachNode(links, function(link) {
var href = link.getAttribute("href"); var href = link.getAttribute("href");
if (href) { if (href) {
// Replace links with javascript: URIs with text content, since // Remove links with javascript: URIs, since
// they won't work after scripts have been removed from the page. // they won't work after scripts have been removed from the page.
if (href.indexOf("javascript:") === 0) { if (href.indexOf("javascript:") === 0) {
var text = this._doc.createTextNode(link.textContent); // if the link only contains simple text content, it can be converted to a text node
link.parentNode.replaceChild(text, link); if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) {
var text = this._doc.createTextNode(link.textContent);
link.parentNode.replaceChild(text, link);
} else {
// if the link has multiple children, they should all be preserved
var container = this._doc.createElement("span");
while (link.childNodes.length > 0) {
container.appendChild(link.childNodes[0]);
}
link.parentNode.replaceChild(container, link);
}
} else { } else {
link.setAttribute("href", toAbsoluteURI(href)); link.setAttribute("href", toAbsoluteURI(href));
} }
@@ -441,13 +460,13 @@ Readability.prototype = {
var doc = this._doc; var doc = this._doc;
// Remove all style tags in head // Remove all style tags in head
this._removeNodes(doc.getElementsByTagName("style")); this._removeNodes(this._getAllNodesWithTag(doc, ["style"]));
if (doc.body) { if (doc.body) {
this._replaceBrs(doc.body); this._replaceBrs(doc.body);
} }
this._replaceNodeTags(doc.getElementsByTagName("font"), "SPAN"); this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN");
}, },
/** /**
@@ -527,7 +546,7 @@ Readability.prototype = {
_setNodeTag: function (node, tag) { _setNodeTag: function (node, tag) {
this.log("_setNodeTag", node, tag); this.log("_setNodeTag", node, tag);
if (node.__JSDOMParser__) { if (this._docJSDOMParser) {
node.localName = tag.toLowerCase(); node.localName = tag.toLowerCase();
node.tagName = tag.toUpperCase(); node.tagName = tag.toUpperCase();
return node; return node;
@@ -627,7 +646,7 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "div"); this._cleanConditionally(articleContent, "div");
// Remove extra paragraphs // Remove extra paragraphs
this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) { this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) {
var imgCount = paragraph.getElementsByTagName("img").length; var imgCount = paragraph.getElementsByTagName("img").length;
var embedCount = paragraph.getElementsByTagName("embed").length; var embedCount = paragraph.getElementsByTagName("embed").length;
var objectCount = paragraph.getElementsByTagName("object").length; var objectCount = paragraph.getElementsByTagName("object").length;
@@ -1303,12 +1322,12 @@ Readability.prototype = {
* @param Element * @param Element
**/ **/
_removeScripts: function(doc) { _removeScripts: function(doc) {
this._removeNodes(doc.getElementsByTagName("script"), function(scriptNode) { this._removeNodes(this._getAllNodesWithTag(doc, ["script"]), function(scriptNode) {
scriptNode.nodeValue = ""; scriptNode.nodeValue = "";
scriptNode.removeAttribute("src"); scriptNode.removeAttribute("src");
return true; return true;
}); });
this._removeNodes(doc.getElementsByTagName("noscript")); this._removeNodes(this._getAllNodesWithTag(doc, ["noscript"]));
}, },
/** /**
@@ -1491,7 +1510,7 @@ Readability.prototype = {
_clean: function(e, tag) { _clean: function(e, tag) {
var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1; var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
this._removeNodes(e.getElementsByTagName(tag), function(element) { this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(element) {
// Allow youtube and vimeo videos through as people usually want to see those. // Allow youtube and vimeo videos through as people usually want to see those.
if (isEmbed) { if (isEmbed) {
// First, check the elements attributes to see if any of them contain youtube or vimeo // First, check the elements attributes to see if any of them contain youtube or vimeo
@@ -1672,7 +1691,7 @@ Readability.prototype = {
// without effecting the traversal. // without effecting the traversal.
// //
// TODO: Consider taking into account original contentScore here. // TODO: Consider taking into account original contentScore here.
this._removeNodes(e.getElementsByTagName(tag), function(node) { this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(node) {
// First check if this node IS data table, in which case don't remove it. // First check if this node IS data table, in which case don't remove it.
var isDataTable = function(t) { var isDataTable = function(t) {
return t._readabilityDataTable; return t._readabilityDataTable;
@@ -1706,10 +1725,7 @@ Readability.prototype = {
var input = node.getElementsByTagName("input").length; var input = node.getElementsByTagName("input").length;
var embedCount = 0; var embedCount = 0;
var embeds = this._concatNodeLists( var embeds = this._getAllNodesWithTag(node, ["object", "embed", "iframe"]);
node.getElementsByTagName("object"),
node.getElementsByTagName("embed"),
node.getElementsByTagName("iframe"));
for (var i = 0; i < embeds.length; i++) { for (var i = 0; i < embeds.length; i++) {
// If this embed has attribute that matches video regex, don't delete it. // If this embed has attribute that matches video regex, don't delete it.
@@ -1770,11 +1786,9 @@ Readability.prototype = {
* @return void * @return void
**/ **/
_cleanHeaders: function(e) { _cleanHeaders: function(e) {
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) { this._removeNodes(this._getAllNodesWithTag(e, ["h1", "h2"]), function (header) {
this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) { return this._getClassWeight(header) < 0;
return this._getClassWeight(header) < 0; });
});
}
}, },
_flagIsActive: function(flag) { _flagIsActive: function(flag) {
@@ -1786,9 +1800,11 @@ Readability.prototype = {
}, },
_isProbablyVisible: function(node) { _isProbablyVisible: function(node) {
// Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes.
return (!node.style || node.style.display != "none") return (!node.style || node.style.display != "none")
&& !node.hasAttribute("hidden") && !node.hasAttribute("hidden")
&& (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true"); //check for "fallback-image" so that wikimedia math images are displayed
&& (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1));
}, },
/** /**