mirror of
https://github.com/classilla/tenfourfox.git
synced 2025-01-16 01:31:37 +00:00
closes #624: update Readability to tip
This commit is contained in:
parent
b6b1905d52
commit
92be8e3bec
@ -278,7 +278,7 @@
|
|||||||
|
|
||||||
var whitespace = [" ", "\t", "\n", "\r"];
|
var whitespace = [" ", "\t", "\n", "\r"];
|
||||||
|
|
||||||
// See http://www.w3schools.com/dom/dom_nodetype.asp
|
// See https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
|
||||||
var nodeTypes = {
|
var nodeTypes = {
|
||||||
ELEMENT_NODE: 1,
|
ELEMENT_NODE: 1,
|
||||||
ATTRIBUTE_NODE: 2,
|
ATTRIBUTE_NODE: 2,
|
||||||
@ -705,7 +705,6 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Using Array.join() avoids the overhead from lazy string concatenation.
|
// Using Array.join() avoids the overhead from lazy string concatenation.
|
||||||
// See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes
|
|
||||||
var arr = [];
|
var arr = [];
|
||||||
getHTML(this);
|
getHTML(this);
|
||||||
return arr.join("");
|
return arr.join("");
|
||||||
@ -875,7 +874,11 @@
|
|||||||
|
|
||||||
JSDOMParser.prototype = {
|
JSDOMParser.prototype = {
|
||||||
error: function(m) {
|
error: function(m) {
|
||||||
dump("JSDOMParser error: " + m + "\n");
|
if (typeof dump !== "undefined") {
|
||||||
|
dump("JSDOMParser error: " + m + "\n");
|
||||||
|
} else if (typeof console !== "undefined") {
|
||||||
|
console.log("JSDOMParser error: " + m + "\n");
|
||||||
|
}
|
||||||
this.errorState += m + "\n";
|
this.errorState += m + "\n";
|
||||||
},
|
},
|
||||||
|
|
||||||
@ -1187,3 +1190,7 @@
|
|||||||
global.JSDOMParser = JSDOMParser;
|
global.JSDOMParser = JSDOMParser;
|
||||||
|
|
||||||
})(this);
|
})(this);
|
||||||
|
|
||||||
|
if (typeof module === "object") {
|
||||||
|
module.exports = this.JSDOMParser;
|
||||||
|
}
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
/* eslint-env es6:false */
|
/* eslint-env es6:false */
|
||||||
/* globals exports */
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2010 Arc90 Inc
|
* Copyright (c) 2010 Arc90 Inc
|
||||||
*
|
*
|
||||||
@ -95,6 +94,6 @@ function isProbablyReaderable(doc, isVisible) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (typeof exports === "object") {
|
if (typeof module === "object") {
|
||||||
exports.isProbablyReaderable = isProbablyReaderable;
|
module.exports = isProbablyReaderable;
|
||||||
}
|
}
|
||||||
|
@ -50,6 +50,10 @@ function Readability(doc, options) {
|
|||||||
this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
|
this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
|
||||||
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
|
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
|
||||||
this._keepClasses = !!options.keepClasses;
|
this._keepClasses = !!options.keepClasses;
|
||||||
|
this._serializer = options.serializer || function(el) {
|
||||||
|
return el.innerHTML;
|
||||||
|
};
|
||||||
|
this._disableJSONLD = !!options.disableJSONLD;
|
||||||
|
|
||||||
// Start with all flags set
|
// Start with all flags set
|
||||||
this._flags = this.FLAG_STRIP_UNLIKELYS |
|
this._flags = this.FLAG_STRIP_UNLIKELYS |
|
||||||
@ -131,8 +135,14 @@ Readability.prototype = {
|
|||||||
prevLink: /(prev|earl|old|new|<|«)/i,
|
prevLink: /(prev|earl|old|new|<|«)/i,
|
||||||
whitespace: /^\s*$/,
|
whitespace: /^\s*$/,
|
||||||
hasContent: /\S$/,
|
hasContent: /\S$/,
|
||||||
|
srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
|
||||||
|
b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
|
||||||
|
// See: https://schema.org/Article
|
||||||
|
jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/
|
||||||
},
|
},
|
||||||
|
|
||||||
|
UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ],
|
||||||
|
|
||||||
DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
|
DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
|
||||||
|
|
||||||
ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
|
ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
|
||||||
@ -155,6 +165,15 @@ Readability.prototype = {
|
|||||||
// These are the classes that readability sets itself.
|
// These are the classes that readability sets itself.
|
||||||
CLASSES_TO_PRESERVE: [ "page" ],
|
CLASSES_TO_PRESERVE: [ "page" ],
|
||||||
|
|
||||||
|
// Maps HTML entity names to the characters they stand for; used when unescaping metadata.
|
||||||
|
HTML_ESCAPE_MAP: {
|
||||||
|
"lt": "<",
|
||||||
|
"gt": ">",
|
||||||
|
"amp": "&",
|
||||||
|
"quot": '"',
|
||||||
|
"apos": "'",
|
||||||
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Run any post-process modifications to article content as necessary.
|
* Run any post-process modifications to article content as necessary.
|
||||||
*
|
*
|
||||||
@ -165,6 +184,8 @@ Readability.prototype = {
|
|||||||
// Readability cannot open relative uris so we convert them to absolute uris.
|
// Readability cannot open relative uris so we convert them to absolute uris.
|
||||||
this._fixRelativeUris(articleContent);
|
this._fixRelativeUris(articleContent);
|
||||||
|
|
||||||
|
this._simplifyNestedElements(articleContent);
|
||||||
|
|
||||||
if (!this._keepClasses) {
|
if (!this._keepClasses) {
|
||||||
// Remove classes.
|
// Remove classes.
|
||||||
this._cleanClasses(articleContent);
|
this._cleanClasses(articleContent);
|
||||||
@ -230,6 +251,21 @@ Readability.prototype = {
|
|||||||
Array.prototype.forEach.call(nodeList, fn, this);
|
Array.prototype.forEach.call(nodeList, fn, this);
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Iterate over a NodeList, and return the first node that passes
|
||||||
|
* the supplied test function
|
||||||
|
*
|
||||||
|
* For convenience, the current object context is applied to the provided
|
||||||
|
* test function.
|
||||||
|
*
|
||||||
|
* @param NodeList nodeList The NodeList.
|
||||||
|
* @param Function fn The test function.
|
||||||
|
* @return void
|
||||||
|
*/
|
||||||
|
_findNode: function(nodeList, fn) {
|
||||||
|
return Array.prototype.find.call(nodeList, fn, this);
|
||||||
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Iterate over a NodeList, return true if any of the provided iterate
|
* Iterate over a NodeList, return true if any of the provided iterate
|
||||||
* function calls returns true, false otherwise.
|
* function calls returns true, false otherwise.
|
||||||
@ -328,6 +364,7 @@ Readability.prototype = {
|
|||||||
if (baseURI == documentURI && uri.charAt(0) == "#") {
|
if (baseURI == documentURI && uri.charAt(0) == "#") {
|
||||||
return uri;
|
return uri;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Otherwise, resolve against base URI:
|
// Otherwise, resolve against base URI:
|
||||||
try {
|
try {
|
||||||
return new URL(uri, baseURI).href;
|
return new URL(uri, baseURI).href;
|
||||||
@ -362,15 +399,56 @@ Readability.prototype = {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
var imgs = this._getAllNodesWithTag(articleContent, ["img"]);
|
var medias = this._getAllNodesWithTag(articleContent, [
|
||||||
this._forEachNode(imgs, function(img) {
|
"img", "picture", "figure", "video", "audio", "source"
|
||||||
var src = img.getAttribute("src");
|
]);
|
||||||
|
|
||||||
|
this._forEachNode(medias, function(media) {
|
||||||
|
var src = media.getAttribute("src");
|
||||||
|
var poster = media.getAttribute("poster");
|
||||||
|
var srcset = media.getAttribute("srcset");
|
||||||
|
|
||||||
if (src) {
|
if (src) {
|
||||||
img.setAttribute("src", toAbsoluteURI(src));
|
media.setAttribute("src", toAbsoluteURI(src));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (poster) {
|
||||||
|
media.setAttribute("poster", toAbsoluteURI(poster));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (srcset) {
|
||||||
|
var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) {
|
||||||
|
return toAbsoluteURI(p1) + (p2 || "") + p3;
|
||||||
|
});
|
||||||
|
|
||||||
|
media.setAttribute("srcset", newSrcset);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
|
_simplifyNestedElements: function(articleContent) {
|
||||||
|
var node = articleContent;
|
||||||
|
|
||||||
|
while (node) {
|
||||||
|
if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) {
|
||||||
|
if (this._isElementWithoutContent(node)) {
|
||||||
|
node = this._removeAndGetNext(node);
|
||||||
|
continue;
|
||||||
|
} else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) {
|
||||||
|
var child = node.children[0];
|
||||||
|
for (var i = 0; i < node.attributes.length; i++) {
|
||||||
|
child.setAttribute(node.attributes[i].name, node.attributes[i].value);
|
||||||
|
}
|
||||||
|
node.parentNode.replaceChild(child, node);
|
||||||
|
node = child;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
node = this._getNextNode(node);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the article title as an H1.
|
* Get the article title as an H1.
|
||||||
*
|
*
|
||||||
@ -840,6 +918,12 @@ Readability.prototype = {
|
|||||||
node = this._removeAndGetNext(node);
|
node = this._removeAndGetNext(node);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) {
|
||||||
|
this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString);
|
||||||
|
node = this._removeAndGetNext(node);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
|
// Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
|
||||||
@ -913,7 +997,7 @@ Readability.prototype = {
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
// Exclude nodes with no ancestor.
|
// Exclude nodes with no ancestor.
|
||||||
var ancestors = this._getNodeAncestors(elementToScore, 3);
|
var ancestors = this._getNodeAncestors(elementToScore, 5);
|
||||||
if (ancestors.length === 0)
|
if (ancestors.length === 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@ -1233,12 +1317,111 @@ Readability.prototype = {
|
|||||||
return false;
|
return false;
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts some of the common HTML entities in string to their corresponding characters.
|
||||||
|
*
|
||||||
|
* @param str {string} - a string to unescape.
|
||||||
|
* @return string without HTML entity.
|
||||||
|
*/
|
||||||
|
_unescapeHtmlEntities: function(str) {
|
||||||
|
if (!str) {
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
var htmlEscapeMap = this.HTML_ESCAPE_MAP;
|
||||||
|
return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) {
|
||||||
|
return htmlEscapeMap[tag];
|
||||||
|
}).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) {
|
||||||
|
var num = parseInt(hex || numStr, hex ? 16 : 10);
|
||||||
|
return String.fromCharCode(num);
|
||||||
|
});
|
||||||
|
},
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Try to extract metadata from JSON-LD object.
|
||||||
|
* For now, only Schema.org objects of type Article or its subtypes are supported.
|
||||||
|
* @return Object with any metadata that could be extracted (possibly none)
|
||||||
|
*/
|
||||||
|
_getJSONLD: function (doc) {
|
||||||
|
var scripts = this._getAllNodesWithTag(doc, ["script"]);
|
||||||
|
|
||||||
|
var jsonLdElement = this._findNode(scripts, function(el) {
|
||||||
|
return el.getAttribute("type") === "application/ld+json";
|
||||||
|
});
|
||||||
|
|
||||||
|
if (jsonLdElement) {
|
||||||
|
try {
|
||||||
|
// Strip CDATA markers if present
|
||||||
|
var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, "");
|
||||||
|
var parsed = JSON.parse(content);
|
||||||
|
var metadata = {};
|
||||||
|
if (
|
||||||
|
!parsed["@context"] ||
|
||||||
|
!parsed["@context"].match(/^https?\:\/\/schema\.org$/)
|
||||||
|
) {
|
||||||
|
return metadata;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!parsed["@type"] && Array.isArray(parsed["@graph"])) {
|
||||||
|
parsed = parsed["@graph"].find(function(it) {
|
||||||
|
return (it["@type"] || "").match(
|
||||||
|
this.REGEXPS.jsonLdArticleTypes
|
||||||
|
);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
!parsed ||
|
||||||
|
!parsed["@type"] ||
|
||||||
|
!parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes)
|
||||||
|
) {
|
||||||
|
return metadata;
|
||||||
|
}
|
||||||
|
if (typeof parsed.name === "string") {
|
||||||
|
metadata.title = parsed.name.trim();
|
||||||
|
} else if (typeof parsed.headline === "string") {
|
||||||
|
metadata.title = parsed.headline.trim();
|
||||||
|
}
|
||||||
|
if (parsed.author) {
|
||||||
|
if (typeof parsed.author.name === "string") {
|
||||||
|
metadata.byline = parsed.author.name.trim();
|
||||||
|
} else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") {
|
||||||
|
metadata.byline = parsed.author
|
||||||
|
.filter(function(author) {
|
||||||
|
return author && typeof author.name === "string";
|
||||||
|
})
|
||||||
|
.map(function(author) {
|
||||||
|
return author.name.trim();
|
||||||
|
})
|
||||||
|
.join(", ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (typeof parsed.description === "string") {
|
||||||
|
metadata.excerpt = parsed.description.trim();
|
||||||
|
}
|
||||||
|
if (
|
||||||
|
parsed.publisher &&
|
||||||
|
typeof parsed.publisher.name === "string"
|
||||||
|
) {
|
||||||
|
metadata.siteName = parsed.publisher.name.trim();
|
||||||
|
}
|
||||||
|
return metadata;
|
||||||
|
} catch (err) {
|
||||||
|
this.log(err.message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return {};
|
||||||
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Attempts to get excerpt and byline metadata for the article.
|
* Attempts to get excerpt and byline metadata for the article.
|
||||||
*
|
*
|
||||||
|
* @param {Object} jsonld — object containing any metadata that
|
||||||
|
* could be extracted from JSON-LD object.
|
||||||
|
*
|
||||||
* @return Object with optional "excerpt" and "byline" properties
|
* @return Object with optional "excerpt" and "byline" properties
|
||||||
*/
|
*/
|
||||||
_getArticleMetadata: function() {
|
_getArticleMetadata: function(jsonld) {
|
||||||
var metadata = {};
|
var metadata = {};
|
||||||
var values = {};
|
var values = {};
|
||||||
var metaElements = this._doc.getElementsByTagName("meta");
|
var metaElements = this._doc.getElementsByTagName("meta");
|
||||||
@ -1284,7 +1467,8 @@ Readability.prototype = {
|
|||||||
});
|
});
|
||||||
|
|
||||||
// get title
|
// get title
|
||||||
metadata.title = values["dc:title"] ||
|
metadata.title = jsonld.title ||
|
||||||
|
values["dc:title"] ||
|
||||||
values["dcterm:title"] ||
|
values["dcterm:title"] ||
|
||||||
values["og:title"] ||
|
values["og:title"] ||
|
||||||
values["weibo:article:title"] ||
|
values["weibo:article:title"] ||
|
||||||
@ -1297,12 +1481,14 @@ Readability.prototype = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// get author
|
// get author
|
||||||
metadata.byline = values["dc:creator"] ||
|
metadata.byline = jsonld.byline ||
|
||||||
|
values["dc:creator"] ||
|
||||||
values["dcterm:creator"] ||
|
values["dcterm:creator"] ||
|
||||||
values["author"];
|
values["author"];
|
||||||
|
|
||||||
// get description
|
// get description
|
||||||
metadata.excerpt = values["dc:description"] ||
|
metadata.excerpt = jsonld.excerpt ||
|
||||||
|
values["dc:description"] ||
|
||||||
values["dcterm:description"] ||
|
values["dcterm:description"] ||
|
||||||
values["og:description"] ||
|
values["og:description"] ||
|
||||||
values["weibo:article:description"] ||
|
values["weibo:article:description"] ||
|
||||||
@ -1311,11 +1497,114 @@ Readability.prototype = {
|
|||||||
values["twitter:description"];
|
values["twitter:description"];
|
||||||
|
|
||||||
// get site name
|
// get site name
|
||||||
metadata.siteName = values["og:site_name"];
|
metadata.siteName = jsonld.siteName ||
|
||||||
|
values["og:site_name"];
|
||||||
|
|
||||||
|
// in many sites the meta value is escaped with HTML entities,
|
||||||
|
// so here we need to unescape it
|
||||||
|
metadata.title = this._unescapeHtmlEntities(metadata.title);
|
||||||
|
metadata.byline = this._unescapeHtmlEntities(metadata.byline);
|
||||||
|
metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
|
||||||
|
metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
|
||||||
|
|
||||||
return metadata;
|
return metadata;
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if node is image, or if node contains exactly only one image
|
||||||
|
* whether as a direct child or as its descendants.
|
||||||
|
*
|
||||||
|
* @param Element
|
||||||
|
**/
|
||||||
|
_isSingleImage: function(node) {
|
||||||
|
if (node.tagName === "IMG") {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (node.children.length !== 1 || node.textContent.trim() !== "") {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return this._isSingleImage(node.children[0]);
|
||||||
|
},
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find all <noscript> that are located after <img> nodes, and which contain only one
|
||||||
|
* <img> element. Replace the first image with the image from inside the <noscript> tag,
|
||||||
|
* and remove the <noscript> tag. This improves the quality of the images we use on
|
||||||
|
* some sites (e.g. Medium).
|
||||||
|
*
|
||||||
|
* @param Element
|
||||||
|
**/
|
||||||
|
_unwrapNoscriptImages: function(doc) {
|
||||||
|
// Find img without source or attributes that might contains image, and remove it.
|
||||||
|
// This is done to prevent a placeholder img is replaced by img from noscript in next step.
|
||||||
|
var imgs = Array.from(doc.getElementsByTagName("img"));
|
||||||
|
this._forEachNode(imgs, function(img) {
|
||||||
|
for (var i = 0; i < img.attributes.length; i++) {
|
||||||
|
var attr = img.attributes[i];
|
||||||
|
switch (attr.name) {
|
||||||
|
case "src":
|
||||||
|
case "srcset":
|
||||||
|
case "data-src":
|
||||||
|
case "data-srcset":
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
img.parentNode.removeChild(img);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Next find noscript and try to extract its image
|
||||||
|
var noscripts = Array.from(doc.getElementsByTagName("noscript"));
|
||||||
|
this._forEachNode(noscripts, function(noscript) {
|
||||||
|
// Parse content of noscript and make sure it only contains image
|
||||||
|
var tmp = doc.createElement("div");
|
||||||
|
tmp.innerHTML = noscript.innerHTML;
|
||||||
|
if (!this._isSingleImage(tmp)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If noscript has previous sibling and it only contains image,
|
||||||
|
// replace it with noscript content. However we also keep old
|
||||||
|
// attributes that might contains image.
|
||||||
|
var prevElement = noscript.previousElementSibling;
|
||||||
|
if (prevElement && this._isSingleImage(prevElement)) {
|
||||||
|
var prevImg = prevElement;
|
||||||
|
if (prevImg.tagName !== "IMG") {
|
||||||
|
prevImg = prevElement.getElementsByTagName("img")[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
var newImg = tmp.getElementsByTagName("img")[0];
|
||||||
|
for (var i = 0; i < prevImg.attributes.length; i++) {
|
||||||
|
var attr = prevImg.attributes[i];
|
||||||
|
if (attr.value === "") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (attr.name === "src" || attr.name === "srcset" || /\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
|
||||||
|
if (newImg.getAttribute(attr.name) === attr.value) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
var attrName = attr.name;
|
||||||
|
if (newImg.hasAttribute(attrName)) {
|
||||||
|
attrName = "data-old-" + attrName;
|
||||||
|
}
|
||||||
|
|
||||||
|
newImg.setAttribute(attrName, attr.value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Removes script tags from the document.
|
* Removes script tags from the document.
|
||||||
*
|
*
|
||||||
@ -1644,30 +1933,67 @@ Readability.prototype = {
|
|||||||
/* convert images and figures that have properties like data-src into images that can be loaded without JS */
|
/* convert images and figures that have properties like data-src into images that can be loaded without JS */
|
||||||
_fixLazyImages: function (root) {
|
_fixLazyImages: function (root) {
|
||||||
this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) {
|
this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) {
|
||||||
// also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
|
// In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
|
||||||
if ((!elem.src && (!elem.srcset || elem.srcset == "null")) || elem.className.toLowerCase().indexOf("lazy") !== -1) {
|
// So, here we check if the data uri is too short, just might as well remove it.
|
||||||
|
if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
|
||||||
|
// Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
|
||||||
|
var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
|
||||||
|
if (parts[1] === "image/svg+xml") {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make sure this element has other attributes which contains image.
|
||||||
|
// If it doesn't, then this src is important and shouldn't be removed.
|
||||||
|
var srcCouldBeRemoved = false;
|
||||||
for (var i = 0; i < elem.attributes.length; i++) {
|
for (var i = 0; i < elem.attributes.length; i++) {
|
||||||
var attr = elem.attributes[i];
|
var attr = elem.attributes[i];
|
||||||
if (attr.name === "src" || attr.name === "srcset") {
|
if (attr.name === "src") {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
var copyTo = null;
|
|
||||||
if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
|
if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
|
||||||
copyTo = "srcset";
|
srcCouldBeRemoved = true;
|
||||||
} else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
|
break;
|
||||||
copyTo = "src";
|
|
||||||
}
|
}
|
||||||
if (copyTo) {
|
}
|
||||||
//if this is an img or picture, set the attribute directly
|
|
||||||
if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
|
// Here we assume if image is less than 100 bytes (or 133B after encoded to base64)
|
||||||
elem.setAttribute(copyTo, attr.value);
|
// it will be too small, therefore it might be placeholder image.
|
||||||
} else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
|
if (srcCouldBeRemoved) {
|
||||||
//if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
|
var b64starts = elem.src.search(/base64\s*/i) + 7;
|
||||||
//see the nytimes-3 testcase for an example
|
var b64length = elem.src.length - b64starts;
|
||||||
var img = this._doc.createElement("img");
|
if (b64length < 133) {
|
||||||
img.setAttribute(copyTo, attr.value);
|
elem.removeAttribute("src");
|
||||||
elem.appendChild(img);
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
|
||||||
|
if ((elem.src || (elem.srcset && elem.srcset != "null")) && elem.className.toLowerCase().indexOf("lazy") === -1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var j = 0; j < elem.attributes.length; j++) {
|
||||||
|
attr = elem.attributes[j];
|
||||||
|
if (attr.name === "src" || attr.name === "srcset") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
var copyTo = null;
|
||||||
|
if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
|
||||||
|
copyTo = "srcset";
|
||||||
|
} else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
|
||||||
|
copyTo = "src";
|
||||||
|
}
|
||||||
|
if (copyTo) {
|
||||||
|
//if this is an img or picture, set the attribute directly
|
||||||
|
if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
|
||||||
|
elem.setAttribute(copyTo, attr.value);
|
||||||
|
} else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
|
||||||
|
//if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
|
||||||
|
//see the nytimes-3 testcase for an example
|
||||||
|
var img = this._doc.createElement("img");
|
||||||
|
img.setAttribute(copyTo, attr.value);
|
||||||
|
elem.appendChild(img);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1828,12 +2154,18 @@ Readability.prototype = {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Unwrap image from noscript
|
||||||
|
this._unwrapNoscriptImages(this._doc);
|
||||||
|
|
||||||
|
// Extract JSON-LD metadata before removing scripts
|
||||||
|
var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc);
|
||||||
|
|
||||||
// Remove script tags from the document.
|
// Remove script tags from the document.
|
||||||
this._removeScripts(this._doc);
|
this._removeScripts(this._doc);
|
||||||
|
|
||||||
this._prepDocument();
|
this._prepDocument();
|
||||||
|
|
||||||
var metadata = this._getArticleMetadata();
|
var metadata = this._getArticleMetadata(jsonLd);
|
||||||
this._articleTitle = metadata.title;
|
this._articleTitle = metadata.title;
|
||||||
|
|
||||||
var articleContent = this._grabArticle();
|
var articleContent = this._grabArticle();
|
||||||
@ -1859,7 +2191,7 @@ Readability.prototype = {
|
|||||||
title: this._articleTitle,
|
title: this._articleTitle,
|
||||||
byline: metadata.byline || this._articleByline,
|
byline: metadata.byline || this._articleByline,
|
||||||
dir: this._articleDir,
|
dir: this._articleDir,
|
||||||
content: articleContent.innerHTML,
|
content: this._serializer(articleContent),
|
||||||
textContent: textContent,
|
textContent: textContent,
|
||||||
length: textContent.length,
|
length: textContent.length,
|
||||||
excerpt: metadata.excerpt,
|
excerpt: metadata.excerpt,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user