diff --git a/toolkit/components/reader/JSDOMParser.js b/toolkit/components/reader/JSDOMParser.js index 30c7d4cd9..7bfa2acf5 100644 --- a/toolkit/components/reader/JSDOMParser.js +++ b/toolkit/components/reader/JSDOMParser.js @@ -278,7 +278,7 @@ var whitespace = [" ", "\t", "\n", "\r"]; - // See http://www.w3schools.com/dom/dom_nodetype.asp + // See https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType var nodeTypes = { ELEMENT_NODE: 1, ATTRIBUTE_NODE: 2, @@ -705,7 +705,6 @@ } // Using Array.join() avoids the overhead from lazy string concatenation. - // See http://blog.cdleary.com/2012/01/string-representation-in-spidermonkey/#ropes var arr = []; getHTML(this); return arr.join(""); @@ -875,7 +874,11 @@ JSDOMParser.prototype = { error: function(m) { - dump("JSDOMParser error: " + m + "\n"); + if (typeof dump !== "undefined") { + dump("JSDOMParser error: " + m + "\n"); + } else if (typeof console !== "undefined") { + console.log("JSDOMParser error: " + m + "\n"); + } this.errorState += m + "\n"; }, @@ -1187,3 +1190,7 @@ global.JSDOMParser = JSDOMParser; })(this); + +if (typeof module === "object") { + module.exports = this.JSDOMParser; +} diff --git a/toolkit/components/reader/Readability-readerable.js b/toolkit/components/reader/Readability-readerable.js index 650f7f35a..f5df709a8 100644 --- a/toolkit/components/reader/Readability-readerable.js +++ b/toolkit/components/reader/Readability-readerable.js @@ -1,5 +1,4 @@ /* eslint-env es6:false */ -/* globals exports */ /* * Copyright (c) 2010 Arc90 Inc * @@ -95,6 +94,6 @@ function isProbablyReaderable(doc, isVisible) { }); } -if (typeof exports === "object") { - exports.isProbablyReaderable = isProbablyReaderable; +if (typeof module === "object") { + module.exports = isProbablyReaderable; } diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js index 30938791f..e4a9067f5 100644 --- a/toolkit/components/reader/Readability.js +++ 
b/toolkit/components/reader/Readability.js @@ -50,6 +50,10 @@ function Readability(doc, options) { this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); this._keepClasses = !!options.keepClasses; + this._serializer = options.serializer || function(el) { + return el.innerHTML; + }; + this._disableJSONLD = !!options.disableJSONLD; // Start with all flags set this._flags = this.FLAG_STRIP_UNLIKELYS | @@ -131,8 +135,14 @@ Readability.prototype = { prevLink: /(prev|earl|old|new|<|«)/i, whitespace: /^\s*$/, hasContent: /\S$/, + srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, + b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, + // See: https://schema.org/Article. The group makes ^ and $ apply to every alternative, not just the first and last. + jsonLdArticleTypes: /^(?:Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference)$/ }, + UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ], + DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ], ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], @@ -155,6 +165,15 @@ Readability.prototype = { // These are the classes that readability sets itself. CLASSES_TO_PRESERVE: [ "page" ], + // Maps HTML entity names to the characters they stand for; used when unescaping. + HTML_ESCAPE_MAP: { + "lt": "<", + "gt": ">", + "amp": "&", + "quot": '"', + "apos": "'", + }, + /** * Run any post-process modifications to article content as necessary. * @@ -165,6 +184,8 @@ Readability.prototype = { // Readability cannot open relative uris so we convert them to absolute uris. 
this._fixRelativeUris(articleContent); + this._simplifyNestedElements(articleContent); + if (!this._keepClasses) { // Remove classes. this._cleanClasses(articleContent); @@ -230,6 +251,21 @@ Readability.prototype = { Array.prototype.forEach.call(nodeList, fn, this); }, + /** + * Iterate over a NodeList, and return the first node that passes + * the supplied test function + * + * For convenience, the current object context is applied to the provided + * test function. + * + * @param NodeList nodeList The NodeList. + * @param Function fn The test function. + * @return Node|undefined The first node for which fn returns a truthy value, or undefined when none does. + */ + _findNode: function(nodeList, fn) { + return Array.prototype.find.call(nodeList, fn, this); + }, + /** * Iterate over a NodeList, return true if any of the provided iterate * function calls returns true, false otherwise. @@ -328,6 +364,7 @@ Readability.prototype = { if (baseURI == documentURI && uri.charAt(0) == "#") { return uri; } + // Otherwise, resolve against base URI: try { return new URL(uri, baseURI).href; @@ -362,15 +399,56 @@ Readability.prototype = { } }); - var imgs = this._getAllNodesWithTag(articleContent, ["img"]); - this._forEachNode(imgs, function(img) { - var src = img.getAttribute("src"); + var medias = this._getAllNodesWithTag(articleContent, [ + "img", "picture", "figure", "video", "audio", "source" + ]); + + this._forEachNode(medias, function(media) { + var src = media.getAttribute("src"); + var poster = media.getAttribute("poster"); + var srcset = media.getAttribute("srcset"); + if (src) { - img.setAttribute("src", toAbsoluteURI(src)); + media.setAttribute("src", toAbsoluteURI(src)); + } + + if (poster) { + media.setAttribute("poster", toAbsoluteURI(poster)); + } + + if (srcset) { + var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) { + return toAbsoluteURI(p1) + (p2 || "") + p3; + }); + + media.setAttribute("srcset", newSrcset); } }); }, + _simplifyNestedElements: function(articleContent) { + var node = articleContent; + + while (node) { + 
if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) { + if (this._isElementWithoutContent(node)) { + node = this._removeAndGetNext(node); + continue; + } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) { + var child = node.children[0]; + for (var i = 0; i < node.attributes.length; i++) { + child.setAttribute(node.attributes[i].name, node.attributes[i].value); + } + node.parentNode.replaceChild(child, node); + node = child; + continue; + } + } + + node = this._getNextNode(node); + } + }, + /** * Get the article title as an H1. * @@ -840,6 +918,12 @@ Readability.prototype = { node = this._removeAndGetNext(node); continue; } + + if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { + this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString); + node = this._removeAndGetNext(node); + continue; + } } // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). @@ -913,7 +997,7 @@ Readability.prototype = { return; // Exclude nodes with no ancestor. - var ancestors = this._getNodeAncestors(elementToScore, 3); + var ancestors = this._getNodeAncestors(elementToScore, 5); if (ancestors.length === 0) return; @@ -1233,12 +1317,111 @@ Readability.prototype = { return false; }, + /** + * Converts some of the common HTML entities in string to their corresponding characters. + * + * @param str {string} - a string to unescape. + * @return {string} str with the named entities quot, amp, apos, lt, gt and numeric (decimal or hex, 1-4 digits) references replaced by their characters; a falsy str is returned unchanged. + */ + _unescapeHtmlEntities: function(str) { + if (!str) { + return str; + } + + var htmlEscapeMap = this.HTML_ESCAPE_MAP; + return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) { + return htmlEscapeMap[tag]; + }).replace(/&#(?:x([0-9a-f]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) { + var num = parseInt(hex || numStr, hex ? 
16 : 10); + return String.fromCharCode(num); + }); + }, + + /** + * Try to extract metadata from the first JSON-LD (application/ld+json) script in doc. + * For now, only Schema.org objects of type Article or its subtypes are supported. + * @return Object with any metadata that could be extracted (possibly none) + */ + _getJSONLD: function (doc) { + var scripts = this._getAllNodesWithTag(doc, ["script"]); + + var jsonLdElement = this._findNode(scripts, function(el) { + return el.getAttribute("type") === "application/ld+json"; + }); + + if (jsonLdElement) { + try { + // Strip CDATA markers if present + var content = jsonLdElement.textContent.replace(/^\s*<!\[CDATA\[|\]\]>\s*$/g, ""); + var parsed = JSON.parse(content); + var metadata = {}; + if ( + !parsed["@context"] || + !parsed["@context"].match(/^https?\:\/\/schema\.org$/) + ) { + return metadata; + } + + if (!parsed["@type"] && Array.isArray(parsed["@graph"])) { + parsed = parsed["@graph"].find(function(it) { + return (it["@type"] || "").match( + this.REGEXPS.jsonLdArticleTypes + ); + }, this); // thisArg so the callback can reach this.REGEXPS + } + + if ( + !parsed || + !parsed["@type"] || + !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes) + ) { + return metadata; + } + if (typeof parsed.name === "string") { + metadata.title = parsed.name.trim(); + } else if (typeof parsed.headline === "string") { + metadata.title = parsed.headline.trim(); + } + if (parsed.author) { + if (typeof parsed.author.name === "string") { + metadata.byline = parsed.author.name.trim(); + } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") { + metadata.byline = parsed.author + .filter(function(author) { + return author && typeof author.name === "string"; + }) + .map(function(author) { + return author.name.trim(); + }) + .join(", "); + } + } + if (typeof parsed.description === "string") { + metadata.excerpt = parsed.description.trim(); + } + if ( + parsed.publisher && + typeof parsed.publisher.name === "string" + ) { + metadata.siteName = parsed.publisher.name.trim(); + } + return 
metadata; + } catch (err) { + this.log(err.message); + } + } + return {}; + }, + /** * Attempts to get excerpt and byline metadata for the article. * + * @param {Object} jsonld — object containing any metadata that + * could be extracted from JSON-LD object; its title, byline, excerpt and siteName take precedence over <meta> values. + * * @return Object with optional "excerpt" and "byline" properties */ - _getArticleMetadata: function() { + _getArticleMetadata: function(jsonld) { var metadata = {}; var values = {}; var metaElements = this._doc.getElementsByTagName("meta"); @@ -1284,7 +1467,8 @@ Readability.prototype = { }); // get title - metadata.title = values["dc:title"] || + metadata.title = jsonld.title || + values["dc:title"] || values["dcterm:title"] || values["og:title"] || values["weibo:article:title"] || @@ -1297,12 +1481,14 @@ Readability.prototype = { } // get author - metadata.byline = values["dc:creator"] || + metadata.byline = jsonld.byline || + values["dc:creator"] || values["dcterm:creator"] || values["author"]; // get description - metadata.excerpt = values["dc:description"] || + metadata.excerpt = jsonld.excerpt || + values["dc:description"] || values["dcterm:description"] || values["og:description"] || values["weibo:article:description"] || @@ -1311,11 +1497,114 @@ Readability.prototype = { values["twitter:description"]; // get site name - metadata.siteName = values["og:site_name"]; + metadata.siteName = jsonld.siteName || + values["og:site_name"]; + + // in many sites the meta value is escaped with HTML entities, + // so here we need to unescape it + metadata.title = this._unescapeHtmlEntities(metadata.title); + metadata.byline = this._unescapeHtmlEntities(metadata.byline); + metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); + metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); return metadata; }, + /** + * Check if node is image, or if node contains exactly one image + * whether as a direct child or as its descendants. 
+ * + * @param Element node The element to test. + **/ + _isSingleImage: function(node) { + if (node.tagName === "IMG") { + return true; + } + + if (node.children.length !== 1 || node.textContent.trim() !== "") { + return false; + } + + return this._isSingleImage(node.children[0]); + }, + + /** + * Find all