tenfourfox/browser/components/translation/TranslationDocument.jsm
Cameron Kaiser c9b2922b70 hello FPR
2017-04-19 00:56:45 -07:00

681 lines
23 KiB
JavaScript

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
"use strict";
const {classes: Cc, interfaces: Ci, utils: Cu} = Components;
this.EXPORTED_SYMBOLS = [ "TranslationDocument" ];
const SHOW_ELEMENT = Ci.nsIDOMNodeFilter.SHOW_ELEMENT;
const SHOW_TEXT = Ci.nsIDOMNodeFilter.SHOW_TEXT;
const TEXT_NODE = Ci.nsIDOMNode.TEXT_NODE;
Cu.import("resource://services-common/utils.js");
Cu.import("resource://gre/modules/Task.jsm");
/**
* This class represents a document that is being translated,
* and it is responsible for parsing the document,
* generating the data structures translation (the list of
* translation items and roots), and managing the original
* and translated texts on the translation items.
*
* @param document The document to be translated
*/
this.TranslationDocument = function(document) {
this.itemsMap = new Map();
this.roots = [];
this._init(document);
};
this.TranslationDocument.prototype = {
translatedFrom: "",
translatedTo: "",
translationError: false,
originalShown: true,
/**
* Initializes the object and populates
* the roots lists.
*
* @param document The document to be translated
*/
_init: function(document) {
let window = document.defaultView;
let winUtils = window.QueryInterface(Ci.nsIInterfaceRequestor)
.getInterface(Ci.nsIDOMWindowUtils);
// Get all the translation nodes in the document's body:
// a translation node is a node from the document which
// contains useful content for translation, and therefore
// must be included in the translation process.
let nodeList = winUtils.getTranslationNodes(document.body);
let length = nodeList.length;
for (let i = 0; i < length; i++) {
let node = nodeList.item(i);
let isRoot = nodeList.isTranslationRootAtIndex(i);
// Create a TranslationItem object for this node.
// This function will also add it to the this.roots array.
this._createItemForNode(node, i, isRoot);
}
// At first all roots are stored in the roots list, and only after
// the process has finished we're able to determine which roots are
// simple, and which ones are not.
// A simple root is defined by a root with no children items, which
// basically represents an element from a page with only text content
// inside.
// This distinction is useful for optimization purposes: we treat a
// simple root as plain-text in the translation process and with that
// we are able to reduce their data payload sent to the translation service.
for (let root of this.roots) {
if (root.children.length == 0 &&
root.nodeRef.childElementCount == 0) {
root.isSimpleRoot = true;
}
}
},
/**
* Creates a TranslationItem object, which should be called
* for each node returned by getTranslationNodes.
*
* @param node The DOM node for this item.
* @param id A unique, numeric id for this item.
* @parem isRoot A boolean saying whether this item is a root.
*
* @returns A TranslationItem object.
*/
_createItemForNode: function(node, id, isRoot) {
if (this.itemsMap.has(node)) {
return this.itemsMap.get(node);
}
let item = new TranslationItem(node, id, isRoot);
if (isRoot) {
// Root items do not have a parent item.
this.roots.push(item);
} else {
let parentItem = this.itemsMap.get(node.parentNode);
if (parentItem) {
parentItem.children.push(item);
}
}
this.itemsMap.set(node, item);
return item;
},
/**
* Generate the text string that represents a TranslationItem object.
* Besides generating the string, it's also stored in the "original"
* field of the TranslationItem object, which needs to be stored for
* later to be used in the "Show Original" functionality.
* If this function had already been called for the given item (determined
* by the presence of the "original" array in the item), the text will
* be regenerated from the "original" data instead of from the related
* DOM nodes (because the nodes might contain translated data).
*
* @param item A TranslationItem object
*
* @returns A string representation of the TranslationItem.
*/
generateTextForItem: function(item) {
if (item.original) {
return regenerateTextFromOriginalHelper(item);
}
if (item.isSimpleRoot) {
let text = item.nodeRef.firstChild.nodeValue.trim();
item.original = [text];
return text;
}
let str = "";
item.original = [];
let wasLastItemPlaceholder = false;
for (let child of item.nodeRef.childNodes) {
if (child.nodeType == TEXT_NODE) {
let x = child.nodeValue.trim();
if (x != "") {
item.original.push(x);
str += x;
wasLastItemPlaceholder = false;
}
continue;
}
let objInMap = this.itemsMap.get(child);
if (objInMap && !objInMap.isRoot) {
// If this childNode is present in the itemsMap, it means
// it's a translation node: it has useful content for translation.
// In this case, we need to stringify this node.
// However, if this item is a root, we should skip it here in this
// object's child list (and just add a placeholder for it), because
// it will be stringfied separately for being a root.
item.original.push(objInMap);
str += this.generateTextForItem(objInMap);
wasLastItemPlaceholder = false;
} else {
// Otherwise, if this node doesn't contain any useful content,
// or if it is a root itself, we can replace it with a placeholder node.
// We can't simply eliminate this node from our string representation
// because that could change the HTML structure (e.g., it would
// probably merge two separate text nodes).
// It's not necessary to add more than one placeholder in sequence;
// we can optimize them away.
if (!wasLastItemPlaceholder) {
item.original.push(TranslationItem_NodePlaceholder);
str += '<br>';
wasLastItemPlaceholder = true;
}
}
}
return generateTranslationHtmlForItem(item, str);
},
/**
* Changes the document to display its translated
* content.
*/
showTranslation: function() {
this.originalShown = false;
this._swapDocumentContent("translation");
},
/**
* Changes the document to display its original
* content.
*/
showOriginal: function() {
this.originalShown = true;
this._swapDocumentContent("original");
},
/**
* Swap the document with the resulting translation,
* or back with the original content.
*
* @param target A string that is either "translation"
* or "original".
*/
_swapDocumentContent: function(target) {
Task.spawn(function *() {
// Let the event loop breath on every 100 nodes
// that are replaced.
const YIELD_INTERVAL = 100;
let count = YIELD_INTERVAL;
for (let root of this.roots) {
root.swapText(target);
if (count-- == 0) {
count = YIELD_INTERVAL;
yield CommonUtils.laterTickResolvingPromise();
}
}
}.bind(this));
}
};
/**
* This class represents an item for translation. It's basically our
* wrapper class around a node returned by getTranslationNode, with
* more data and structural information on it.
*
* At the end of the translation process, besides the properties below,
* a TranslationItem will contain two other properties: one called "original"
* and one called "translation". They are twin objects, one which reflect
* the structure of that node in its original state, and the other in its
* translated state.
*
* The "original" array is generated in the generateTextForItem function,
* and the "translation" array is generated when the translation results
* are parsed.
*
* They are both arrays, which contain a mix of strings and references to
* child TranslationItems. The references in both arrays point to the * same *
* TranslationItem object, but they might appear in different orders between the
* "original" and "translation" arrays.
*
* An example:
*
* English: <div id="n1">Welcome to <b id="n2">Mozilla's</b> website</div>
* Portuguese: <div id="n1">Bem vindo a pagina <b id="n2">da Mozilla</b></div>
*
* TranslationItem n1 = {
* id: 1,
* original: ["Welcome to", ptr to n2, "website"]
* translation: ["Bem vindo a pagina", ptr to n2]
* }
*
* TranslationItem n2 = {
* id: 2,
* original: ["Mozilla's"],
* translation: ["da Mozilla"]
* }
*/
function TranslationItem(node, id, isRoot) {
this.nodeRef = node;
this.id = id;
this.isRoot = isRoot;
this.children = [];
}
TranslationItem.prototype = {
isRoot: false,
isSimpleRoot: false,
toString: function() {
let rootType = this.isRoot
? (this.isSimpleRoot ? ' (simple root)' : ' (non simple root)')
: '';
return "[object TranslationItem: <" + this.nodeRef.localName + ">"
+ rootType + "]";
},
/**
* This function will parse the result of the translation of one translation
* item. If this item was a simple root, all we sent was a plain-text version
* of it, so the result is also straightforward text.
*
* For non-simple roots, we sent a simplified HTML representation of that
* node, and we'll first parse that into an HTML doc and then call the
* parseResultNode helper function to parse it.
*
* While parsing, the result is stored in the "translation" field of the
* TranslationItem, which will be used to display the final translation when
* all items are finished. It remains stored too to allow back-and-forth
* switching between the "Show Original" and "Show Translation" functions.
*
* @param result A string with the textual result received from the server,
* which can be plain-text or a serialized HTML doc.
*/
parseResult: function(result) {
if (this.isSimpleRoot) {
this.translation = [result];
return;
}
let domParser = Cc["@mozilla.org/xmlextras/domparser;1"]
.createInstance(Ci.nsIDOMParser);
let doc = domParser.parseFromString(result, "text/html");
parseResultNode(this, doc.body.firstChild);
},
/**
* This function finds a child TranslationItem
* with the given id.
* @param id The id to look for, in the format "n#"
* @returns A TranslationItem with the given id, or null if
* it was not found.
*/
getChildById: function(id) {
for (let child of this.children) {
if (("n" + child.id) == id) {
return child;
}
}
return null;
},
/**
* Swap the text of this TranslationItem between
* its original and translated states.
*
* @param target A string that is either "translation"
* or "original".
*/
swapText: function(target) {
swapTextForItem(this, target);
}
};
/**
* This object represents a placeholder item for translation. It's similar to
* the TranslationItem class, but it represents nodes that have no meaningful
* content for translation. These nodes will be replaced by "<br>" in a
* translation request. It's necessary to keep them to use it as a mark
* for correct positioning and spliting of text nodes.
*/
const TranslationItem_NodePlaceholder = {
toString: function() {
return "[object TranslationItem_NodePlaceholder]";
}
};
/**
* Generate the outer HTML representation for a given item.
*
* @param item A TranslationItem object.
* param content The inner content for this item.
* @returns string The outer HTML needed for translation
* of this item.
*/
function generateTranslationHtmlForItem(item, content) {
let localName = item.isRoot ? "div" : "b";
return '<' + localName + ' id=n' + item.id + '>' +
content +
"</" + localName + ">";
}
/**
* Regenerate the text string that represents a TranslationItem object,
* with data from its "original" array. The array must have already
* been created by TranslationDocument.generateTextForItem().
*
* @param item A TranslationItem object
*
* @returns A string representation of the TranslationItem.
*/
function regenerateTextFromOriginalHelper(item) {
if (item.isSimpleRoot) {
return item.original[0];
}
let str = "";
for (let child of item.original) {
if (child instanceof TranslationItem) {
str += regenerateTextFromOriginalHelper(child);
} else if (child === TranslationItem_NodePlaceholder) {
str += "<br>";
} else {
str += child;
}
}
return generateTranslationHtmlForItem(item, str);
}
/**
* Helper function to parse a HTML doc result.
* How it works:
*
* An example result string is:
*
* <div id="n1">Hello <b id="n2">World</b> of Mozilla.</div>
*
* For an element node, we look at its id and find the corresponding
* TranslationItem that was associated with this node, and then we
* walk down it repeating the process.
*
* For text nodes we simply add it as a string.
*/
function parseResultNode(item, node) {
item.translation = [];
for (let child of node.childNodes) {
if (child.nodeType == TEXT_NODE) {
item.translation.push(child.nodeValue);
} else if (child.localName == "br") {
item.translation.push(TranslationItem_NodePlaceholder);
} else {
let translationItemChild = item.getChildById(child.id);
if (translationItemChild) {
item.translation.push(translationItemChild);
parseResultNode(translationItemChild, child);
}
}
}
}
/**
* Helper function to swap the text of a TranslationItem
* between its original and translated states.
* How it works:
*
* The function iterates through the target array (either the `original` or
* `translation` array from the TranslationItem), while also keeping a pointer
* to a current position in the child nodes from the actual DOM node that we
* are modifying. This pointer is moved forward after each item of the array
* is translated. If, at any given time, the pointer doesn't match the expected
* node that was supposed to be seen, it means that the original and translated
* contents have a different ordering, and thus we need to adjust that.
*
* A full example of the reordering process, swapping from Original to
* Translation:
*
* Original (en): <div>I <em>miss</em> <b>you</b></div>
*
* Translation (fr): <div><b>Tu</b> me <em>manques</em></div>
*
* Step 1:
* pointer points to firstChild of the DOM node, textnode "I "
* first item in item.translation is [object TranslationItem <b>]
*
* pointer does not match the expected element, <b>. So let's move <b> to the
* pointer position.
*
* Current state of the DOM:
* <div><b>you</b>I <em>miss</em> </div>
*
* Step 2:
* pointer moves forward to nextSibling, textnode "I " again.
* second item in item.translation is the string " me "
*
* pointer points to a text node, and we were expecting a text node. Match!
* just replace the text content.
*
* Current state of the DOM:
* <div><b>you</b> me <em>miss</em> </div>
*
* Step 3:
* pointer moves forward to nextSibling, <em>miss</em>
* third item in item.translation is [object TranslationItem <em>]
*
* pointer points to the expected node. Match! Nothing to do.
*
* Step 4:
* all items in this item.translation were transformed. The remaining
* text nodes are cleared to "", and domNode.normalize() removes them.
*
* Current state of the DOM:
* <div><b>you</b> me <em>miss</em></div>
*
* Further steps:
* After that, the function will visit the child items (from the visitStack),
* and the text inside the <b> and <em> nodes will be swapped as well,
* yielding the final result:
*
* <div><b>Tu</b> me <em>manques</em></div>
*
*
* @param item A TranslationItem object
* @param target A string that is either "translation"
* or "original".
*/
function swapTextForItem(item, target) {
// visitStack is the stack of items that we still need to visit.
// Let's start the process by adding the root item.
let visitStack = [ item ];
let source = target == "translation" ? "original" : "translation";
while (visitStack.length > 0) {
let curItem = visitStack.shift();
let domNode = curItem.nodeRef;
if (!domNode) {
// Skipping this item due to a missing node.
continue;
}
if (!curItem[target]) {
// Translation not found for this item. This could be due to
// an error in the server response. For example, if a translation
// was broken in various chunks, and one of the chunks failed,
// the items from that chunk will be missing its "translation"
// field.
continue;
}
domNode.normalize();
// curNode points to the child nodes of the DOM node that we are
// modifying. During most of the process, while the target array is
// being iterated (in the for loop below), it should walk together with
// the array and be pointing to the correct node that needs to modified.
// If it's not pointing to it, that means some sort of node reordering
// will be necessary to produce the correct translation.
// Note that text nodes don't need to be reordered, as we can just replace
// the content of one text node with another.
//
// curNode starts in the firstChild...
let curNode = domNode.firstChild;
// ... actually, let's make curNode start at the first useful node (either
// a non-blank text node or something else). This is not strictly necessary,
// as the reordering algorithm would correctly handle this case. However,
// this better aligns the resulting translation with the DOM content of the
// page, avoiding cases that would need to be unecessarily reordered.
//
// An example of how this helps:
//
// ---- Original: <div> <b>Hello </b> world.</div>
// ^textnode 1 ^item 1 ^textnode 2
//
// - Translation: <div><b>Hallo </b> Welt.</div>
//
// Transformation process without this optimization:
// 1 - start pointer at textnode 1
// 2 - move item 1 to first position inside the <div>
//
// Node now looks like: <div><b>Hello </b>[ ][ world.]</div>
// textnode 1^ ^textnode 2
//
// 3 - replace textnode 1 with " Welt."
// 4 - clear remaining text nodes (in this case, textnode 2)
//
// Transformation process with this optimization:
// 1 - start pointer at item 1
// 2 - item 1 is already in position
// 3 - replace textnode 2 with " Welt."
//
// which completely avoids any node reordering, and requires only one
// text change instead of two (while also leaving the page closer to
// its original state).
while (curNode &&
curNode.nodeType == TEXT_NODE &&
curNode.nodeValue.trim() == "") {
curNode = curNode.nextSibling;
}
// Now let's walk through all items in the `target` array of the
// TranslationItem. This means either the TranslationItem.original or
// TranslationItem.translation array.
for (let targetItem of curItem[target]) {
if (targetItem instanceof TranslationItem) {
// If the array element is another TranslationItem object, let's
// add it to the stack to be visited.
visitStack.push(targetItem);
let targetNode = targetItem.nodeRef;
// If the node is not in the expected position, let's reorder
// it into position...
if (curNode != targetNode &&
// ...unless the page has reparented this node under a totally
// different node (or removed it). In this case, all bets are off
// on being able to do anything correctly, so it's better not to
// bring back the node to this parent.
targetNode.parentNode == domNode) {
// We don't need to null-check curNode because insertBefore(..., null)
// does what we need in that case: reorder this node to the end
// of child nodes.
domNode.insertBefore(targetNode, curNode);
curNode = targetNode;
}
// Move pointer forward. Since we do not add empty text nodes to the
// list of translation items, we must skip them here too while
// traversing the DOM in order to get better alignment between the
// text nodes and the translation items.
if (curNode) {
curNode = getNextSiblingSkippingEmptyTextNodes(curNode);
}
} else if (targetItem === TranslationItem_NodePlaceholder) {
// If the current item is a placeholder node, we need to move
// our pointer "past" it, jumping from one side of a block of
// elements + empty text nodes to the other side. Even if
// non-placeholder elements exists inside the jumped block,
// they will be pulled correctly later in the process when the
// targetItem for those nodes are handled.
while (curNode &&
(curNode.nodeType != TEXT_NODE ||
curNode.nodeValue.trim() == "")) {
curNode = curNode.nextSibling;
}
} else {
// Finally, if it's a text item, we just need to find the next
// text node to use. Text nodes don't need to be reordered, so
// the first one found can be used.
while (curNode && curNode.nodeType != TEXT_NODE) {
curNode = curNode.nextSibling;
}
// If none was found and we reached the end of the child nodes,
// let's create a new one.
if (!curNode) {
// We don't know if the original content had a space or not,
// so the best bet is to create the text node with " " which
// will add one space at the beginning and one at the end.
curNode = domNode.appendChild(domNode.ownerDocument.createTextNode(" "));
}
// A trailing and a leading space must be preserved because
// they are meaningful in HTML.
let preSpace = /^\s/.test(curNode.nodeValue) ? " " : "";
let endSpace = /\s$/.test(curNode.nodeValue) ? " " : "";
curNode.nodeValue = preSpace + targetItem + endSpace;
curNode = getNextSiblingSkippingEmptyTextNodes(curNode);
}
}
// The translated version of a node might have less text nodes than its
// original version. If that's the case, let's clear the remaining nodes.
if (curNode) {
clearRemainingNonEmptyTextNodesFromElement(curNode);
}
// And remove any garbage "" nodes left after clearing.
domNode.normalize();
}
}
function getNextSiblingSkippingEmptyTextNodes(startSibling) {
let item = startSibling.nextSibling;
while (item &&
item.nodeType == TEXT_NODE &&
item.nodeValue.trim() == "") {
item = item.nextSibling;
}
return item;
}
function clearRemainingNonEmptyTextNodesFromElement(startSibling) {
let item = startSibling;
while (item) {
if (item.nodeType == TEXT_NODE &&
item.nodeValue != "") {
item.nodeValue = "";
}
item = item.nextSibling;
}
}