// Source: src/lib/translate.js as of commit 449d0703bdee3a091976ec6d17a307ed0a11cd87
// (OverPoweredDev, Mon Jul 5 17:20:30 2021 +0530, "added to webpage translation").
// That patch replaced the earlier `$(":parent")` / `ownText()` scan inside
// translateWebpage with the block-node collection implemented below.

/**
 * Collects the block-level elements of the current page that contain visible
 * text and logs them.
 *
 * Work in progress: the language parameters are accepted but not used yet,
 * and nothing is translated or returned.
 *
 * @param {string} sourceLanguage - Language to translate from (currently unused).
 * @param {string} targetLanguage - Language to translate into (currently unused).
 * @returns {Promise<void>}
 */
async function translateWebpage(sourceLanguage, targetLanguage) {
    console.log('inside translatepage');

    const collected = [];

    // jQuery passes (index, element) to the callback; the element is the raw DOM node.
    $('body').children().each((_, child) => {
        getBlockNodes(child, collected);
    });
    const textElements = [...new Set(collected)];

    console.log(textElements);
}

// Everything below is adapted from
// https://gist.github.com/TinoDidriksen/c41c33ca5809ff297bf7b1608b3a41e2

// Node names at which the upward search for a text node's container stops.
const text_nodes = {
    'ADDRESS': true, 'ARTICLE': true, 'ASIDE': true, 'AUDIO': true, 'BLOCKQUOTE': true, 'BODY': true,
    'CANVAS': true, 'DD': true, 'DIV': true, 'DL': true, 'FIELDSET': true, 'FIGCAPTION': true, 'FIGURE': true,
    'FOOTER': true, 'FORM': true, 'H1': true, 'H2': true, 'H3': true, 'H4': true, 'H5': true, 'H6': true,
    'HEADER': true, 'HGROUP': true, 'HTML': true, 'HR': true, 'MAIN': true, 'NAV': true,
    'NOSCRIPT': true, 'OL': true, 'OUTPUT': true, 'P': true, 'PRE': true, 'SECTION': true, 'TABLE': true,
    'TD': true, 'TH': true, 'UL': true, 'VIDEO': true
};

/**
 * Finds the outermost block-level containers of every non-blank text node
 * under `body`, tags each one with a `data-replace-id` attribute equal to its
 * index in `uniqueTextNodesList`, and appends it to that list.
 *
 * Text nodes that have no usable container (no parent at all, or a topmost
 * ancestor without `setAttribute`, such as a document node) are skipped
 * instead of raising a TypeError. Containers already present in
 * `uniqueTextNodesList` are not appended again, so each `data-replace-id`
 * keeps matching the node's position in the list across repeated calls.
 *
 * @param {Node|Node[]} body - Root node, or array of root nodes, to scan.
 * @param {Element[]} uniqueTextNodesList - Accumulator; mutated in place.
 * @returns {Element[]} The same `uniqueTextNodesList` instance.
 */
function getBlockNodes(body, uniqueTextNodesList) {
    const isBlockName = (name) => Object.prototype.hasOwnProperty.call(text_nodes, name);

    // Climb from a text node to the nearest listed ancestor; if none is listed,
    // this ends at the topmost ancestor (or null when the node has no parent).
    const findContainer = (textNode) => {
        let node = textNode.parentNode;
        while (node && node.parentNode && !isBlockName(node.nodeName)) {
            node = node.parentNode;
        }
        return node;
    };

    // True when any ancestor of `node` is itself one of the candidates.
    const isNestedIn = (node, candidates) => {
        for (let ancestor = node.parentNode; ancestor; ancestor = ancestor.parentNode) {
            if (candidates.has(ancestor)) {
                return true;
            }
        }
        return false;
    };

    // A Set keeps first-seen order and makes membership checks O(1).
    const candidates = new Set();
    for (const textNode of findTextNodes(body)) {
        const container = findContainer(textNode);
        // Only nodes that can carry the tracking attribute are usable.
        if (container && typeof container.setAttribute === 'function') {
            candidates.add(container);
        }
    }

    const alreadyTracked = new Set(uniqueTextNodesList);
    for (const container of candidates) {
        // Keep only the outermost containers, and never track one twice.
        if (isNestedIn(container, candidates) || alreadyTracked.has(container)) {
            continue;
        }
        container.setAttribute('data-replace-id', uniqueTextNodesList.length);
        uniqueTextNodesList.push(container);
        alreadyTracked.add(container);
    }

    return uniqueTextNodesList;
}

/**
 * Returns, in document order, every text node under the given root(s) whose
 * value contains at least one non-whitespace character.
 *
 * @param {Node|Node[]|null|undefined} nodesList - A node or an array of nodes;
 *     null/undefined entries are ignored.
 * @returns {Node[]} The matching text nodes (possibly empty).
 */
function findTextNodes(nodesList) {
    const TEXT_NODE = 3;
    const hasVisibleText = /\S/;
    const found = [];

    const collect = (node) => {
        if (node == null) {
            return;
        }
        if (node.nodeType === TEXT_NODE) {
            if (hasVisibleText.test(node.nodeValue)) {
                found.push(node);
            }
            return;
        }
        const children = node.childNodes || [];
        for (let i = 0; i < children.length; ++i) {
            collect(children[i]);
        }
    };

    // Array.isArray replaces jQuery's deprecated $.isArray alias.
    const roots = Array.isArray(nodesList) ? nodesList : [nodesList];
    for (const root of roots) {
        collect(root);
    }
    return found;
}