blob: ec801aa44b78b37b56ff17b74de7371f4f25e30f [file] [log] [blame]
/*
* Copyright 2019 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { Request, Response } from 'express';
import puppeteer = require('puppeteer');
import { log } from './logger';
import { ContentNode } from './types';
import { PlainTextFormatter } from './plain_text_formatter';
import { transformUrl } from './url-transforms';
/** Command-line switches passed to Chrome when Puppeteer launches it. */
const CHROME_LAUNCH_ARGS: string[] = ['--enable-dom-distiller'];

/**
 * Shape of a lookup table keyed by DOM node name. A name that is present
 * (mapped to `true`) marks a node type that is usually not useful in the
 * context of fetching text content from the page.
 */
type BannedNames = Record<string, true>;
/**
 * Express handler for a license request.
 *
 * Reads the target address from `request.body.url`, checks its protocol,
 * fetches the page's content nodes and replies with their plain-text rendering.
 *
 * Responses:
 * - `200` with the plain-text content on success.
 * - `400 'URL required'` when the body is missing or `url` is not a
 *   non-empty string.
 * - `400 'Invalid request.'` when the protocol check rejects the URL.
 * - `400 'Something bad happened. Check the logs'` when anything throws
 *   (the error itself is logged).
 *
 * @param request Incoming request; its body is expected to carry a `url` field.
 * @param response Response used to send the status code and payload.
 */
export async function handleRequest(request: Request, response: Response) {
  // `request.body` may be undefined (for example when no body-parsing
  // middleware populated it). Read it defensively so a missing body yields a
  // 400 instead of a TypeError that would reject this handler before any
  // response is sent.
  const body = request.body;
  const url: unknown = body == null ? undefined : body.url;

  // The value comes straight from the client: only a non-empty string is a
  // usable URL.
  if (typeof url !== 'string' || url.length === 0) {
    response.status(400).send('URL required');
    return;
  }

  try {
    log(`Handling license request for ${url}`);
    if (!isValidProtocol(url)) {
      response.status(400).send('Invalid request.');
      return;
    }
    const nodes = await handleLicenseRequest(url);
    const content = PlainTextFormatter.plainTextFor(nodes);
    response.status(200).send(content);
  } catch (error) {
    log('Error handling license request ', error);
    response.status(400).send('Something bad happened. Check the logs');
  }
}
/**
 * Validates the protocol. Only allows `https?` requests.
 *
 * A string that cannot be parsed as a URL is reported as invalid instead of
 * raising, so callers always receive a boolean.
 *
 * @param requestUrl The request url
 * @return `true` if the url parses and its protocol is `http:` or `https:`.
 */
function isValidProtocol(requestUrl: string): boolean {
  let protocol: string;
  try {
    // The URL constructor throws on input it cannot parse.
    protocol = new URL(requestUrl).protocol;
  } catch (error) {
    log(`Unable to parse url ${requestUrl}`, error);
    return false;
  }

  if (protocol === 'https:' || protocol === 'http:') {
    return true;
  }

  log(`Invalid protocol ${protocol}`);
  return false;
}
/**
 * Loads a page in headless Chrome through Puppeteer and collects its text.
 *
 * The url is first passed through `transformUrl`; the transformed address is
 * the one that gets loaded. Once the DOM content has loaded, the page body is
 * walked and every node carrying non-empty text is returned as a flat list of
 * `{ localName, textContent }` entries in document order.
 *
 * The browser is always closed before this function settles, including when
 * navigation or evaluation fails.
 *
 * @param url The page to fetch.
 * @param enableLocalDebugging When `true`, opens devtools and forwards the
 *     page's console messages to the log.
 * @return The flattened list of content nodes that have text.
 */
async function handleLicenseRequest(url: string, enableLocalDebugging: boolean = false): Promise<ContentNode[]> {
  const transformed = transformUrl(url);
  if (url !== transformed) {
    log(`Transformed request url to ${transformed}`);
  }
  const browser = await puppeteer.launch({
    args: CHROME_LAUNCH_ARGS,
    devtools: enableLocalDebugging,
    // https://developer.chrome.com/articles/new-headless/
    headless: true
  });
  try {
    const page = await browser.newPage();
    if (enableLocalDebugging) {
      page.on('console', (message) => {
        log(`Puppeteer: ${message.text()}`);
      });
    }
    await page.goto(transformed, { waitUntil: 'domcontentloaded' });
    // The callback below is evaluated in the page (it uses `document`), so
    // every helper and constant it relies on is declared inside it.
    const content = await page.evaluate(() => {
      // A map of banned nodes: a node whose lower-cased name appears here is
      // skipped together with its whole subtree.
      const BANNED_LOCAL_NAMES: BannedNames = {
        'button': true,
        'canvas': true,
        'footer': true,
        'header': true,
        'code': true,
        'img': true,
        'nav': true,
        'script': true,
        'style': true,
        'svg': true,
      };

      // node list handler: converts each entry and drops the rejected ones.
      function contentForNodeList(list: NodeList | null | undefined): ContentNode[] {
        const contentNodes: ContentNode[] = [];
        if (!list) {
          return contentNodes;
        }
        for (let i = 0; i < list.length; i += 1) {
          const node = contentForNode(list.item(i));
          if (node) {
            contentNodes.push(node);
          }
        }
        return contentNodes;
      }

      // content handler: depth-first flattening that keeps only the nodes
      // with non-empty text, parents before their children.
      const contentWithPath = function (node: ContentNode, accumulator: ContentNode[]) {
        if (node.textContent && node.textContent.length > 0) {
          accumulator.push({ localName: node.localName, textContent: node.textContent });
        }
        if (node.children) {
          for (let i = 0; i < node.children.length; i += 1) {
            contentWithPath(node.children[i], accumulator);
          }
        }
      };

      // node handler: builds the content tree for one DOM node, or returns
      // null when the node should be ignored.
      function contentForNode(node: Node | null | undefined) {
        if (!node) {
          return null;
        }
        const name = node.nodeName.toLowerCase();
        // Check if node is banned.
        if (name && BANNED_LOCAL_NAMES[name] === true) {
          return null;
        }
        // Shallow clone node, as we are only interested in the textContent
        // of the node itself, and not the text of its child nodes (those are
        // visited separately below).
        const cloned = node.cloneNode();
        const localName = name;
        // Handle elements of different types
        if (cloned instanceof HTMLAnchorElement) {
          // anchor element
          // Ensure that it has reasonable href content
          // NOTE(review): the `href` property of an anchor is the resolved
          // URL, so the comparison with '#' probably never matches;
          // `getAttribute('href')` may have been intended -- confirm.
          const href = cloned.href;
          if (href.length <= 0 || href === '#') {
            return null;
          }
        }
        const textContent = cloned.textContent;
        const children = contentForNodeList(node.childNodes);
        return {
          localName: localName,
          textContent: textContent,
          children: children
        };
      }

      const body = document.querySelector('body');
      const nodes: ContentNode[] =
        body == null ? [] : contentForNodeList(body.childNodes);
      // Accumulate nodes with content
      const accumulator: ContentNode[] = [];
      for (let i = 0; i < nodes.length; i += 1) {
        const node = nodes[i];
        contentWithPath(node, accumulator);
      }
      return accumulator;
    });
    return content;
  } finally {
    // Close the browser on every exit path; otherwise a failed navigation or
    // evaluation would leave the Chrome process running.
    await browser.close();
  }
}