Enable URL rewriting for GitHub license requests.
* Also enables the new headless mode in Puppeteer.
* Adds a `.puppeteerrc.cjs` so that App Engine Standard downloads the correct version of Puppeteer.
Test: curl -d '{"url": "https://github.com/google/desugar_jdk_libs/blob/master/LICENSE"}' -H 'Content-Type: application/json' -X POST 'http://localhost:8080/convert/licenses'
Fixes: b/285652313
Change-Id: I8fdc0ea21008dd4cabd5d81b28bf7bfa3ddc8079
diff --git a/development/fetchLicenses/url-transforms.ts b/development/fetchLicenses/url-transforms.ts
new file mode 100644
index 0000000..85b2430
--- /dev/null
+++ b/development/fetchLicenses/url-transforms.ts
@@ -0,0 +1,39 @@
+/**
+ * Returns the URL to fetch license text from: GitHub URLs (per `isGitHub`) are
+ * rewritten with `rawGithubUrl`; every other URL is returned unchanged.
+ */
+export function transformUrl(url: string): string {
+  // GitHub applies DDoS protection to https://github.com pages, which prevents
+  // pulling the LICENSE contents; https://raw.githubusercontent.com is used instead.
+  return isGitHub(url) ? rawGithubUrl(url) : url;
+}
+
+/**
+ * Rewrites a `https://github.com/<owner>/<repo>/blob/<ref>/<path>` URL into the
+ * matching `https://raw.githubusercontent.com/<owner>/<repo>/<ref>/<path>` URL.
+ *
+ * Only the scheme, the host and the first `blob` marker after `<owner>/<repo>`
+ * are dropped, so a repository, directory or file that is itself named `blob`
+ * (or `github.com`) is kept in the result.
+ *
+ * @param url A URL that starts with `https://github.com` (see `isGitHub`).
+ * @returns The rewritten URL; without a `blob` marker every segment is treated
+ *     as part of the repository and the result ends with a `/`.
+ */
+function rawGithubUrl(url: string): string {
+  // Empty segments (from `//` or a trailing `/`) carry no information.
+  const segments = url.split('/').filter((segment) => segment.length > 0);
+  // Drop the leading `https:` and `github.com` segments by position, not by value.
+  const parts = segments.slice(2);
+  // `<owner>` and `<repo>` occupy the first two slots, so the marker is searched
+  // from index 2 onwards.
+  const blobIndex = parts.indexOf('blob', 2);
+  const hasBlob = blobIndex >= 0;
+  const repo = hasBlob ? parts.slice(0, blobIndex) : parts;
+  const path = hasBlob ? parts.slice(blobIndex + 1) : [];
+  return `https://raw.githubusercontent.com/${repo.join('/')}/${path.join('/')}`;
+}
+
+function isGitHub(url: string): boolean {
+  return /^https:\/\/github\.com(?:[\/?#]|$)/.test(url); // host must be exactly github.com, not e.g. github.com.evil.example
+}