gildas-lormeau / single-file-cli

CLI tool for saving a faithful copy of a complete web page in a single HTML file (based on SingleFile)
GNU Affero General Public License v3.0

Add ability to intercept requests to certain domains from a webpage #114

Closed sdht0 closed 2 months ago

sdht0 commented 3 months ago

In an earlier version of single-file-cli, which used Puppeteer, I was able to implement a simple domain blocker locally, like so:

await page.setRequestInterception(true)

page.on('request', interceptedRequest => {
    const host = new URL(interceptedRequest.url()).host;
    // Allow the request if its host is on the allowlist, or if a non-empty
    // blocklist is configured and does not contain the host; abort otherwise.
    if ((options.pageDomainAllowlist && options.pageDomainAllowlist.includes(host)) ||
            (options.pageDomainBlocklist && options.pageDomainBlocklist.length > 0 && !options.pageDomainBlocklist.includes(host))) {
        interceptedRequest.continue()
    } else {
        interceptedRequest.abort()
    }
})

This lets me greatly reduce the time it takes to save certain websites, for instance by blocking all third-party domains.

However, I have not yet been able to port this to the latest code. Is there a simple way to do this in Deno?

gildas-lormeau commented 3 months ago

First, to clear up any misunderstanding, the code in single-file-cli can be executed in Node.js or Deno. However, I've removed the Puppeteer dependency and now use the Chrome DevTools Protocol directly, via the simple-cdp library.

Today, you would have to update the following line of code, and probably remove resourceType: "Document":

https://github.com/gildas-lormeau/single-file-cli/blob/adad84bab1a6f48820f1aab5492346a70f87edb4/lib/cdp-client.js#L109

Then, you would have to add your logic before the code below in order to block unwanted requests/responses.

https://github.com/gildas-lormeau/single-file-cli/blob/adad84bab1a6f48820f1aab5492346a70f87edb4/lib/cdp-client.js#L153

See here for the documentation of the API: https://chromedevtools.github.io/devtools-protocol/tot/Fetch/
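
For example, the handler could look roughly like the untested sketch below. Fetch here is the CDP Fetch domain object already used in lib/cdp-client.js, and blockedHosts is just a placeholder for whatever option you end up adding.

// Untested sketch: pause every request at the "Request" stage, then block or continue it.
const blockedHosts = ["www.googletagmanager.com"]; // placeholder for a real option
await Fetch.enable({ patterns: [{ requestStage: "Request" }] });
Fetch.addEventListener("requestPaused", async ({ params }) => {
    const { requestId, request } = params;
    const host = new URL(request.url).host;
    if (blockedHosts.includes(host)) {
        // "ConnectionRefused" is one of the Network.ErrorReason values accepted by Fetch.failRequest
        await Fetch.failRequest({ requestId, errorReason: "ConnectionRefused" });
    } else {
        await Fetch.continueRequest({ requestId });
    }
});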

Let me know if you need more information.

sdht0 commented 2 months ago

Thanks a lot for the pointers. I managed to implement it as below.

This allows one to block, say, all third-party domains, e.g. ./single-file --page-domain-allowlist=www.sciencealert.com 'https://www.sciencealert.com/stunning-new-color-map-of-mars-reveals-surface-in-unprecedented-detail'

(I also implemented a way to exit with distinct exit codes on HTTP or other errors.)

Happy to open a PR if you think the domain blocking feature will be generally useful.
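
For readability, the allow/block decision in the diff below boils down to the following predicate (hypothetical helper, extracted from the requestPaused handler; the option names match the diff):

function isHostAllowed(host, { pageDomainAllowlists, pageDomainBlocklists }) {
    // A host is allowed if it appears in the allowlist, or if a non-empty
    // blocklist is configured and does not contain it.
    return (pageDomainAllowlists && pageDomainAllowlists.includes(host)) ||
        (pageDomainBlocklists && pageDomainBlocklists.length > 0 && !pageDomainBlocklists.includes(host));
}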

diff --git a/lib/cdp-client.js b/lib/cdp-client.js
index 4091692..0b7c1b1 100644
--- a/lib/cdp-client.js
+++ b/lib/cdp-client.js
@@ -27,6 +27,8 @@ import { launchBrowser, closeBrowser } from "./browser.js";
 import { getScriptSource, getHookScriptSource } from "./single-file-script.js";
 import { CDP, options } from "simple-cdp";

+import process from 'node:process';
+
 const LOAD_TIMEOUT_ERROR = "ERR_LOAD_TIMEOUT";
 const NETWORK_STATES = ["InteractiveTime", "networkIdle", "networkAlmostIdle", "load", "DOMContentLoaded"];
 const MINIMIZED_WINDOW_STATE = "minimized";
@@ -106,7 +108,7 @@ async function getPageData(options) {
            }
        }
        const handleAuthRequests = Boolean(options.httpProxyUsername);
-       const patterns = handleAuthRequests ? [{ requestStage: "Request" }, { requestStage: "Response", resourceType: "Document" }] : [{ requestStage: "Response", resourceType: "Document" }];
+       const patterns = handleAuthRequests ? [{ requestStage: "Request" }, { requestStage: "Response", resourceType: "Document" }] : [{ requestStage: "Request" }, { requestStage: "Response", resourceType: "Document" }];
        await Fetch.enable({ handleAuthRequests, patterns });
        if (handleAuthRequests) {
            Fetch.addEventListener("authRequired", ({ params }) => Fetch.continueWithAuth({
@@ -127,33 +129,43 @@ async function getPageData(options) {
        let httpInfo;
        Fetch.addEventListener("requestPaused", async ({ params }) => {
            const { requestId, request, resourceType, responseHeaders, responseStatusCode, responseStatusText } = params;
-           if (options.outputJson && !httpInfo && (request.url == url || request.url == alternativeUrl) && responseStatusCode !== undefined) {
-               if (REDIRECT_STATUS_CODES.includes(responseStatusCode)) {
-                   const redirect = responseHeaders.find(header => header.name.toLowerCase() == "location").value;
-                   if (redirect) {
-                       url = new URL(redirect, url).href;
-                   }
-               } else {
-                   httpInfo = {
-                       request: {
-                           url: request.url,
-                           method: request.method,
-                           headers: request.headers,
-                           referrerPolicy: request.referrerPolicy
-                       },
-                       resourceType,
-                       response: {
-                           status: responseStatusCode,
-                           statusText: responseStatusText,
-                           headers: responseHeaders
+           const host = new URL(request.url).host;
+           if (!((options.pageDomainAllowlists && options.pageDomainAllowlists.includes(host)) ||
+                   (options.pageDomainBlocklists && options.pageDomainBlocklists.length > 0 && !options.pageDomainBlocklists.includes(host)))) {
+               await Fetch.failRequest({ requestId, errorReason: "ConnectionRefused"});
+           } else {
+               if (responseStatusCode !== undefined && responseStatusCode >= 400 ) {
+                   console.log(`Error: code:${responseStatusCode} for '${request.url}'`);
+                   process.exit(2);
+               }
+               if (options.outputJson && !httpInfo && (request.url == url || request.url == alternativeUrl) && responseStatusCode !== undefined) {
+                   if (REDIRECT_STATUS_CODES.includes(responseStatusCode)) {
+                       const redirect = responseHeaders.find(header => header.name.toLowerCase() == "location").value;
+                       if (redirect) {
+                           url = new URL(redirect, url).href;
                        }
-                   };
+                   } else {
+                       httpInfo = {
+                           request: {
+                               url: request.url,
+                               method: request.method,
+                               headers: request.headers,
+                               referrerPolicy: request.referrerPolicy
+                           },
+                           resourceType,
+                           response: {
+                               status: responseStatusCode,
+                               statusText: responseStatusText,
+                               headers: responseHeaders
+                           }
+                       };
+                   }
+               }
+               try {
+                   await Fetch.continueRequest({ requestId });
+               } catch (error) {
+                   // ignored
                }
-           }
-           try {
-               await Fetch.continueRequest({ requestId });
-           } catch (error) {
-               // ignored
            }
        });
        if (options.httpHeaders && options.httpHeaders.length) {
diff --git a/options.js b/options.js
index 526e57c..d3dd9b2 100644
--- a/options.js
+++ b/options.js
@@ -136,6 +136,8 @@ const OPTIONS_INFO = {
    "output-directory": { description: "Path to where to save files, this path must exist.", type: "string" },
    "version": { description: "Print the version number and exit.", type: "boolean" },
    "output-json": { description: "Output the result as a JSON string containing the page and network info", type: "boolean" },
+   "page-domain-allowlist": { description: "Only these domains will be allowed to load", type: "string[]" },
+   "page-domain-blocklist": { description: "These domains will be blocked", type: "string[]" },
    "insert-single-file-comment": { description: "Insert a comment in the HTML header with the URL of the page", type: "boolean", defaultValue: true },
    "resolve-links": { description: "Resolve link URLs to absolute URLs", type: "boolean", defaultValue: true },
    "settings-file": { description: "Path to a JSON file containing the settings exported from the web extension", type: "string" },
diff --git a/single-file-cli-api.js b/single-file-cli-api.js
index ebef49f..375e8b5 100644
--- a/single-file-cli-api.js
+++ b/single-file-cli-api.js
@@ -26,6 +26,7 @@
 import * as backend from "./lib/cdp-client.js";
 import { getZipScriptSource } from "./lib/single-file-script.js";
 import { Deno, path } from "./lib/deno-polyfill.js";
+import process from 'node:process';

 const VALID_URL_TEST = /^(https?|file):\/\//;

@@ -301,6 +302,9 @@ async function capturePage(options) {
        } else {
            console.error(error.message || error, message); // eslint-disable-line no-console
        }
+       if (!process.exitCode || process.exitCode === 0) {
+           process.exitCode = 1;
+       }
    }
 }
gildas-lormeau commented 2 months ago

I'm glad to see you've managed to implement the options. I'll gladly accept a PR. Note that you can replace const patterns = handleAuthRequests ? ... with const patterns = [{ requestStage: "Request" }, { requestStage: "Response", resourceType: "Document" }], since the conditional (ternary) operator is no longer needed.
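
With your change, both branches of the ternary request the same patterns, so the whole assignment reduces to a single line:

const patterns = [{ requestStage: "Request" }, { requestStage: "Response", resourceType: "Document" }];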