Closed sdht0 closed 2 months ago
First, to clear up any misunderstanding, the code in the single file can be executed in Node.js or Deno. However, I've removed the puppeteer dependency and use the Chrome DevTools protocol directly with this library.
Today, you would have to update the following line of code and probably remove resourceType: "Document"
Then, you would have to add your logic before the code below in order to block unwanted requests/responses.
See here for the documentation of the API: https://chromedevtools.github.io/devtools-protocol/tot/Fetch/
Let me know if you need more information.
Thanks a lot for the pointers. I managed to implement it as below.
This allows one to block, say, all 3rd-party domains, e.g., ./single-file --page-domain-allowlist=www.sciencealert.com 'https://www.sciencealert.com/stunning-new-color-map-of-mars-reveals-surface-in-unprecedented-detail'
(Also implemented a way to exit on HTTP or other errors with different exit codes.)
Happy to open a PR if you think the domain blocking feature will be generally useful.
diff --git a/lib/cdp-client.js b/lib/cdp-client.js
index 4091692..0b7c1b1 100644
--- a/lib/cdp-client.js
+++ b/lib/cdp-client.js
@@ -27,6 +27,8 @@ import { launchBrowser, closeBrowser } from "./browser.js";
import { getScriptSource, getHookScriptSource } from "./single-file-script.js";
import { CDP, options } from "simple-cdp";
+import process from 'node:process';
+
const LOAD_TIMEOUT_ERROR = "ERR_LOAD_TIMEOUT";
const NETWORK_STATES = ["InteractiveTime", "networkIdle", "networkAlmostIdle", "load", "DOMContentLoaded"];
const MINIMIZED_WINDOW_STATE = "minimized";
@@ -106,7 +108,7 @@ async function getPageData(options) {
}
}
const handleAuthRequests = Boolean(options.httpProxyUsername);
- const patterns = handleAuthRequests ? [{ requestStage: "Request" }, { requestStage: "Response", resourceType: "Document" }] : [{ requestStage: "Response", resourceType: "Document" }];
+ const patterns = handleAuthRequests ? [{ requestStage: "Request" }, { requestStage: "Response", resourceType: "Document" }] : [{ requestStage: "Request" }, { requestStage: "Response", resourceType: "Document" }];
await Fetch.enable({ handleAuthRequests, patterns });
if (handleAuthRequests) {
Fetch.addEventListener("authRequired", ({ params }) => Fetch.continueWithAuth({
@@ -127,33 +129,43 @@ async function getPageData(options) {
let httpInfo;
Fetch.addEventListener("requestPaused", async ({ params }) => {
const { requestId, request, resourceType, responseHeaders, responseStatusCode, responseStatusText } = params;
- if (options.outputJson && !httpInfo && (request.url == url || request.url == alternativeUrl) && responseStatusCode !== undefined) {
- if (REDIRECT_STATUS_CODES.includes(responseStatusCode)) {
- const redirect = responseHeaders.find(header => header.name.toLowerCase() == "location").value;
- if (redirect) {
- url = new URL(redirect, url).href;
- }
- } else {
- httpInfo = {
- request: {
- url: request.url,
- method: request.method,
- headers: request.headers,
- referrerPolicy: request.referrerPolicy
- },
- resourceType,
- response: {
- status: responseStatusCode,
- statusText: responseStatusText,
- headers: responseHeaders
+ const host = new URL(request.url).host;
+ if (!((options.pageDomainAllowlists && options.pageDomainAllowlists.includes(host)) ||
+ (options.pageDomainBlocklists && options.pageDomainBlocklists.length > 0 && !options.pageDomainBlocklists.includes(host)))) {
+ await Fetch.failRequest({ requestId, errorReason: "ConnectionRefused"});
+ } else {
+ if (responseStatusCode !== undefined && responseStatusCode >= 400 ) {
+ console.log(`Error: code:${responseStatusCode} for '${request.url}'`);
+ process.exit(2);
+ }
+ if (options.outputJson && !httpInfo && (request.url == url || request.url == alternativeUrl) && responseStatusCode !== undefined) {
+ if (REDIRECT_STATUS_CODES.includes(responseStatusCode)) {
+ const redirect = responseHeaders.find(header => header.name.toLowerCase() == "location").value;
+ if (redirect) {
+ url = new URL(redirect, url).href;
}
- };
+ } else {
+ httpInfo = {
+ request: {
+ url: request.url,
+ method: request.method,
+ headers: request.headers,
+ referrerPolicy: request.referrerPolicy
+ },
+ resourceType,
+ response: {
+ status: responseStatusCode,
+ statusText: responseStatusText,
+ headers: responseHeaders
+ }
+ };
+ }
+ }
+ try {
+ await Fetch.continueRequest({ requestId });
+ } catch (error) {
+ // ignored
}
- }
- try {
- await Fetch.continueRequest({ requestId });
- } catch (error) {
- // ignored
}
});
if (options.httpHeaders && options.httpHeaders.length) {
diff --git a/options.js b/options.js
index 526e57c..d3dd9b2 100644
--- a/options.js
+++ b/options.js
@@ -136,6 +136,8 @@ const OPTIONS_INFO = {
"output-directory": { description: "Path to where to save files, this path must exist.", type: "string" },
"version": { description: "Print the version number and exit.", type: "boolean" },
"output-json": { description: "Output the result as a JSON string containing the page and network info", type: "boolean" },
+ "page-domain-allowlist": { description: "Only these domains will be allowed to load", type: "string[]" },
+ "page-domain-blocklist": { description: "These domains will be blocked", type: "string[]" },
"insert-single-file-comment": { description: "Insert a comment in the HTML header with the URL of the page", type: "boolean", defaultValue: true },
"resolve-links": { description: "Resolve link URLs to absolute URLs", type: "boolean", defaultValue: true },
"settings-file": { description: "Path to a JSON file containing the settings exported from the web extension", type: "string" },
diff --git a/single-file-cli-api.js b/single-file-cli-api.js
index ebef49f..375e8b5 100644
--- a/single-file-cli-api.js
+++ b/single-file-cli-api.js
@@ -26,6 +26,7 @@
import * as backend from "./lib/cdp-client.js";
import { getZipScriptSource } from "./lib/single-file-script.js";
import { Deno, path } from "./lib/deno-polyfill.js";
+import process from 'node:process';
const VALID_URL_TEST = /^(https?|file):\/\//;
@@ -301,6 +302,9 @@ async function capturePage(options) {
} else {
console.error(error.message || error, message); // eslint-disable-line no-console
}
+ if (!process.exitCode || process.exitCode === 0) {
+ process.exitCode = 1;
+ }
}
}
I'm glad to see you've managed to implement the options. I'll gladly accept a PR. Note that you can replace const patterns = handleAuthRequests ? ...
with const patterns = [{ requestStage: "Request" }, { requestStage: "Response", resourceType: "Document" }]
(conditional ternary operator removed).
In an earlier version of single-file-cli, which used puppeteer, I was able to locally implement a simple domain blocker like so:
This allows me to greatly reduce the time it takes to save certain websites by blocking all 3rd-party domains, for instance.
However, I have not yet been able to port this to the latest code. Is there a simple way to do this in deno?