webis-de / scriptor

Plug-and-play reproducible web analysis.
MIT License
5 stars 2 forks source link

More robust snapshot script #31

Open johanneskiesel opened 2 years ago

johanneskiesel commented 2 years ago

Version of @phoerious

waitForLoadStateWithTimeout should probably be moved into pages.js.

The waiting-and-resizing part could also be moved into pages.js; it would likely be helpful for other scripts as well.

This script also covers the case of crawling several URLs at once. Not sure yet whether to keep that part. Probably yes.

const fs = require("fs-extra");
const path = require("path");

const { AbstractScriptorScript, files, pages, log } = require("@webis-de/scriptor");

// Script name and version; both are handed to the AbstractScriptorScript constructor below.
const NAME = "Snapshot";
const VERSION = "0.2.0";

/**
 * Best-effort wrapper around `page.waitForLoadState`.
 *
 * @param {object} page - Object exposing `waitForLoadState(event, options)`.
 * @param {string} event - Load state forwarded as the first argument.
 * @param {number} timeout - Forwarded as the `timeout` option.
 * @returns {Promise<*>} The value `waitForLoadState` resolves to, or `null`
 *   when the call throws or rejects for any reason.
 */
const waitForLoadStateWithTimeout = async (page, event, timeout) => {
  let outcome = null;
  try {
    outcome = await page.waitForLoadState(event, { timeout });
  } catch (ignored) {
    // Deliberately ignored: a failed wait is reported as null
  }
  return outcome;
};

module.exports = class extends AbstractScriptorScript {

  constructor() {
    super(NAME, VERSION);
  }

  async run(browserContexts, scriptDirectory, inputDirectory, outputDirectory) {
    const browserContext = browserContexts[files.BROWSER_CONTEXT_DEFAULT];

    // Script options
    const defaultScriptOptions = {
      viewportAdjust: {},
      snapshot: {
        screenshot: { timeout: 120000 }  // Screenshotting complex pages can take a very long time
      }
    };
    const requiredScriptOptions = [ "url" ];
    const scriptOptions = files.readOptions(files.getExisting(
      files.SCRIPT_OPTIONS_FILE_NAME, [ scriptDirectory, inputDirectory ]),
      defaultScriptOptions, requiredScriptOptions);
    log.info({options: scriptOptions}, "script.options");

    fs.writeJsonSync(path.join(outputDirectory, files.SCRIPT_OPTIONS_FILE_NAME), scriptOptions);

    // Load page(s)
    let url = scriptOptions["url"];
    if (typeof url === "string") {
      url = [url];
    }

    const promises = [];
    for (const [i, u] of url.entries()) {
      const page = await browserContext.newPage();
      promises.push(page.goto(u, { waitUntil: "domcontentloaded" }).then(async (resp) => {
        await waitForLoadStateWithTimeout(page, "load", 10000);

        // Adjust viewport height to scroll height to trigger loading dynamic content
        await pages.adjustViewportToPage(page, scriptOptions["viewportAdjust"]);

        // Wait for three networkidle intervals to ensure dynamic content finished loading
        for (let i = 0; i < 3; ++i) {
          await waitForLoadStateWithTimeout(page, "networkidle", 3500);
        }

        // Update viewport up to three times to accomodate for layout changes and
        // to trigger further dynamic content
        let resizes = 0;
        while (resizes < 3 && await page.viewportSize().height !== await pages.getHeight(page)) {
          await pages.adjustViewportToPage(page, scriptOptions["viewportAdjust"]);
          await waitForLoadStateWithTimeout(page, "networkidle", 2500);
          await page.waitForTimeout(250);
          ++resizes;
        }

        // Take snapshot(s)
        const snapName = url.length > 1 ? `snapshot-${i}` : "snapshot";
        await pages.takeSnapshot(page, Object.assign(
            { path: path.join(outputDirectory, snapName) }, scriptOptions["snapshot"]
        ));
      }));
    }
    await Promise.all(promises);

    return true;
  }
};
johanneskiesel commented 2 years ago

New version with a revised resize loop and a viewport reset:

        // Update viewport up to three times or until taller than 50k pixels to accomodate
        // for layout changes and to trigger further dynamic content
        for (let resizes = 0; resizes < 3; ++resizes) {
          const h = await page.viewportSize().height;
          if (h > 50000 || h === await pages.getHeight(page)) {
            // Viewport hasn't changed or already too tall
            break;
          }
          await pages.adjustViewportToPage(page, scriptOptions["viewportAdjust"]);
          await waitForLoadStateWithTimeout(page, "networkidle", 2500);
          await page.waitForTimeout(250);
        }

        // Reset to original size to avoid strange scaling effects on some pages
        await page.setViewportSize(origViewport);

=> this should really be a method of its own

johanneskiesel commented 2 years ago

Added an additional timeout and increased the existing timeouts. Added a delay after load to prevent an error on getHeight.

const { AbstractScriptorScript, files, pages, log } = require("@webis-de/scriptor");

// Identification of this script: name and version given to the base-class constructor.
const NAME = "Snapshot";
const VERSION = "0.2.0";

/**
 * Waits for a load state without ever failing.
 *
 * @param {object} page - Object exposing `waitForLoadState(event, options)`.
 * @param {string} event - Load state to wait for, passed through unchanged.
 * @param {number} timeout - Passed through as the `timeout` option.
 * @returns {Promise<*>} Result of `waitForLoadState`, or `null` if it
 *   throws or rejects.
 */
const waitForLoadStateWithTimeout = async (page, event, timeout) => {
  const options = { timeout };
  try {
    const state = await page.waitForLoadState(event, options);
    return state;
  } catch (_err) {
    // Any error counts as "did not reach the state"
    return null;
  }
};

/**
 * Best-effort wrapper around `page.waitForNavigation`.
 *
 * @param {object} page - Object exposing `waitForNavigation(options)`.
 * @param {string} waitUntil - Forwarded as the `waitUntil` option.
 * @param {number} timeout - Forwarded as the `timeout` option.
 * @returns {Promise<*>} The value `waitForNavigation` resolves to, or `null`
 *   when the call throws or rejects for any reason.
 */
const waitForNavigationWithTimeout = async (page, waitUntil, timeout) => {
  let navigation = null;
  try {
    navigation = await page.waitForNavigation({ waitUntil, timeout });
  } catch (ignored) {
    // No navigation within the timeout (or any other failure) yields null
  }
  return navigation;
};

module.exports = class extends AbstractScriptorScript {

  constructor() {
    super(NAME, VERSION);
  }

  async run(browserContexts, scriptDirectory, inputDirectory, outputDirectory) {
    const browserContext = browserContexts[files.BROWSER_CONTEXT_DEFAULT];

    // Script options
    const defaultScriptOptions = {
      viewportAdjust: {},
      snapshot: {
        screenshot: { timeout: 120000 }  // Screenshotting complex pages can take a very long time
      }
    };
    const requiredScriptOptions = [ "url" ];
    const scriptOptions = files.readOptions(files.getExisting(
      files.SCRIPT_OPTIONS_FILE_NAME, [ scriptDirectory, inputDirectory ]),
      defaultScriptOptions, requiredScriptOptions);
    log.info({options: scriptOptions}, "script.options");

    fs.writeJsonSync(path.join(outputDirectory, files.SCRIPT_OPTIONS_FILE_NAME), scriptOptions);

    // Load page(s)
    let url = scriptOptions["url"];
    if (typeof url === "string") {
      url = [url];
    }

    const promises = [];
    for (const [i, u] of url.entries()) {
      const page = await browserContext.newPage();
      const origViewport = await page.viewportSize();
      promises.push(page.goto(u, { waitUntil: "domcontentloaded" }).then(async (resp) => {
        // Wait for load and any potential navigation thereafter
        await waitForLoadStateWithTimeout(page, "load", 20000);
        await waitForNavigationWithTimeout(page, "load", 1000);

        // Adjust viewport height to scroll height to trigger loading dynamic content
        await pages.adjustViewportToPage(page, scriptOptions["viewportAdjust"]);

        // Wait for three networkidle intervals to ensure dynamic content finished loading
        for (let i = 0; i < 3; ++i) {
          await waitForLoadStateWithTimeout(page, "networkidle", 6000);
        }

        // Update viewport up to three times or until taller than 50k pixels to accomodate
        // for layout changes and to trigger further dynamic content
        for (let resizes = 0; resizes < 3; ++resizes) {
          const h = await page.viewportSize().height;
          if (h > 50000 || h === await pages.getHeight(page)) {
            // Viewport hasn't changed or already too tall
            break;
          }
          await pages.adjustViewportToPage(page, scriptOptions["viewportAdjust"]);
          await page.waitForTimeout(500);
          await waitForLoadStateWithTimeout(page, "networkidle", 6000);
        }

        // Reset to original size to avoid strange scaling effects on some pages
        await page.setViewportSize(origViewport);
        await page.waitForTimeout(500);
        await waitForLoadStateWithTimeout(page, "networkidle", 6000);

        // Take snapshot(s)
        const snapName = url.length > 1 ? `snapshot-${i}` : "snapshot";
        await pages.takeSnapshot(page, Object.assign(
            { path: path.join(outputDirectory, snapName) }, scriptOptions["snapshot"]
        ));
      }));
    }
    await Promise.all(promises);

    return true;
  }
};
phoerious commented 2 years ago

Optimised and with ad blocking:

const fs = require("fs-extra");
const path = require("path");

const { AbstractScriptorScript, files, pages, log } = require("@webis-de/scriptor");
const { PlaywrightBlocker } = require("@cliqz/adblocker-playwright");

// Name and version under which this script registers itself via the base-class constructor.
const NAME = "Snapshot";
const VERSION = "0.3.0";

/**
 * Waits for `page` to reach a load state; failures are reported as `null`
 * instead of being thrown.
 *
 * @param {object} page - Object exposing `waitForLoadState(event, options)`.
 * @param {string} event - Load state handed to `waitForLoadState`.
 * @param {number} timeout - Handed over as the `timeout` option.
 * @returns {Promise<*>} Resolution value of `waitForLoadState`, or `null`
 *   if that call throws or rejects.
 */
const waitForLoadStateWithTimeout = async (page, event, timeout) => {
  let reached;
  try {
    reached = await page.waitForLoadState(event, { timeout });
  } catch (err) {
    // Swallowed on purpose: callers only need to know the wait is over
    reached = null;
  }
  return reached;
};

/**
 * Waits for a navigation on `page`; failures are reported as `null` instead
 * of being thrown.
 *
 * @param {object} page - Object exposing `waitForNavigation(options)`.
 * @param {string} waitUntil - Handed over as the `waitUntil` option.
 * @param {number} timeout - Handed over as the `timeout` option.
 * @returns {Promise<*>} Resolution value of `waitForNavigation`, or `null`
 *   if that call throws or rejects.
 */
const waitForNavigationWithTimeout = async (page, waitUntil, timeout) => {
  const options = { waitUntil, timeout };
  try {
    const result = await page.waitForNavigation(options);
    return result;
  } catch (_err) {
    // A missing navigation is the expected case, not an error
    return null;
  }
};

/**
 * Creates the ad blocker, reusing a serialized copy from disk when present.
 *
 * If the cache file exists, the blocker is deserialized from it. Otherwise
 * the blocker is parsed from `/script/blocklist.txt` and its serialized form
 * is written to the cache file for the next run.
 *
 * @returns {PlaywrightBlocker} The blocker instance.
 * @throws If the block list cannot be read or the cache cannot be written.
 */
const loadBlocker = () => {
  // Not named `path`, so the imported `path` module stays usable in here
  const cachePath = '/tmp/scriptor/playwright-adblocker.bin';
  if (fs.existsSync(cachePath)) {
    return PlaywrightBlocker.deserialize(fs.readFileSync(cachePath));
  }
  const b = PlaywrightBlocker.parse(fs.readFileSync('/script/blocklist.txt', 'utf-8'));
  // On a first run the cache directory may not exist yet; without it the
  // write below fails with ENOENT
  fs.mkdirSync(path.dirname(cachePath), { recursive: true });
  fs.writeFileSync(cachePath, b.serialize());
  return b;
};
// Created once when the module is loaded; run() enables it on every page it opens.
const blocker = loadBlocker();

module.exports = class extends AbstractScriptorScript {

  constructor() {
    super(NAME, VERSION);
  }

  async run(browserContexts, scriptDirectory, inputDirectory, outputDirectory) {
    const browserContext = browserContexts[files.BROWSER_CONTEXT_DEFAULT];

    // Script options
    const defaultScriptOptions = {
      viewportAdjust: {},
      snapshot: {
        screenshot: { timeout: 120000 }  // Screenshotting complex pages can take a very long time
      }
    };
    const requiredScriptOptions = [ "url" ];
    const scriptOptions = files.readOptions(files.getExisting(
      files.SCRIPT_OPTIONS_FILE_NAME, [ scriptDirectory, inputDirectory ]),
      defaultScriptOptions, requiredScriptOptions);
    log.info({options: scriptOptions}, "script.options");

    fs.writeJsonSync(path.join(outputDirectory, files.SCRIPT_OPTIONS_FILE_NAME), scriptOptions);

    // Load page(s)
    let url = scriptOptions["url"];
    if (typeof url === "string") {
      url = [url];
    }

    const promises = [];
    for (const [i, u] of url.entries()) {
      const page = await browserContext.newPage();
      await blocker.enableBlockingInPage(page);
      const origViewport = await page.viewportSize();
      promises.push(page.goto(u, { waitUntil: "domcontentloaded" }).then(async (resp) => {
        // Wait for load and any potential navigation thereafter
        await waitForLoadStateWithTimeout(page, "load", 20000);
        await waitForNavigationWithTimeout(page, "load", 1000);

        // Adjust viewport height to scroll height to trigger loading dynamic content
        await pages.adjustViewportToPage(page, scriptOptions["viewportAdjust"]);

        // Wait for three networkidle intervals to ensure dynamic content finished loading
        for (let i = 0; i < 3; ++i) {
          await waitForLoadStateWithTimeout(page, "networkidle", 10000);
        }

        // Update viewport up to three times or until taller than 50k pixels to accomodate
        // for layout changes and to trigger further dynamic content
        for (let resizes = 0; resizes < 3; ++resizes) {
          const h = await page.viewportSize().height;
          if (h > 50000 || h === await pages.getHeight(page)) {
            // Viewport hasn't changed or already too tall
            break;
          }
          await pages.adjustViewportToPage(page, scriptOptions["viewportAdjust"]);
          await page.waitForTimeout(500);
          await waitForLoadStateWithTimeout(page, "networkidle", 6000);
          await waitForLoadStateWithTimeout(page, "networkidle", 6000);
        }

        // Reset to original width to avoid strange scaling effects on some pages
        await page.setViewportSize({width: origViewport.width, height: await pages.getHeight(page)});
        await page.waitForTimeout(500);
        await waitForLoadStateWithTimeout(page, "networkidle", 6000);

        // Take snapshot(s)
        const snapName = url.length > 1 ? `snapshot-${i}` : "snapshot";
        await pages.takeSnapshot(page, Object.assign(
            { path: path.join(outputDirectory, snapName) }, scriptOptions["snapshot"]
        ));
      }));
    }
    await Promise.all(promises);

    return true;
  }
};