Closed marcelovicentegc closed 5 months ago
// For more information, see https://crawlee.dev/ import { PlaywrightCrawler } from "crawlee"; import { readFile, writeFile } from "fs/promises"; import { glob } from "glob"; import { config } from "../config.js"; import { Page } from "playwright";
/**
 * Extracts the visible text of the element matched by `config.selector`
 * on the given page. Resolves to an empty string when nothing matches.
 */
export function getPageHtml(page: Page) {
  return page.evaluate((selector) => {
    const target = document.querySelector(selector) as HTMLElement | null;
    return target?.innerText || "";
  }, config.selector);
}
if (process.env.NO_CRAWL !== "true") { // PlaywrightCrawler crawls the web using a headless // browser controlled by the Playwright library. const crawler = new PlaywrightCrawler({ // Use the requestHandler to process each of the crawled pages. async requestHandler({ request, page, enqueueLinks, log, pushData }) {
if (config.cookie) {
// Set the cookie for the specific URL
const cookie = {
name: config.cookie.name,
value: config.cookie.value,
url: request.loadedUrl,
};
await page.context().addCookies([cookie]);
}
const title = await page.title();
log.info(`Crawling ${request.loadedUrl}...`);
await page.waitForSelector(config.selector, {
timeout: config.waitForSelectorTimeout ?? 1000,
});
const html = await getPageHtml(page);
// Save results as JSON to ./storage/datasets/default
await pushData({ title, url: request.loadedUrl, html });
if (config.onVisitPage) {
await config.onVisitPage({ page, pushData });
}
// Extract links from the current page
// and add them to the crawling queue.
await enqueueLinks({
globs: [config.match],
});
},
// Comment this option to scrape the full website.
maxRequestsPerCrawl: config.maxPagesToCrawl,
// Uncomment this option to see the browser window.
// headless: false,
});
// Add first URL to the queue and start the crawl. await crawler.run([config.url]); }
// Aggregate every per-page JSON record produced by the crawl into a single
// output file at config.outputFileName.
const jsonFiles = await glob("storage/datasets/default/*.json", {
  absolute: true,
});

// Read and parse all dataset files concurrently instead of one at a time;
// Promise.all preserves the order of jsonFiles, so the output ordering is
// identical to the previous sequential loop.
const results = await Promise.all(
  jsonFiles.map(async (file) => JSON.parse(await readFile(file, "utf-8"))),
);

await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
:tada: This PR is included in version 1.2.1 :tada:
The release is available on:
Your semantic-release bot :package::rocket:
The goal of this PR is to set up an automated test pipeline for PRs to make sure that nothing breaks with upcoming changes and to safeguard this project's quality.
This PR includes the starting point to run automated tests against all APIs (cli, config file and docker images) to guarantee that the program builds and executes correctly with each upcoming change.
It also adds a workflow (using amannn/action-semantic-pull-request) to validate the titles of PRs so that versioning is kept consistent. This is a follow-up from:
56
Related to:
47
48
88