edwhiskey opened 3 months ago
I edited the TypeScript as below, and now it gets much better performance on the crawler:
```ts
import axios from "axios";
import { CheerioAPI, load } from "cheerio";
import os from "os";
import puppeteerFetch from "./puppeteer-fetch";
import { Worker, isMainThread, parentPort, workerData } from "worker_threads";

type CrawlResult = {
  links: Set<string>;
};

const visitedLinks: Set<string> = new Set();

const crawlPage = async (url: string): Promise<CrawlResult> => {
  // Your crawling logic here
  const html = await axios.get(url);
  const $ = load(html.data);
  const links = new Set<string>();
  // Add links to the set
  return { links };
};

const crawlUrls = async (urls: string[]) => {
  const results: CrawlResult[] = [];
  for (const url of urls) {
    const result = await crawlPage(url);
    results.push(result);
  }
  return results;
};

if (!isMainThread) {
  // In a worker thread: receive a chunk of URLs from the parent
  const { urls } = workerData as { urls: string[] };

  crawlUrls(urls)
    .then((results) => {
      // Send the results back to the main thread
      parentPort?.postMessage(results);
    })
    .catch((err) => {
      console.error(err);
    });
} else {
  // In the main thread: split the URL list and spawn one worker per CPU core
  const urlsToCrawl = ["url1", "url2" /* ... */];
  const numWorkers = os.cpus().length;
  const chunkSize = Math.ceil(urlsToCrawl.length / numWorkers);

  for (let i = 0; i < numWorkers; i++) {
    const start = i * chunkSize;
    const end = Math.min(start + chunkSize, urlsToCrawl.length);
    const urls = urlsToCrawl.slice(start, end);
    const worker = new Worker(__filename, { workerData: { urls } });
    worker.on("message", (result) => {
      console.log(result);
    });
  }
}
export const crawl = async (
  startUrl: string,
  maxDepth = 2,
  maxLinks = 20,
  usePuppeteerFetch = false
): Promise<{ links: Set<string>; errors: Set<string> }> => {
  const queue: { url: string; depth: number }[] = [{ url: startUrl, depth: 0 }];
  const queuedLinks = new Set<string>([startUrl]);
  const fetchedLinks = new Set<string>();
  const errorLinks = new Set<string>();
  while (queue.length > 0 && visitedLinks.size < maxLinks) {
    const batch = queue.splice(0, Math.min(queue.length, maxLinks - visitedLinks.size));

    await Promise.all(
      batch.map(async ({ url, depth }) => {
        // Skip pages we've already seen or that are beyond the depth limit
        if (visitedLinks.has(url) || depth > maxDepth) {
          return;
        }
        try {
          let $: CheerioAPI;
          if (usePuppeteerFetch) {
            // Rendered fetch for JavaScript-heavy pages
            const response = await puppeteerFetch(url);
            $ = load(response);
          } else {
            const response = await axios.get(url, {
              headers: {
                Accept: "text/html",
                "User-Agent":
                  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
              },
            });
            const contentType = response.headers["content-type"];
            if (!contentType || !contentType.includes("text/html")) {
              // Skip non-HTML responses (PDFs, images, etc.)
              return;
            }
            $ = load(response.data);
          }
          visitedLinks.add(url);
          fetchedLinks.add(url);
          // Collect same-domain links for the next depth level
          $("a").each((_, element) => {
            const href = $(element).attr("href");
            if (!href) {
              return;
            }
            const absoluteUrl = normalizeUrl(new URL(href, url).href);
            if (
              isSameDomain(absoluteUrl, startUrl) &&
              !visitedLinks.has(absoluteUrl) &&
              !queuedLinks.has(absoluteUrl)
            ) {
              queue.push({ url: absoluteUrl, depth: depth + 1 });
              queuedLinks.add(absoluteUrl);
            }
          });
        } catch (error: any) {
          console.error(`Failed to fetch ${url}:`, error?.message || error);
          errorLinks.add(url);
        }
      })
    );
  }
  return { links: fetchedLinks, errors: errorLinks };
};
const isSameDomain = (url1: string, url2: string): boolean => {
  const { hostname: host1 } = new URL(url1);
  const { hostname: host2 } = new URL(url2);
  return host1 === host2;
};

const normalizeUrl = (url: string): string => {
  try {
    const urlObj = new URL(url);
    urlObj.hash = "";
    return urlObj.href;
  } catch (error) {
    return url;
  }
};
```
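The `./puppeteer-fetch` helper isn't included in the snippet. A minimal sketch of what it could look like, assuming it resolves to the rendered HTML as a string (the `waitUntil` setting is my guess, not from the original):

```ts
import puppeteer from "puppeteer";

// Fetch a page with a headless browser and return the rendered HTML.
const puppeteerFetch = async (url: string): Promise<string> => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "networkidle2" });
    return await page.content();
  } finally {
    await browser.close();
  }
};

export default puppeteerFetch;
```

And a minimal usage sketch of `crawl` (the URL and limits are placeholders):

```ts
crawl("https://example.com", 2, 50, false).then(({ links, errors }) => {
  console.log(`Fetched ${links.size} pages, ${errors.size} errors`);
});
```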
The crawler / sitemap generation is still too slow. Is there any chance to improve it into a multi-site crawler that can use more CPU (or even GPU) power?
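For example, something along these lines is what I have in mind: one worker per site, each running `crawl` independently (a rough sketch; the module path and site URLs are placeholders):

```ts
import { Worker, isMainThread, parentPort, workerData } from "worker_threads";
import { crawl } from "./crawler"; // hypothetical path to the module above

if (isMainThread) {
  const sites = ["https://site-a.example", "https://site-b.example"]; // placeholders
  for (const site of sites) {
    // One worker per site, each crawling its own domain
    const worker = new Worker(__filename, { workerData: { site } });
    worker.on("message", (summary) => console.log(site, summary));
    worker.on("error", (err) => console.error(site, err));
  }
} else {
  const { site } = workerData as { site: string };
  crawl(site).then(({ links, errors }) => {
    // Convert Sets to arrays before posting back to the main thread
    parentPort?.postMessage({ links: [...links], errors: [...errors] });
  });
}
```

That said, the work here is mostly network-bound rather than CPU-bound, so raising the per-site fetch concurrency may matter as much as adding threads.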