n4ze3m / dialoqbase

Create chatbots with ease
https://dialoqbase.n4ze3m.com/
MIT License

Crawler or Sitemap is too slow #289

Open edwhiskey opened 1 month ago

edwhiskey commented 1 month ago

The Crawler or Sitemap is too slow. Any chance to improve it to allow a multi-site crawler that can use more CPU and GPU power?
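To illustrate what I mean by using more CPU: as far as I can tell pages are fetched more or less one at a time, and even without extra threads, fetching a batch of URLs concurrently should already help. A rough sketch with placeholder URLs, not the actual dialoqbase code:

import axios from "axios";

// Sequential: each request waits for the previous one to finish.
const fetchSequential = async (urls: string[]): Promise<string[]> => {
  const pages: string[] = [];
  for (const url of urls) {
    const res = await axios.get<string>(url);
    pages.push(res.data);
  }
  return pages;
};

// Concurrent: the whole batch is requested at once, failed requests are skipped.
const fetchBatch = async (urls: string[]): Promise<string[]> => {
  const settled = await Promise.allSettled(urls.map((url) => axios.get<string>(url)));
  const pages: string[] = [];
  for (const r of settled) {
    if (r.status === "fulfilled") {
      pages.push(r.value.data);
    }
  }
  return pages;
};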

edwhiskey commented 1 month ago

I edited the TS as below, and now I get much better performance from the crawler:

import axios from "axios";
import { CheerioAPI, load } from "cheerio";
import puppeteerFetch from "./puppeteer-fetch";
import { Worker, isMainThread, parentPort, workerData } from "worker_threads";

type CrawlResult = {
  links: Set<string>;
  errors: Set<string>;
};

const visitedLinks: Set<string> = new Set();
const errorLinks: Set<string> = new Set();
const queuedLinks: Set<string> = new Set();

const crawlPage = async (url: string) => {
  // Your crawling logic here
  const html = await axios.get(url);
  const $ = load(html.data);
  const links = new Set<string>();
  // Add link to set
  return { links };
};

import axios from "axios";
import { CheerioAPI, load } from "cheerio";
import puppeteerFetch from "./puppeteer-fetch";
import os from "os";
import { Worker, isMainThread, parentPort, workerData } from "worker_threads";

type CrawlResult = {
  links: Set<string>;
};

const crawlPage = async (url: string) => {
  // Your crawling logic here
  const html = await axios.get(url);
  const $ = load(html.data);
  const links = new Set<string>();
  // Add link to set
  return { links };
};

const crawlUrls = async (urls: string[]) => {
  const results: CrawlResult[] = [];
  for (const url of urls) {
    const result = await crawlPage(url);
    results.push(result);
  }
  return results;
};

if (!isMainThread) {
  // In worker thread, get the chunk of URLs from the parent
  const { crawlUrl } = workerData;

  crawlUrls(crawlUrl)
    .then((results) => {
      // Report the results back to the main thread
      parentPort?.postMessage(results);
    })
    .catch((err) => {
      console.error(err);
    });
} else {
  // In main thread, spawn one worker per CPU core
  const urlsToCrawl = ["url1", "url2" /* ... */];
  const numWorkers = os.cpus().length;
  const chunkSize = Math.ceil(urlsToCrawl.length / numWorkers);

  for (let i = 0; i < numWorkers; i++) {
    const start = i * chunkSize;
    const end = Math.min(start + chunkSize, urlsToCrawl.length);
    const chunkUrls = urlsToCrawl.slice(start, end);
    const worker = new Worker(__filename, {
      workerData: { crawlUrl: chunkUrls },
    });
    worker.on("message", (result) => {
      console.log(result);
    });
  }
}

export const crawl = async (
  startUrl: string,
  maxDepth = 2,
  maxLinks = 20,
  usePuppeteerFetch = false
): Promise<CrawlResult> => {
  const queue: { url: string; depth: number }[] = [{ url: startUrl, depth: 0 }];
  const fetchedLinks: Set<string> = new Set();

  while (queue.length > 0 && visitedLinks.size < maxLinks) {
    const batch = queue.splice(0, Math.min(queue.length, maxLinks - visitedLinks.size));

    await Promise.all(
      batch.map(async ({ url, depth }) => {
        if (visitedLinks.has(url) || depth > maxDepth) {
          return;
        }

        try {
          let $: CheerioAPI;

          if (usePuppeteerFetch) {
            const response = await puppeteerFetch(url);
            $ = load(response);
          } else {
            const response = await axios.get(url, {
              headers: {
                Accept: "text/html",
                "User-Agent":
                  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
              },
            });

            const contentType = response.headers["content-type"];
            if (!contentType || !contentType.includes("text/html")) {
              return;
            }

            $ = load(response.data);
          }

          visitedLinks.add(url);
          fetchedLinks.add(url);

          $("a").each((_, element) => {
            const href = $(element).attr("href");
            if (!href) {
              return;
            }

            const absoluteUrl = normalizeUrl(new URL(href, url).href);
            if (
              isSameDomain(absoluteUrl, startUrl) &&
              !visitedLinks.has(absoluteUrl) &&
              !queuedLinks.has(absoluteUrl)
            ) {
              queue.push({ url: absoluteUrl, depth: depth + 1 });
              queuedLinks.add(absoluteUrl);
            }
          });
        } catch (error: any) {
          console.error(`Failed to fetch ${url}:`, error?.message || error);
          errorLinks.add(url);
        }
      })
    );
  }

  return { links: fetchedLinks, errors: errorLinks };
};
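For what it's worth, this is roughly how I exercise the batched version while testing; the URL and limits are just placeholders:

const run = async () => {
  // crawl(startUrl, maxDepth, maxLinks, usePuppeteerFetch)
  const { links, errors } = await crawl("https://example.com", 2, 50, false);
  console.log(`fetched ${links.size} pages, ${errors.size} errors`);
};

run();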

const isSameDomain = (url1: string, url2: string): boolean => {
  const { hostname: host1 } = new URL(url1);
  const { hostname: host2 } = new URL(url2);
  return host1 === host2;
};

const normalizeUrl = (url: string): string => {
  try {
    const urlObj = new URL(url);
    urlObj.hash = "";
    return urlObj.href;
  } catch (error) {
    return url;
  }
};
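A quick note on the two helpers, since they decide which links get queued: normalizeUrl only strips the hash fragment, and isSameDomain compares hostnames, so subdomains count as different sites. The URLs below are just placeholder examples:

normalizeUrl("https://example.com/docs#intro");              // "https://example.com/docs"
isSameDomain("https://example.com/a", "https://example.com/b");   // true
isSameDomain("https://example.com", "https://blog.example.com");  // false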