BuilderIO / gpt-crawler

Crawl a site to generate knowledge files to create your own custom GPT from a URL
https://www.builder.io/blog/custom-gpt
ISC License
18.58k stars 1.97k forks source link

Script Not Crawling Subdirectories During Website Scraping #166

Open AhsanAk opened 3 months ago

AhsanAk commented 3 months ago

I have written a script to scrape a website. The starting page lists directories, and each directory contains subdirectories. The problem is that the crawler does not follow links into the subdirectories. The website I am trying to scrape is:

https://t24-documentation.finductive.com/Solutions/T24_Transact/Accounts/

image

The script I have written:

import { Config } from "./src/config";

// Start page of the crawl. Note that it already ends with a "/".
const url = "https://t24-documentation.finductive.com/Solutions/T24_Transact/Accounts/";
// Glob for the links the crawler is allowed to follow: everything below `url`.
// Trailing slashes are stripped first so the pattern is ".../Accounts/**"
// rather than ".../Accounts//**". NOTE(review): whether a doubled slash breaks
// matching depends on the glob implementation used by the crawler — confirm;
// normalizing here removes the ambiguity either way.
const match = url.replace(/\/+$/, "") + "/**";
const fileName = "output";

/**
 * Crawler configuration for the T24 Accounts documentation tree.
 *
 * Starts at `url`, follows links matching `match`, and lists file extensions
 * in `resourceExclusions`. The page and token limits are deliberately very
 * large so they do not cut the crawl short.
 */
export const defaultConfig: Config = {
    url,
    match,
    // The list contains some repeated entries (e.g. 'ico', 'svg', 'mp3'); they are kept as-is.
    resourceExclusions: [
        'png', 'jpg', 'jpeg', 'gif', 'svg', 'css', 'js', 'ico', 'woff', 'woff2', 'ttf', 'eot', 'otf', 'mp4', 'mp3', 'webm', 'ogg', 'wav', 'flac', 'aac', 'zip', 'tar', 'gz', 'rar', '7z', 'exe', 'dmg', 'apk', 'csv', 'xls', 'xlsx', 'doc', 'docx', 'pdf', 'epub', 'iso', 'bin', 'ppt', 'pptx', 'odt', 'avi', 'mkv', 'xml', 'json', 'yml', 'yaml', 'rss', 'atom', 'swf', 'txt', 'dart', 'webp', 'bmp', 'tif', 'psd', 'ai', 'indd', 'eps', 'ps', 'zipx', 'srt', 'wasm', 'm4v', 'm4a', 'webp', 'weba', 'm4b', 'opus', 'ogv', 'ogm', 'oga', 'spx', 'ogx', 'flv', '3gp', '3g2', 'jxr', 'wdp', 'jng', 'hief', 'avif', 'apng', 'avifs', 'heif', 'heic', 'cur', 'ico', 'ani', 'jp2', 'jpm', 'jpx', 'mj2', 'wmv', 'wma', 'aac', 'tif', 'tiff', 'mpg', 'mpeg', 'mov', 'avi', 'wmv', 'flv', 'swf', 'mkv', 'm4v', 'm4p', 'm4b', 'm4r', 'm4a', 'mp3', 'wav', 'wma', 'ogg', 'oga', 'webm', '3gp', '3g2', 'flac', 'spx', 'amr', 'mid', 'midi', 'mka', 'dts', 'ac3', 'eac3', 'weba', 'm3u', 'm3u8', 'ts', 'wpl', 'pls', 'vob', 'ifo', 'bup', 'svcd', 'drc', 'dsm', 'dsv', 'dsa', 'dss', 'vivo', 'ivf', 'dvd', 'fli', 'flc', 'flic', 'mng', 'asf', 'm2v', 'asx', 'ram', 'ra', 'rm', 'rpm', 'roq', 'smi', 'smil', 'wmf', 'wmz', 'wmd', 'wvx', 'wmx', 'movie', 'wri', 'ins', 'isp', 'acsm', 'djvu', 'fb2', 'xps', 'oxps', 'ps', 'eps', 'ai', 'prn', 'svg', 'dwg', 'dxf', 'ttf', 'fnt', 'fon', 'otf', 'cab'
    ],
    maxPagesToCrawl: 1000000, // High number to ensure broad crawl
    outputFileName: fileName + ".json",
    maxFileSize: 5, // Max file size in MB
    maxTokens: 999000000, // Very high limit for tokens
};

Config: image

Output: image