Closed why-you-trust-me closed 6 months ago
What is your config?
What is your config?
/**
 * Crawler configuration reported in this issue.
 *
 * NOTE(review): the meaning of each field is defined by the `Config` type,
 * which is declared elsewhere — confirm against that declaration.
 */
export const defaultConfig: Config = {
  url: "https://ts.xcatliu.com/basics/index.html",
  match: "https://ts.xcatliu.com/basics/**",
  // The selector must be a quoted string; a bare `.docs-builder-container`
  // token is a syntax error.
  selector: ".docs-builder-container",
  maxPagesToCrawl: 1000,
  outputFileName: "output.json",
};
I have fixed this problem; this project can only crawl company sites.
INFO PlaywrightCrawler: Starting the crawler. WARN PlaywrightCrawler: Reclaiming failed request back to the list or queue. page.goto: net::ERR_EMPTY_RESPONSE at https://ts.xcatliu.com/basics/index.html Call log:
navigating to "https://ts.xcatliu.com/basics/index.html", waiting until "load"
{"id":"mSxkEpU7QefaoZ1","url":"https://ts.xcatliu.com/basics/index.html","retryCount":1} WARN PlaywrightCrawler: Reclaiming failed request back to the list or queue. page.goto: net::ERR_EMPTY_RESPONSE at https://ts.xcatliu.com/basics/index.html Call log:
navigating to "https://ts.xcatliu.com/basics/index.html", waiting until "load"
{"id":"mSxkEpU7QefaoZ1","url":"https://ts.xcatliu.com/basics/index.html","retryCount":2} WARN PlaywrightCrawler: Reclaiming failed request back to the list or queue. page.goto: net::ERR_EMPTY_RESPONSE at https://ts.xcatliu.com/basics/index.html Call log:
navigating to "https://ts.xcatliu.com/basics/index.html", waiting until "load"
{"id":"mSxkEpU7QefaoZ1","url":"https://ts.xcatliu.com/basics/index.html","retryCount":3} ERROR PlaywrightCrawler: Request failed and reached maximum retries. page.goto: net::ERR_EMPTY_RESPONSE at https://ts.xcatliu.com/basics/index.html Call log:
navigating to "https://ts.xcatliu.com/basics/index.html", waiting until "load"
at gotoExtended (C:\workspace\code\gptbuilder\gpt-crawler\node_modules\@crawlee\playwright\internals\utils\playwright-utils.js:154:17) at PlaywrightCrawler._navigationHandler (C:\workspace\code\gptbuilder\gpt-crawler\node_modules\@crawlee\playwright\internals\playwright-crawler.js:112:52) at PlaywrightCrawler._handleNavigation (C:\workspace\code\gptbuilder\gpt-crawler\node_modules\@crawlee\browser\internals\browser-crawler.js:299:51) at async PlaywrightCrawler._runRequestHandler (C:\workspace\code\gptbuilder\gpt-crawler\node_modules\@crawlee\browser\internals\browser-crawler.js:236:13) at async PlaywrightCrawler._runRequestHandler (C:\workspace\code\gptbuilder\gpt-crawler\node_modules\@crawlee\playwright\internals\playwright-crawler.js:109:9) at async wrap (C:\workspace\code\gptbuilder\gpt-crawler\node_modules\@apify\timeout\index.js:52:21) {"id":"mSxkEpU7QefaoZ1","url":"https://ts.xcatliu.com/basics/index.html","method":"GET","uniqueKey":"https://ts.xcatliu.com/basics/index.html"} INFO PlaywrightCrawler: All requests from the queue have been processed, the crawler will shut down. INFO PlaywrightCrawler: Final request statistics: {"requestsFinished":0,"requestsFailed":1,"retryHistogram":[null,null,null,1],"requestAvgFailedDurationMillis":1503,"requestAvgFinishedDurationMillis":null,"requestsFinishedPerMinute":0,"requestsFailedPerMinute":3,"requestTotalDurationMillis":1503,"requestsTotal":1,"crawlerRuntimeMillis":15665} INFO PlaywrightCrawler: Error analysis: {"totalErrors":1,"uniqueErrors":1,"mostCommonErrors":["1x: page.goto: net::ERR_EMPTY_RESPONSE at https://ts.xcatliu.com/basics/index.html (C:\workspace\code\gptbuilder\gpt-crawler\node_modules\@crawlee\playwright\internals\utils\playwright-utils.js:154:17)"]} INFO PlaywrightCrawler: Finished! Total 1 requests: 0 succeeded, 1 failed. {"terminal":true}