code4craft / webmagic

A scalable web crawler framework for Java.
http://webmagic.io/
Apache License 2.0
11.37k stars 4.18k forks source link

爬取百度百科需要耗时100多秒，请问怎么解决？ #1160

Open yidasanqian opened 4 months ago

yidasanqian commented 4 months ago

日志: image

代码:

/**
 * Times a single crawl of one Baidu Baike page through a {@code PhantomJSDownloader}.
 *
 * <p>The test makes no assertions; it only prints the elapsed time in whole seconds.
 */
public class SpiderTest {

    /** Directory holding both the PhantomJS executable and the {@code crawl.js} script. */
    private static final String PHANTOMJS_DIR = "/data/webdriver/phantomjs/";

    private PhantomJSDownloader downloader;

    /**
     * Builds the downloader, picking the executable name from the {@code os.name}
     * system property.
     */
    @BeforeEach
    public void setUp() {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
        String osName = System.getProperty("os.name").toLowerCase(java.util.Locale.ROOT);
        // Only the executable name differs between platforms; flags and script path are shared.
        String executable = osName.contains("win") ? "phantomjs.exe" : "phantomjs";
        downloader = new PhantomJSDownloader(
                PHANTOMJS_DIR + executable + " --ignore-ssl-errors=yes",
                PHANTOMJS_DIR + "crawl.js");
    }

    /**
     * Crawls one fixed URL with the configured downloader and prints how long it took.
     */
    @Test
    public void testSpider() {
        // nanoTime is monotonic, so the measurement is unaffected by wall-clock adjustments.
        long start = System.nanoTime();

        WebPageResult webPageResult = new WebPageResult();
        webPageResult.setNo(0);
        webPageResult.setUrl("https://baike.baidu.com/item/2023%E5%B9%B410%E6%9C%88%E5%B7%B4%E4%BB%A5%E5%86%B2%E7%AA%81/63565377");
        List<WebPageResult> webPageResults = new ArrayList<>();
        webPageResults.add(webPageResult);

        // The returned items are not inspected; only the elapsed time is reported.
        Spider.create(new WebPageProcessor(webPageResults))
                .setDownloader(downloader)
                .get(webPageResult.getUrl());

        // Integer division truncates to whole seconds.
        long cost = (System.nanoTime() - start) / 1_000_000_000L;
        System.out.println("SpiderTest.testSpider cost: " + cost + "s");
    }
}

image