code4craft / webmagic

A scalable web crawler framework for Java.
http://webmagic.io/
Apache License 2.0
11.43k stars 4.18k forks source link

我按照官方文档的方式配置代理后,以多个线程启动爬虫时,总是会出现 SSL 异常 javax.net.ssl.SSLException: Received fatal alert: internal_error;单线程启动时不会出现 #1022

Open aya-momo opened 3 years ago

AzQiang97 commented 3 years ago

解决了吗?

aya-momo commented 3 years ago

没有

在 2021-09-17 14:17:18,"AzQiang97" @.***> 写道:

解决了吗?

— You are receiving this because you authored the thread. Reply to this email directly, view it on GitHub, or unsubscribe. Triage notifications on the go with GitHub Mobile for iOS or Android.

Tiger-ygq commented 2 years ago

添加下面两个 Java 文件,然后把用例的 main 方法换成第三个文件里的 main 方法就好了 `package com.ygq.demo01.utils;

import java.io.IOException; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map;

import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory;

import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.AbstractDownloader; import us.codecraft.webmagic.downloader.HttpClientRequestContext; import us.codecraft.webmagic.downloader.HttpUriRequestConverter; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpClientUtils;

/**

import java.io.IOException; import java.security.KeyManagementException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.util.Map;

import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager;

import org.apache.commons.lang3.JavaVersion; import org.apache.commons.lang3.SystemUtils; import org.apache.http.HttpException; import org.apache.http.HttpRequest; import org.apache.http.HttpRequestInterceptor; import org.apache.http.client.CookieStore; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.config.SocketConfig; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory;

import us.codecraft.webmagic.Site; import us.codecraft.webmagic.downloader.CustomRedirectStrategy;

/**

} package com.ygq.demo01.domain;

import com.ygq.demo01.controller.GithubRepoPageProcessor; import com.ygq.demo01.utils.HttpClientDownloader; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.Pipeline;

@TargetUrl("https://github.com/\\w+/\\w+") @HelpUrl("https://github.com/\\w+") public class GithubRepo {

@ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
private String name;

@ExtractByUrl("https://github\\.com/(\\w+)/.*")
private String author;

@ExtractBy("//div[@id='readme']/tidyText()")
private String readme;

public static void main(String[] args) {

// OOSpider.create(Site.me().setSleepTime(1000) // , new ConsolePageModelPipeline(), GithubRepo.class) // .addUrl("https://github.com/code4craft").thread(5).run(); Spider.create(new GithubRepoPageProcessor()).setDownloader(new HttpClientDownloader()).addUrl("https://github.com/code4craft").thread(5).run(); // OOSpider.create(new GithubRepoPageProcessor()).setDownloader(new HttpClientDownloader()).addUrl("https://github.com/code4craft").thread(5).run(); } }`