Norconex / crawlers

Norconex Crawlers (or spiders) are flexible web and filesystem crawlers for collecting, parsing, and manipulating data from the web or filesystem to various data repositories such as search engines.
https://opensource.norconex.com/crawlers
Apache License 2.0
183 stars 68 forks source link

com.norconex.collector.core.CollectorException: javax.net.ssl.SSLHandshakeException #592

Closed HappyCustomers closed 4 years ago

HappyCustomers commented 5 years ago

I am getting the following SSL exception errors on a few websites. I am able to open these websites in a browser

1. (com.norconex.collector.core.CollectorException: javax.net.ssl.SSLHandshakeException: Remote host closed connection during handshake)
(Remote host closed connection during handshake)
javax.net.ssl.SSLHandshakeException: Remote host closed connection during handshake
    at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:992)
    at sun.security.ssl.SSLSocketImpl.performInitialHandshake(SSLSocketImpl.java:1375)
    at sun.security.ssl.SSLSocketImpl.startHandshake(SSLSocketImpl.java:1403)
    at sun.security.ssl.SSLSocketImpl.startHandshake(SSLSocketImpl.java:1387)
    at org.apache.http.conn.ssl.SSLConnectionSocketFactory.createLayeredSocket(SSLConnectionSocketFactory.java:396)
    at org.apache.http.conn.ssl.SSLConnectionSocketFactory.connectSocket(SSLConnectionSocketFactory.java:355)
    at org.apache.http.impl.conn.DefaultHttpClientConnectionOperator.connect(DefaultHttpClientConnectionOperator.java:142)
    at org.apache.http.impl.conn.PoolingHttpClientConnectionManager.connect(PoolingHttpClientConnectionManager.java:359)
    at org.apache.http.impl.execchain.MainClientExec.establishRoute(MainClientExec.java:381)
    at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:237)
    at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:185)
    at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:89)
    at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:111)
    at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185)
    at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83)
    at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56)
    at com.norconex.collector.http.fetch.impl.GenericDocumentFetcher.fetchDocument(GenericDocumentFetcher.java:219)
    at com.norconex.collector.http.pipeline.importer.DocumentFetcherStage.executeStage(DocumentFetcherStage.java:42)
    at com.norconex.collector.http.pipeline.importer.AbstractImporterStage.execute(AbstractImporterStage.java:31)
    at com.norconex.collector.http.pipeline.importer.AbstractImporterStage.execute(AbstractImporterStage.java:24)
    at com.norconex.commons.lang.pipeline.Pipeline.execute(Pipeline.java:91)
    at com.norconex.collector.http.crawler.HttpCrawler.executeImporterPipeline(HttpCrawler.java:361)
    at com.norconex.collector.core.crawler.AbstractCrawler.processNextQueuedCrawlData(AbstractCrawler.java:538)
    at com.norconex.collector.core.crawler.AbstractCrawler.processNextReference(AbstractCrawler.java:419)
    at com.norconex.collector.core.crawler.AbstractCrawler$ProcessReferencesRunnable.run(AbstractCrawler.java:820)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.EOFException: SSL peer shut down incorrectly
    at sun.security.ssl.InputRecord.read(InputRecord.java:505)
    at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:973)
2.(com.norconex.collector.core.CollectorException: javax.net.ssl.SSLHandshakeException: Received fatal alert: handshake_failure)
(Received fatal alert: handshake_failure)
javax.net.ssl.SSLHandshakeException: Received fatal alert: handshake_failure
    at sun.security.ssl.Alerts.getSSLException(Alerts.java:192)
    at sun.security.ssl.Alerts.getSSLException(Alerts.java:154)
    at sun.security.ssl.SSLSocketImpl.recvAlert(SSLSocketImpl.java:2023)
    at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:1125)
    at sun.security.ssl.SSLSocketImpl.performInitialHandshake(SSLSocketImpl.java:1375)
    at sun.security.ssl.SSLSocketImpl.startHandshake(SSLSocketImpl.java:1403)
    at sun.security.ssl.SSLSocketImpl.startHandshake(SSLSocketImpl.java:1387)
    at org.apache.http.conn.ssl.SSLConnectionSocketFactory.createLayeredSocket(SSLConnectionSocketFactory.java:396)
    at org.apache.http.conn.ssl.SSLConnectionSocketFactory.connectSocket(SSLConnectionSocketFactory.java:355)
    at org.apache.http.impl.conn.DefaultHttpClientConnectionOperator.connect(DefaultHttpClientConnectionOperator.java:142)
    at org.apache.http.impl.conn.PoolingHttpClientConnectionManager.connect(PoolingHttpClientConnectionManager.java:359)
    at org.apache.http.impl.execchain.MainClientExec.establishRoute(MainClientExec.java:381)
    at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:237)
    at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:185)
    at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:89)
    at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:111)
    at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185)
    at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83)
    at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56)
    at com.norconex.collector.http.fetch.impl.GenericDocumentFetcher.fetchDocument(GenericDocumentFetcher.java:219)
    at com.norconex.collector.http.pipeline.importer.DocumentFetcherStage.executeStage(DocumentFetcherStage.java:42)
    at com.norconex.collector.http.pipeline.importer.AbstractImporterStage.execute(AbstractImporterStage.java:31)
    at com.norconex.collector.http.pipeline.importer.AbstractImporterStage.execute(AbstractImporterStage.java:24)
    at com.norconex.commons.lang.pipeline.Pipeline.execute(Pipeline.java:91)
    at com.norconex.collector.http.crawler.HttpCrawler.executeImporterPipeline(HttpCrawler.java:361)
    at com.norconex.collector.core.crawler.AbstractCrawler.processNextQueuedCrawlData(AbstractCrawler.java:538)
    at com.norconex.collector.core.crawler.AbstractCrawler.processNextReference(AbstractCrawler.java:419)
    at com.norconex.collector.core.crawler.AbstractCrawler$ProcessReferencesRunnable.run(AbstractCrawler.java:820)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
3. (com.norconex.collector.core.CollectorException: javax.net.ssl.SSLException: Received fatal alert: internal_error)

I have enabled the following configurations

<httpClientFactory class="com.norconex.collector.http.client.impl.GenericHttpClientFactory">
            <trustAllSSLCertificates>true</trustAllSSLCertificates> 
             <cookiesDisabled>false</cookiesDisabled>   
        </httpClientFactory>
<robotsTxt ignore="true"/>
        <robotsMeta ignore="true" />
        <sitemapResolverFactory ignore="true" />

I am using the

norconex-collector-http-2.8.2-SNAPSHOT

java version "1.8.0_111"

please advise and let me know if you need further information

shradhatx commented 5 years ago

Your firewall is stopping the request from your server. Can you curl the urls you are trying to crawl?

HappyCustomers commented 5 years ago

I am able to get the site using curl. I do not think there is any firewall issue

essiembre commented 5 years ago

@HappyCustomers, can you share the site you are attempting to crawl? If you cannot, one suggestion I can make is to check with a more recent version of Java (e.g. Java 8u211, or even Java 11) just to confirm or rule out whether it is an SSL crypto algorithm being used by the website that your version of Java does not support. This could help figure it out as well: https://java.com/en/jre-jdk-cryptoroadmap.html.

Another thing to try maybe is to install the SSL certificate in your Java keystore (you can find online a few tutorials for that).

essiembre commented 5 years ago

Sorry, I just realized you sent me sample URLs by email. Looking at it, I suspect the certificate authority is not trusted by your java installation, so I would try installing the certificate manually. You can find a similar issue with a resolution at #581.

HappyCustomers commented 5 years ago

Hi Pascal, we followed all the steps as mentioned in issue #581. We also imported the specific site certificates to the Java keystore. We are still unable to crawl the sites mentioned in the email.
we are not in the position to change the Java version as it is dependent on other components which are using Java. Please advise.

essiembre commented 5 years ago

Set <trustAllSSLCertificates> to false and modify the launch script to add -Djavax.net.debug=all to the java command. It will list the trusted certificates currently in your trust store. Do you see the one you added in there? Your issuer must be in there.

If it is not but you know you have added it, it may be that the JVM used to run the Collector is not pointing to the trust store where you have added it. You can add the following VM argument to the java command: -Djavax.net.ssl.trustStore=/path/to/your/trust/store Then your certificate issuer should be displayed when you run the collector. If it is still not displayed, it likely was not added properly.

HappyCustomers commented 5 years ago

Hi Essiembre, I tried your suggestions. Made the following changes in the config document <trustAllSSLCertificates>false</trustAllSSLCertificates>

In the bat file -Djavax.net.ssl.trustStore="C:\Program Files\Java\jre1.8.0_111\lib\security\cacerts"

With the above configuration I am able to extract the SSL websites which were failing earlier.

When I changed trustAllSSLCertificates to true, with the bat file having the cacerts path as below, I am getting the

(com.norconex.collector.core.CollectorException: javax.net.ssl.SSLHandshakeException: Received fatal alert: handshake_failure)

<trustAllSSLCertificates>true</trustAllSSLCertificates> -Djavax.net.ssl.trustStore="C:\Program Files\Java\jre1.8.0_111\lib\security\cacerts"

The question is: I have already extracted SSL websites with the trustAllSSLCertificates setting as true. I need to understand what the universal settings should be to extract all the SSL websites.

essiembre commented 5 years ago

I would like to have it universal too. :-) Apparently trusting all certificates used to be enough, but it no longer is with more recent versions of SSL or just certain crypto. I am open if anybody has suggestions on how best to tackle this in Java. I am now thinking the best may be to automate the collector to perform the auto-installation of certificates in its own store. I can make this a feature request if nobody has a better idea.

HappyCustomers commented 5 years ago

Hi Essiembre, Automating the collector to install certificates in its own store is a workable solution with a flag in the configuration for the user to select auto installation of certificate- true/false. Thank you once again for prompt and workable solutions. Regards

essiembre commented 4 years ago

Turns out HttpClient was not fully taking into account our own catch-all trust manager when trustAllSSLCertificates was true. Changed it to use Apache HttpClient TrustAllStrategy when creating an SSLContext. No longer reproducible since.

angelo337 commented 4 years ago

Hi there Pascal: I am trying to crawl a website, https://www.eltiempo.com/, and after following all steps from this case and reading the documentation on updating CA certs, I am still getting this type of error: "Received fatal alert: internal_error". I changed my configuration to:

            <httpClientFactory class="com.norconex.collector.http.client.impl.GenericHttpClientFactory">
                    <!-- Be warned: trusting all certificates is usually a bad idea. -->
                    <cookiesDisabled>false</cookiesDisabled>
                    <trustAllSSLCertificates>true</trustAllSSLCertificates>
                    <expectContinueEnabled>true</expectContinueEnabled>
            </httpClientFactory>

I also modified my collector-http.sh to:

java -Djavax.net.debug=ssl -Djavax.net.ssl.trustStore=/etc/ssl/certs -Dlog4j.configuration="file:./log4j.properties" -Dfile.encoding=UTF8 -cp "./lib/*:./classes" com.norconex.collector.http.HttpCollector "$@"

in order to debug ssl methods and error specific on SSL (https) and there is no more information besides internal error.

and, to collect more info, I changed my log4j.properties to:

Default loggers for the collector:
log4j.logger.com.norconex.collector.http=DEBUG
log4j.logger.com.norconex.collector.core=DEBUG
log4j.logger.com.norconex.importer=DEBUG
log4j.logger.com.norconex.committer=DEBUG
log4j.logger.CrawlerEvent.CRAWLER_STARTED=DEBUG
log4j.logger.CrawlerEvent.CRAWLER_RESUMED=DEBUG
log4j.logger.CrawlerEvent.CRAWLER_FINISHED=DEBUG
log4j.logger.CrawlerEvent.REJECTED_DUPLICATE=DEBUG
log4j.logger.CrawlerEvent.REJECTED_FILTER=DEBUG
log4j.logger.CrawlerEvent.REJECTED_UNMODIFIED=INFO
log4j.logger.CrawlerEvent.REJECTED_NOTFOUND=INFO
log4j.logger.CrawlerEvent.REJECTED_BAD_STATUS=DEBUG
log4j.logger.CrawlerEvent.REJECTED_IMPORT=DEBUG
log4j.logger.CrawlerEvent.REJECTED_ERROR=DEBUG
log4j.logger.CrawlerEvent.DOCUMENT_PREIMPORTED=INFO
log4j.logger.CrawlerEvent.DOCUMENT_POSTIMPORTED=INFO
log4j.logger.CrawlerEvent.DOCUMENT_COMMITTED_ADD=INFO
log4j.logger.CrawlerEvent.DOCUMENT_COMMITTED_REMOVED=INFO
log4j.logger.CrawlerEvent.DOCUMENT_IMPORTED=INFO
log4j.logger.CrawlerEvent.DOCUMENT_METADATA_FETCHED=INFO
log4j.logger.CrawlerEvent.DOCUMENT_FETCHED=INFO
log4j.logger.CrawlerEvent.DOCUMENT_SAVED=INFO

OS report: Linux user-EON17-SLX 4.4.0-166-generic #195-Ubuntu SMP Tue Oct 1 09:35:25 UTC 2019 x86_64 x86_64 x86_64 GNU/Linux

java version: openjdk version "9-internal" OpenJDK Runtime Environment (build 9-internal+0-2016-04-14-195246.buildd.src) OpenJDK 64-Bit Server VM (build 9-internal+0-2016-04-14-195246.buildd.src, mixed mode)

INFO  [GenericDocumentFetcher] Cannot fetch document: https://www.eltiempo.com/unidad-investigativa/alex-saab-carta-del-regimen-pidiendo-inmunidad-y-liberacion-515566 (Received fatal alert: internal_error)
javax.net.ssl.SSLException: Received fatal alert: internal_error
    at sun.security.ssl.Alerts.getSSLException(Alerts.java:208)
    at sun.security.ssl.Alerts.getSSLException(Alerts.java:154)
    at sun.security.ssl.SSLSocketImpl.recvAlert(SSLSocketImpl.java:2020)
    at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:1127)
    at sun.security.ssl.SSLSocketImpl.performInitialHandshake(SSLSocketImpl.java:1367)
    at sun.security.ssl.SSLSocketImpl.startHandshake(SSLSocketImpl.java:1395)
    at sun.security.ssl.SSLSocketImpl.startHandshake(SSLSocketImpl.java:1379)
    at org.apache.http.conn.ssl.SSLConnectionSocketFactory.createLayeredSocket(SSLConnectionSocketFactory.java:396)
    at org.apache.http.conn.ssl.SSLConnectionSocketFactory.connectSocket(SSLConnectionSocketFactory.java:355)
    at org.apache.http.impl.conn.DefaultHttpClientConnectionOperator.connect(DefaultHttpClientConnectionOperator.java:142)
    at org.apache.http.impl.conn.PoolingHttpClientConnectionManager.connect(PoolingHttpClientConnectionManager.java:359)
    at org.apache.http.impl.execchain.MainClientExec.establishRoute(MainClientExec.java:381)
    at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:237)
    at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:185)
    at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:89)
    at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:111)
    at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185)
    at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83)
    at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56)
    at com.norconex.collector.http.fetch.impl.GenericDocumentFetcher.fetchDocument(GenericDocumentFetcher.java:219)
    at com.norconex.collector.http.pipeline.importer.DocumentFetcherStage.executeStage(DocumentFetcherStage.java:42)
    at com.norconex.collector.http.pipeline.importer.AbstractImporterStage.execute(AbstractImporterStage.java:31)
    at com.norconex.collector.http.pipeline.importer.AbstractImporterStage.execute(AbstractImporterStage.java:24)
    at com.norconex.commons.lang.pipeline.Pipeline.execute(Pipeline.java:91)
    at com.norconex.collector.http.crawler.HttpCrawler.executeImporterPipeline(HttpCrawler.java:361)
    at com.norconex.collector.core.crawler.AbstractCrawler.processNextQueuedCrawlData(AbstractCrawler.java:538)
    at com.norconex.collector.core.crawler.AbstractCrawler.processNextReference(AbstractCrawler.java:419)
    at com.norconex.collector.core.crawler.AbstractCrawler$ProcessReferencesRunnable.run(AbstractCrawler.java:829)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

Could you please help me? thanks a lot Angelo

essiembre commented 4 years ago

I cannot reproduce. I was able to crawl the start URL and also https://www.eltiempo.com/unidad-investigativa/alex-saab-carta-del-regimen-pidiendo-inmunidad-y-liberacion-515566 without having to enable trustAllSSLCertificates. Are you still experiencing the same issue? If so, please share your config to reproduce.