Unable to parse certain Office documents using the mapper-attachments plugin
Steps to reproduce:
1. Index Office documents/PDFs
2.
3.
Provide logs (if relevant):
[2016-08-31 08:59:56,296][WARN ][org.apache.tika.parser.microsoft.AbstractPOIFSExtractor] Ignoring unexpected exception while parsing summary entry SummaryInformation
java.io.UnsupportedEncodingException: Codepage number may not be 0
at org.apache.poi.util.CodePageUtil.codepageToEncoding(CodePageUtil.java:277)
at org.apache.poi.util.CodePageUtil.codepageToEncoding(CodePageUtil.java:255)
at org.apache.poi.util.CodePageUtil.getStringFromCodePage(CodePageUtil.java:233)
at org.apache.poi.util.CodePageUtil.getStringFromCodePage(CodePageUtil.java:221)
at org.apache.poi.hpsf.CodePageString.getJavaValue(CodePageString.java:70)
at org.apache.poi.hpsf.VariantSupport.read(VariantSupport.java:210)
at org.apache.poi.hpsf.Property.<init>(Property.java:163)
at org.apache.poi.hpsf.Section.<init>(Section.java:277)
at org.apache.poi.hpsf.PropertySet.init(PropertySet.java:451)
at org.apache.poi.hpsf.PropertySet.<init>(PropertySet.java:246)
at org.apache.tika.parser.microsoft.SummaryExtractor.parseSummaryEntryIfExists(SummaryExtractor.java:83)
at org.apache.tika.parser.microsoft.SummaryExtractor.parseSummaries(SummaryExtractor.java:73)
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:126)
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:117)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
at org.apache.tika.Tika.parseToString(Tika.java:537)
at org.elasticsearch.mapper.attachments.TikaImpl$1.run(TikaImpl.java:94)
at org.elasticsearch.mapper.attachments.TikaImpl$1.run(TikaImpl.java:91)
at java.security.AccessController.doPrivileged(Native Method)
at org.elasticsearch.mapper.attachments.TikaImpl.parse(TikaImpl.java:91)
at org.elasticsearch.mapper.attachments.AttachmentMapper.parse(AttachmentMapper.java:481)
at org.elasticsearch.index.mapper.DocumentParser.parseObjectOrField(DocumentParser.java:309)
at org.elasticsearch.index.mapper.DocumentParser.parseValue(DocumentParser.java:436)
at org.elasticsearch.index.mapper.DocumentParser.parseObject(DocumentParser.java:262)
at org.elasticsearch.index.mapper.DocumentParser.parseDocument(DocumentParser.java:122)
at org.elasticsearch.index.mapper.DocumentMapper.parse(DocumentMapper.java:309)
at org.elasticsearch.index.shard.IndexShard.prepareIndex(IndexShard.java:580)
at org.elasticsearch.index.shard.IndexShard.prepareIndexOnPrimary(IndexShard.java:559)
at org.elasticsearch.action.index.TransportIndexAction.prepareIndexOperationOnPrimary(TransportIndexAction.java:212)
at org.elasticsearch.action.index.TransportIndexAction.executeIndexRequestOnPrimary(TransportIndexAction.java:224)
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardIndexOperation(TransportShardBulkAction.java:326)
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:119)
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:68)
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryPhase.doRun(TransportReplicationAction.java:639)
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:279)
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:271)
at org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:75)
at org.elasticsearch.transport.TransportService$4.doRun(TransportService.java:376)
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Log2:
[2016-08-31 08:56:17,258][ERROR][org.apache.pdfbox.filter.FlateFilter] FlateFilter: stop reading corrupt stream due to a DataFormatException
[2016-08-31 08:56:54,058][ERROR][org.apache.pdfbox.filter.FlateFilter] FlateFilter: stop reading corrupt stream due to a DataFormatException
[2016-08-31 08:56:54,106][ERROR][org.apache.pdfbox.filter.FlateFilter] FlateFilter: stop reading corrupt stream due to a DataFormatException
[2016-08-31 08:56:54,248][ERROR][org.apache.pdfbox.filter.FlateFilter] FlateFilter: stop reading corrupt stream due to a DataFormatException
[2016-08-31 08:56:54,265][ERROR][org.apache.pdfbox.filter.FlateFilter] FlateFilter: stop reading corrupt stream due to a DataFormatException
Elasticsearch version: ES 2.3.2
Plugins installed: [mapper-attachments]
JVM version: JDK 1.8
OS version: Linux / Windows 7
Unable to parse certain Office documents using the mapper-attachments plugin
Steps to reproduce: 1. Index Office documents/PDFs 2. 3.
Provide logs (if relevant):
[2016-08-31 08:59:56,296][WARN ][org.apache.tika.parser.microsoft.AbstractPOIFSExtractor] Ignoring unexpected exception while parsing summary entry SummaryInformation java.io.UnsupportedEncodingException: Codepage number may not be 0 at org.apache.poi.util.CodePageUtil.codepageToEncoding(CodePageUtil.java:277) at org.apache.poi.util.CodePageUtil.codepageToEncoding(CodePageUtil.java:255) at org.apache.poi.util.CodePageUtil.getStringFromCodePage(CodePageUtil.java:233) at org.apache.poi.util.CodePageUtil.getStringFromCodePage(CodePageUtil.java:221) at org.apache.poi.hpsf.CodePageString.getJavaValue(CodePageString.java:70) at org.apache.poi.hpsf.VariantSupport.read(VariantSupport.java:210) at org.apache.poi.hpsf.Property.<init>(Property.java:163)
at org.apache.poi.hpsf.Section.<init>(Section.java:277)
at org.apache.poi.hpsf.PropertySet.init(PropertySet.java:451)
at org.apache.poi.hpsf.PropertySet.<init>(PropertySet.java:246)
at org.apache.tika.parser.microsoft.SummaryExtractor.parseSummaryEntryIfExists(SummaryExtractor.java:83)
at org.apache.tika.parser.microsoft.SummaryExtractor.parseSummaries(SummaryExtractor.java:73)
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:126)
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:117)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
at org.apache.tika.Tika.parseToString(Tika.java:537)
at org.elasticsearch.mapper.attachments.TikaImpl$1.run(TikaImpl.java:94)
at org.elasticsearch.mapper.attachments.TikaImpl$1.run(TikaImpl.java:91)
at java.security.AccessController.doPrivileged(Native Method)
at org.elasticsearch.mapper.attachments.TikaImpl.parse(TikaImpl.java:91)
at org.elasticsearch.mapper.attachments.AttachmentMapper.parse(AttachmentMapper.java:481)
at org.elasticsearch.index.mapper.DocumentParser.parseObjectOrField(DocumentParser.java:309)
at org.elasticsearch.index.mapper.DocumentParser.parseValue(DocumentParser.java:436)
at org.elasticsearch.index.mapper.DocumentParser.parseObject(DocumentParser.java:262)
at org.elasticsearch.index.mapper.DocumentParser.parseDocument(DocumentParser.java:122)
at org.elasticsearch.index.mapper.DocumentMapper.parse(DocumentMapper.java:309)
at org.elasticsearch.index.shard.IndexShard.prepareIndex(IndexShard.java:580)
at org.elasticsearch.index.shard.IndexShard.prepareIndexOnPrimary(IndexShard.java:559)
at org.elasticsearch.action.index.TransportIndexAction.prepareIndexOperationOnPrimary(TransportIndexAction.java:212)
at org.elasticsearch.action.index.TransportIndexAction.executeIndexRequestOnPrimary(TransportIndexAction.java:224)
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardIndexOperation(TransportShardBulkAction.java:326)
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:119)
at org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:68)
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryPhase.doRun(TransportReplicationAction.java:639)
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:279)
at org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler.messageReceived(TransportReplicationAction.java:271)
at org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:75)
at org.elasticsearch.transport.TransportService$4.doRun(TransportService.java:376)
at org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Log2:
[2016-08-31 08:56:17,258][ERROR][org.apache.pdfbox.filter.FlateFilter] FlateFilter: stop reading corrupt stream due to a DataFormatException [2016-08-31 08:56:54,058][ERROR][org.apache.pdfbox.filter.FlateFilter] FlateFilter: stop reading corrupt stream due to a DataFormatException [2016-08-31 08:56:54,106][ERROR][org.apache.pdfbox.filter.FlateFilter] FlateFilter: stop reading corrupt stream due to a DataFormatException [2016-08-31 08:56:54,248][ERROR][org.apache.pdfbox.filter.FlateFilter] FlateFilter: stop reading corrupt stream due to a DataFormatException [2016-08-31 08:56:54,265][ERROR][org.apache.pdfbox.filter.FlateFilter] FlateFilter: stop reading corrupt stream due to a DataFormatException