keeps / dbptk-ui

DBPTK base UI for both Desktop and Enterprise
https://database-preservation.com
GNU Lesser General Public License v3.0
23 stars 9 forks source link

Environment variable to disable Tika AutoDetectParser characters limit #372

Closed hmiguim closed 6 months ago

hmiguim commented 8 months ago

Problem:

2023-06-05 10:03:47,969 [http-nio-8080-exec-3] ERROR c.d.c.t.ToolkitStructure2ViewerStructure - Could not calculate mimeType for special extensions in the cell: [record167.bin]
org.apache.tika.sax.WriteOutContentHandler$WriteLimitReachedException: Your document contained more than 100000 characters, and so your requested limit has been reached. To receive the full text of the document, increase your limit. (Text up to the limit is however available).
at org.apache.tika.sax.WriteOutContentHandler.characters(WriteOutContentHandler.java:141)
at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:146)
at org.apache.tika.sax.xpath.MatchingContentHandler.characters(MatchingContentHandler.java:85)
at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:146)
at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:146)
at org.apache.tika.sax.SecureContentHandler.characters(SecureContentHandler.java:270)
at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:146)
at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:146)
at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:146)
at org.apache.tika.sax.SafeContentHandler.access$001(SafeContentHandler.java:47)
at org.apache.tika.sax.SafeContentHandler$1.write(SafeContentHandler.java:83)
at org.apache.tika.sax.SafeContentHandler.filter(SafeContentHandler.java:141)
at org.apache.tika.sax.SafeContentHandler.characters(SafeContentHandler.java:288)
at org.apache.tika.sax.XHTMLContentHandler.characters(XHTMLContentHandler.java:284)
at org.apache.tika.sax.XHTMLContentHandler.characters(XHTMLContentHandler.java:311)
at org.apache.tika.parser.microsoft.TextCell.render(TextCell.java:34)
at org.apache.tika.parser.microsoft.ExcelExtractor$TikaHSSFListener.processSheet(ExcelExtractor.java:562)
at org.apache.tika.parser.microsoft.ExcelExtractor$TikaHSSFListener.internalProcessRecord(ExcelExtractor.java:372)
at org.apache.tika.parser.microsoft.ExcelExtractor$TikaHSSFListener.processRecord(ExcelExtractor.java:323)
at org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener.processRecord(FormatTrackingHSSFListener.java:85)
at org.apache.poi.hssf.eventusermodel.HSSFRequest.processRecord(HSSFRequest.java:112)
at org.apache.poi.hssf.eventusermodel.HSSFEventFactory.genericProcessEvents(HSSFEventFactory.java:147)
at org.apache.poi.hssf.eventusermodel.HSSFEventFactory.processEvents(HSSFEventFactory.java:106)
at org.apache.tika.parser.microsoft.ExcelExtractor$TikaHSSFListener.processFile(ExcelExtractor.java:299)
at org.apache.tika.parser.microsoft.ExcelExtractor.parse(ExcelExtractor.java:151)
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:194)
at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:161)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143)
at com.databasepreservation.common.transformers.ToolkitStructure2ViewerStructure.detectMimeType(ToolkitStructure2ViewerStructure.java:925)
at com.databasepreservation.common.transformers.ToolkitStructure2ViewerStructure.getCell(ToolkitStructure2ViewerStructure.java:850)
at com.databasepreservation.common.transformers.ToolkitStructure2ViewerStructure.getCells(ToolkitStructure2ViewerStructure.java:781)
at com.databasepreservation.common.transformers.ToolkitStructure2ViewerStructure.getRow(ToolkitStructure2ViewerStructure.java:762)
at com.databasepreservation.modules.viewer.DbvtkExportModule.handleDataRow(DbvtkExportModule.java:135)
at com.databasepreservation.model.modules.filters.IdentityFilter.handleDataRow(IdentityFilter.java:88)
at com.databasepreservation.model.modules.filters.ObservableFilter.handleDataRow(ObservableFilter.java:123)
at com.databasepreservation.modules.siard.in.content.SIARD2ContentImportStrategy.endElement(SIARD2ContentImportStrategy.java:383)
at org.apache.xerces.parsers.AbstractSAXParser.endElement(Unknown Source)
at org.apache.xerces.impl.xs.XMLSchemaValidator.endElement(Unknown Source)
at org.apache.xerces.impl.XMLNSDocumentScannerImpl.scanEndElement(Unknown Source)
at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl$FragmentContentDispatcher.dispatch(Unknown Source)
at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source)
at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
at org.apache.xerces.parsers.XMLParser.parse(Unknown Source)
at org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source)
at org.apache.xerces.jaxp.SAXParserImpl$JAXPSAXParser.parse(Unknown Source)
at com.databasepreservation.modules.siard.in.content.SIARD2ContentImportStrategy.importContent(SIARD2ContentImportStrategy.java:188)
at com.databasepreservation.modules.siard.in.input.SIARDImportDefault.migrateDatabaseTo(SIARDImportDefault.java:64)
at com.databasepreservation.DatabaseMigration.migrate(DatabaseMigration.java:123)
at com.databasepreservation.common.server.controller.SIARDController.convertSIARDtoSolr(SIARDController.java:713)
at com.databasepreservation.common.server.controller.SIARDController.loadFromLocal(SIARDController.java:676)
at com.databasepreservation.common.api.v1.CollectionResource.createCollection(CollectionResource.java:197)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.glassfish.jersey.server.model.internal.ResourceMethodInvocationHandlerFactory.lambda$static$0(ResourceMethodInvocationHandlerFactory.java:52)
at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher$1.run(AbstractJavaResourceMethodDispatcher.java:124)
at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.invoke(AbstractJavaResourceMethodDispatcher.java:167)
at org.glassfish.jersey.server.model.internal.JavaResourceMethodDispatcherProvider$TypeOutInvoker.doDispatch(JavaResourceMethodDispatcherProvider.java:219)
at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.dispatch(AbstractJavaResourceMethodDispatcher.java:79)
at org.glassfish.jersey.server.model.ResourceMethodInvoker.invoke(ResourceMethodInvoker.java:475)
at org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:397)
at org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:81)
at org.glassfish.jersey.server.ServerRuntime$1.run(ServerRuntime.java:255)
at org.glassfish.jersey.internal.Errors$1.call(Errors.java:248)
at org.glassfish.jersey.internal.Errors$1.call(Errors.java:244)
at org.glassfish.jersey.internal.Errors.process(Errors.java:292)
at org.glassfish.jersey.internal.Errors.process(Errors.java:274)
at org.glassfish.jersey.internal.Errors.process(Errors.java:244)
at org.glassfish.jersey.process.internal.RequestScope.runInScope(RequestScope.java:265)
at org.glassfish.jersey.server.ServerRuntime.process(ServerRuntime.java:234)
at org.glassfish.jersey.server.ApplicationHandler.handle(ApplicationHandler.java:684)
at org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:394)
at org.glassfish.jersey.servlet.ServletContainer.serviceImpl(ServletContainer.java:386)
at org.glassfish.jersey.servlet.ServletContainer.doFilter(ServletContainer.java:561)
at org.glassfish.jersey.servlet.ServletContainer.doFilter(ServletContainer.java:502)
at org.glassfish.jersey.servlet.ServletContainer.doFilter(ServletContainer.java:439)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:189)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:162)
at org.springframework.web.filter.RequestContextFilter.doFilterInternal(RequestContextFilter.java:100)
at org.springframework.web.filter.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:117)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:189)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:162)
at org.springframework.boot.actuate.metrics.web.servlet.WebMvcMetricsFilter.doFilterInternal(WebMvcMetricsFilter.java:96)
at org.springframework.web.filter.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:117)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:189)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:162)
at org.springframework.web.filter.CharacterEncodingFilter.doFilterInternal(CharacterEncodingFilter.java:201)
at org.springframework.web.filter.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:117)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:189)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:162)
at org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:197)
at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:97)
at org.apache.catalina.authenticator.AuthenticatorBase.invoke(AuthenticatorBase.java:541)
at org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:135)
at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:92)
at org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:78)
at org.apache.catalina.valves.RemoteIpValve.invoke(RemoteIpValve.java:769)
at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:360)
at org.apache.coyote.http11.Http11Processor.service(Http11Processor.java:399)
at org.apache.coyote.AbstractProcessorLight.process(AbstractProcessorLight.java:65)
at org.apache.coyote.AbstractProtocol$ConnectionHandler.process(AbstractProtocol.java:890)
at org.apache.tomcat.util.net.NioEndpoint$SocketProcessor.doRun(NioEndpoint.java:1743)
at org.apache.tomcat.util.net.SocketProcessorBase.run(SocketProcessorBase.java:49)
at org.apache.tomcat.util.threads.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1191)
at org.apache.tomcat.util.threads.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:659)
at org.apache.tomcat.util.threads.TaskThread$WrappingRunnable.run(TaskThread.java:61)
at java.lang.Thread.run(Thread.java:750)
2023-06-05 10:03:48,713 [http-nio-8080-exec-3] ERROR c.d.c.t.ToolkitStructure2ViewerStructure - Could not calculate mimeType for special extensions in the cell: [record193.bin]

Solution

Create an environment variable to disable the character limit. By default it'll have the 100K character limit. If set to true the character limit will be disabled.