platonai / exotic-amazon

A complete solution to crawl amazon at scale completely and accurately.
143 stars 46 forks source link

ERROR a.p.pulsar.crawl.parse.PageParser - java.lang.StackOverflowError #4

Closed platonai closed 1 year ago

platonai commented 1 year ago

02:14:41.639 [r-worker-3] ERROR a.p.pulsar.crawl.parse.PageParser - java.lang.StackOverflowError at ch.qos.logback.classic.spi.LoggingEvent.<init>(LoggingEvent.java:119) at ch.qos.logback.classic.Logger.buildLoggingEventAndAppend(Logger.java:419) at ch.qos.logback.classic.Logger.filterAndLog_0_Or3Plus(Logger.java:383) at ch.qos.logback.classic.Logger.warn(Logger.java:692) at ai.platon.pulsar.crawl.parse.PageParser.doParse(PageParser.kt:160) at ai.platon.pulsar.crawl.parse.PageParser.parse(PageParser.kt:107) at ai.platon.pulsar.crawl.component.ParseComponent.parse(ParseComponent.kt:54) at ai.platon.pulsar.crawl.component.ParseComponent.parse$default(ParseComponent.kt:52) at ai.platon.pulsar.context.support.AbstractPulsarContext.parse(AbstractPulsarContext.kt:413) at ai.platon.pulsar.session.AbstractPulsarSession.parse0(AbstractPulsarSession.kt:555) at ai.platon.pulsar.session.AbstractPulsarSession.parse(AbstractPulsarSession.kt:386) at ai.platon.scent.parse.html.AbstractSQLExtractor.doFilter(AbstractSQLExtractor.kt:177) at ai.platon.pulsar.crawl.parse.AbstractParseFilter.filter(AbstractParseFilter.kt:58) at ai.platon.pulsar.crawl.parse.ParseFilters.filter(ParseFilters.kt:67) at ai.platon.pulsar.parse.tika.TikaParser.parse(TikaParser.kt:136) at ai.platon.pulsar.crawl.parse.PageParser$runParser$1$1$deferred$1.invokeSuspend(PageParser.kt:223) at kotlin.coroutines.jvm.internal.BaseContinuationImpl.resumeWith(ContinuationImpl.kt:33) at kotlinx.coroutines.DispatchedTask.run(DispatchedTask.kt:106) at kotlinx.coroutines.EventLoopImplBase.processNextEvent(EventLoop.common.kt:274) at kotlinx.coroutines.BlockingCoroutine.joinBlocking(Builders.kt:85) at kotlinx.coroutines.BuildersKt__BuildersKt.runBlocking(Builders.kt:59) at kotlinx.coroutines.BuildersKt.runBlocking(Unknown Source) at kotlinx.coroutines.BuildersKt__BuildersKt.runBlocking$default(Builders.kt:38) at kotlinx.coroutines.BuildersKt.runBlocking$default(Unknown Source) at 
ai.platon.pulsar.crawl.parse.PageParser.runParser(PageParser.kt:221) at ai.platon.pulsar.crawl.parse.PageParser.applyParsers(PageParser.kt:202) at ai.platon.pulsar.crawl.parse.PageParser.doParse(PageParser.kt:154) at ai.platon.pulsar.crawl.parse.PageParser.parse(PageParser.kt:107) at ai.platon.pulsar.crawl.component.ParseComponent.parse(ParseComponent.kt:54) at ai.platon.pulsar.crawl.component.ParseComponent.parse$default(ParseComponent.kt:52) at ai.platon.pulsar.context.support.AbstractPulsarContext.parse(AbstractPulsarContext.kt:413) at ai.platon.pulsar.session.AbstractPulsarSession.parse0(AbstractPulsarSession.kt:555) at ai.platon.pulsar.session.AbstractPulsarSession.parse(AbstractPulsarSession.kt:386) at ai.platon.scent.parse.html.AbstractSQLExtractor.doFilter(AbstractSQLExtractor.kt:177) at ai.platon.pulsar.crawl.parse.AbstractParseFilter.filter(AbstractParseFilter.kt:58) at ai.platon.pulsar.crawl.parse.ParseFilters.filter(ParseFilters.kt:67) at ai.platon.pulsar.parse.tika.TikaParser.parse(TikaParser.kt:136) at ai.platon.pulsar.crawl.parse.PageParser$runParser$1$1$deferred$1.invokeSuspend(PageParser.kt:223) at kotlin.coroutines.jvm.internal.BaseContinuationImpl.resumeWith(ContinuationImpl.kt:33) at kotlinx.coroutines.DispatchedTask.run(DispatchedTask.kt:106) at kotlinx.coroutines.EventLoopImplBase.processNextEvent(EventLoop.common.kt:274) at kotlinx.coroutines.BlockingCoroutine.joinBlocking(Builders.kt:85) at kotlinx.coroutines.BuildersKt__BuildersKt.runBlocking(Builders.kt:59) at kotlinx.coroutines.BuildersKt.runBlocking(Unknown Source) at kotlinx.coroutines.BuildersKt__BuildersKt.runBlocking$default(Builders.kt:38) at kotlinx.coroutines.BuildersKt.runBlocking$default(Unknown Source) at ai.platon.pulsar.crawl.parse.PageParser.runParser(PageParser.kt:221) at ai.platon.pulsar.crawl.parse.PageParser.applyParsers(PageParser.kt:202) at ai.platon.pulsar.crawl.parse.PageParser.doParse(PageParser.kt:154) at ai.platon.pulsar.crawl.parse.PageParser.parse(PageParser.kt:107)

platonai commented 1 year ago

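Caused by infinite recursion: doFilter -> session.parse. When the parse result has no document, doFilter calls session.parse, which runs the parsers and their parse filters again and re-enters doFilter, until the stack overflows.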

/**
 * Abbreviated excerpt of the parse filter's `doFilter`; elided code is marked with dots.
 *
 * When `parseContext.parseResult.document` is null, the filter falls back to
 * `session.parse(page, noCache = true)`. According to the stack trace reported in this issue,
 * that call goes back through PageParser.parse -> ParseFilters.filter -> AbstractParseFilter.filter
 * and reaches this same `doFilter` again, so the fallback recurses until a StackOverflowError.
 */
override fun doFilter(parseContext: ParseContext): FilterResult {
    ....

    return try {
        // Recursion trigger: the fallback parse runs while this filter is itself executing inside a parse.
        val document = parseContext.parseResult.document ?: session.parse(page, noCache = true)    // CAUSED BY parse
        ...
    } catch (e: IllegalApplicationContextStateException) {
        ....
    }
}
platonai commented 1 year ago

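One more case: the same recursion, this time entered through the X-SQL query itself. doFilter -> extract -> executeQuery0 runs a query whose UDF DomFunctionTables.loadAndSelect calls session.loadDocument -> session.parse, which runs the parse filters and re-enters doFilter: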

17:55:46.436 [r-worker-4] WARN a.p.pulsar.crawl.parse.ParseFilters - Unexpected exception java.lang.StackOverflowError: null at java.base/java.util.HashMap.hash(HashMap.java:340) at java.base/java.util.HashMap.get(HashMap.java:553) at org.h2.expression.Function.getFunctionInfo(Function.java:518) at org.h2.expression.Function.getFunction(Function.java:534) at org.h2.command.Parser.readFunction(Parser.java:2750) at org.h2.command.Parser.readTerm(Parser.java:3102) at org.h2.command.Parser.readFactor(Parser.java:2587) at org.h2.command.Parser.readSum(Parser.java:2574) at org.h2.command.Parser.readConcat(Parser.java:2544) at org.h2.command.Parser.readCondition(Parser.java:2370) at org.h2.command.Parser.readAnd(Parser.java:2342) at org.h2.command.Parser.readExpression(Parser.java:2334) at org.h2.command.Parser.readJavaFunction(Parser.java:2708) at org.h2.command.Parser.readFunction(Parser.java:2756) at org.h2.command.Parser.readTerm(Parser.java:3102) at org.h2.command.Parser.readFactor(Parser.java:2587) at org.h2.command.Parser.readSum(Parser.java:2574) at org.h2.command.Parser.readConcat(Parser.java:2544) at org.h2.command.Parser.readCondition(Parser.java:2370) at org.h2.command.Parser.readAnd(Parser.java:2342) at org.h2.command.Parser.readExpression(Parser.java:2334) at org.h2.command.Parser.readJavaFunction(Parser.java:2708) at org.h2.command.Parser.readFunction(Parser.java:2756) at org.h2.command.Parser.readTerm(Parser.java:3102) at org.h2.command.Parser.readFactor(Parser.java:2587) at org.h2.command.Parser.readSum(Parser.java:2574) at org.h2.command.Parser.readConcat(Parser.java:2544) at org.h2.command.Parser.readCondition(Parser.java:2370) at org.h2.command.Parser.readAnd(Parser.java:2342) at org.h2.command.Parser.readExpression(Parser.java:2334) at org.h2.command.Parser.parseSelectSimpleSelectPart(Parser.java:2245) at org.h2.command.Parser.parseSelectSimple(Parser.java:2277) at org.h2.command.Parser.parseSelectSub(Parser.java:2133) at 
org.h2.command.Parser.parseSelectUnion(Parser.java:1946) at org.h2.command.Parser.parseSelect(Parser.java:1919) at org.h2.command.Parser.parsePrepared(Parser.java:463) at org.h2.command.Parser.parse(Parser.java:335) at org.h2.command.Parser.parse(Parser.java:307) at org.h2.command.Parser.prepareCommand(Parser.java:278) at org.h2.engine.Session.prepareLocal(Session.java:626) at org.h2.engine.Session.prepareCommand(Session.java:564) at org.h2.jdbc.JdbcConnection.prepareCommand(JdbcConnection.java:1247) at org.h2.jdbc.JdbcStatement.executeQuery(JdbcStatement.java:78) at ai.platon.scent.parse.html.AbstractSQLExtractor.executeQuery0(AbstractSQLExtractor.kt:359) at ai.platon.scent.parse.html.AbstractSQLExtractor.extractWithConnection(AbstractSQLExtractor.kt:272) at ai.platon.scent.parse.html.AbstractSinkAwareSQLExtractor.extractWithConnection(AbstractSinkAwareSQLExtractor.kt:59) at ai.platon.scent.parse.html.AbstractSQLExtractor.extractWithConnection(AbstractSQLExtractor.kt:266) at ai.platon.scent.parse.html.AbstractSQLExtractor.extract(AbstractSQLExtractor.kt:237) at ai.platon.scent.parse.html.AbstractSQLExtractor.doFilter(AbstractSQLExtractor.kt:182) at ai.platon.pulsar.crawl.parse.AbstractParseFilter.filter(AbstractParseFilter.kt:58) at ai.platon.pulsar.crawl.parse.ParseFilters.filter(ParseFilters.kt:67) at ai.platon.pulsar.crawl.parse.html.PrimerHtmlParser.parse(PrimerHtmlParser.kt:74) at ai.platon.pulsar.crawl.parse.PageParser.applyParsers(PageParser.kt:202) at ai.platon.pulsar.crawl.parse.PageParser.doParse(PageParser.kt:154) at ai.platon.pulsar.crawl.parse.PageParser.parse(PageParser.kt:107) at ai.platon.pulsar.crawl.component.ParseComponent.parse(ParseComponent.kt:54) at ai.platon.pulsar.crawl.component.ParseComponent.parse$default(ParseComponent.kt:52) at ai.platon.pulsar.context.support.AbstractPulsarContext.parse(AbstractPulsarContext.kt:413) at ai.platon.pulsar.session.AbstractPulsarSession.parse0(AbstractPulsarSession.kt:564) at 
ai.platon.pulsar.session.AbstractPulsarSession.parse(AbstractPulsarSession.kt:386) at ai.platon.pulsar.session.PulsarSession$DefaultImpls.parse$default(PulsarSession.kt:355) at ai.platon.pulsar.session.AbstractPulsarSession.loadDocument(AbstractPulsarSession.kt:399) at ai.platon.pulsar.session.PulsarSession$DefaultImpls.loadDocument$default(PulsarSession.kt:363) at ai.platon.pulsar.ql.h2.udfs.DomFunctionTables.loadAndSelect(DomFunctionTables.kt:66) at ai.platon.pulsar.ql.h2.udfs.DomFunctionTables.loadAndSelect$default(DomFunctionTables.kt:58) at ai.platon.pulsar.ql.h2.udfs.DomFunctionTables.loadAndSelect(DomFunctionTables.kt) at jdk.internal.reflect.GeneratedMethodAccessor389.invoke(Unknown Source) at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.base/java.lang.reflect.Method.invoke(Method.java:566) at org.h2.engine.FunctionAlias$JavaMethod.getValue(FunctionAlias.java:456) at org.h2.expression.JavaFunction.getValue(JavaFunction.java:38) at org.h2.table.FunctionTable.getValueResultSet(FunctionTable.java:218) at org.h2.table.FunctionTable.getResult(FunctionTable.java:189) at org.h2.index.FunctionIndex.find(FunctionIndex.java:55) at org.h2.index.BaseIndex.find(BaseIndex.java:130) at org.h2.index.IndexCursor.find(IndexCursor.java:176) at org.h2.table.TableFilter.next(TableFilter.java:471) at org.h2.command.dml.Select$LazyResultQueryFlat.fetchNextRow(Select.java:1453) at org.h2.result.LazyResult.hasNext(LazyResult.java:79) at org.h2.result.LazyResult.next(LazyResult.java:59) at org.h2.command.dml.Select.queryFlat(Select.java:527) at org.h2.command.dml.Select.queryWithoutCache(Select.java:633) at org.h2.command.dml.Query.queryWithoutCacheLazyCheck(Query.java:114) at org.h2.command.dml.Query.query(Query.java:371) at org.h2.command.dml.Query.query(Query.java:333) at org.h2.command.CommandContainer.query(CommandContainer.java:114) at org.h2.command.Command.executeQuery(Command.java:202) at 
org.h2.jdbc.JdbcStatement.executeQuery(JdbcStatement.java:85) at ai.platon.scent.parse.html.AbstractSQLExtractor.executeQuery0(AbstractSQLExtractor.kt:359) at ai.platon.scent.parse.html.AbstractSQLExtractor.extractWithConnection(AbstractSQLExtractor.kt:272) at ai.platon.scent.parse.html.AbstractSinkAwareSQLExtractor.extractWithConnection(AbstractSinkAwareSQLExtractor.kt:59) at ai.platon.scent.parse.html.AbstractSQLExtractor.extractWithConnection(AbstractSQLExtractor.kt:266) at ai.platon.scent.parse.html.AbstractSQLExtractor.extract(AbstractSQLExtractor.kt:237) at ai.platon.scent.parse.html.AbstractSQLExtractor.doFilter(AbstractSQLExtractor.kt:182) at ai.platon.pulsar.crawl.parse.AbstractParseFilter.filter(AbstractParseFilter.kt:58) at ai.platon.pulsar.crawl.parse.ParseFilters.filter(ParseFilters.kt:67) at ai.platon.pulsar.parse.tika.TikaParser.parse(TikaParser.kt:136) at ai.platon.pulsar.crawl.parse.PageParser$runParser$1$1$deferred$1.invokeSuspend(PageParser.kt:223)