I am trying to build a cube on Kylin (using the Spark engine) with a join between two huge tables, but the build fails with the error below. Can you help me out with that, please?
java.lang.OutOfMemoryError: Java heap space
at java.util.IdentityHashMap.resize(IdentityHashMap.java:471)
at java.util.IdentityHashMap.put(IdentityHashMap.java:440)
at org.apache.kylin.dict.TrieDictionaryBuilder.buildTrieBytes(TrieDictionaryBuilder.java:476)
at org.apache.kylin.dict.TrieDictionaryBuilder.build(TrieDictionaryBuilder.java:418)
at org.apache.kylin.dict.lookup.SnapshotTable.takeSnapshot(SnapshotTable.java:98)
at org.apache.kylin.dict.lookup.SnapshotManager.buildSnapshot(SnapshotManager.java:139)
at org.apache.kylin.cube.CubeManager.buildSnapshotTable(CubeManager.java:287)
at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:87)
at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:49)
at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(CreateDictionaryJob.java:66)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84)
at org.apache.kylin.engine.mr.common.HadoopShellExecutable.doWork(HadoopShellExecutable.java:62)
at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:125)
at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(DefaultChainedExecutable.java:64)
at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:125)
at org.apache.kylin.job.impl.threadpool.DefaultScheduler$JobRunner.run(DefaultScheduler.java:144)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
i am trying to build (spark)a cube on kylin with a join between two huge tables , can you help me out with that please java.lang.OutOfMemoryError: Java heap space at java.util.IdentityHashMap.resize(IdentityHashMap.java:471) at java.util.IdentityHashMap.put(IdentityHashMap.java:440) at org.apache.kylin.dict.TrieDictionaryBuilder.buildTrieBytes(TrieDictionaryBuilder.java:476) at org.apache.kylin.dict.TrieDictionaryBuilder.build(TrieDictionaryBuilder.java:418) at org.apache.kylin.dict.lookup.SnapshotTable.takeSnapshot(SnapshotTable.java:98) at org.apache.kylin.dict.lookup.SnapshotManager.buildSnapshot(SnapshotManager.java:139) at org.apache.kylin.cube.CubeManager.buildSnapshotTable(CubeManager.java:287) at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:87) at org.apache.kylin.cube.cli.DictionaryGeneratorCLI.processSegment(DictionaryGeneratorCLI.java:49) at org.apache.kylin.engine.mr.steps.CreateDictionaryJob.run(CreateDictionaryJob.java:66) at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70) at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:84) at org.apache.kylin.engine.mr.common.HadoopShellExecutable.doWork(HadoopShellExecutable.java:62) at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:125) at org.apache.kylin.job.execution.DefaultChainedExecutable.doWork(DefaultChainedExecutable.java:64) at org.apache.kylin.job.execution.AbstractExecutable.execute(AbstractExecutable.java:125) at org.apache.kylin.job.impl.threadpool.DefaultScheduler$JobRunner.run(DefaultScheduler.java:144) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) at java.lang.Thread.run(Thread.java:748)
This is my Spark config:
kylin.env.hadoop-conf-dir=/home/hadoop/apache-kylin-2.2.0-bin/hadoop-conf
# Estimate the RDD partition numbers
kylin.engine.spark.rdd-partition-cut-mb=10
# Minimal partition numbers of rdd
kylin.engine.spark.min-partition=1
# Max partition numbers of rdd
kylin.engine.spark.max-partition=5000
# Spark conf (default is in spark/conf/spark-defaults.conf)
kylin.engine.spark-conf.spark.master=yarn
kylin.engine.spark-conf.spark.submit.deployMode=cluster
kylin.engine.spark-conf.spark.yarn.queue=default
kylin.engine.spark-conf.spark.executor.memory=20G
kylin.engine.spark-conf.spark.executor.cores=8
kylin.engine.spark-conf.spark.executor.instances=6