Bug description
Trying to read Parquet files from an S3 path with spark.read.format("parquet").load() and getting the exception below.
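For reference, a minimal sketch of the failing read; the bucket and path below are placeholders, not the real location:

# Placeholder S3 location; any Parquet dataset read through s3a hits the same code path.
df = spark.read.format("parquet").load("s3a://my-bucket/path/to/parquet/")
df.show()  # action that triggers the task failure traced below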
Exception Trace
24/08/23 05:32:46 ERROR TaskResources: Task 17 failed by error:
java.lang.IllegalStateException: error loading native libraries: java.io.FileNotFoundException: x86_64/libarrow_cdata_jni.so
at org.apache.arrow.c.jni.JniLoader.load(JniLoader.java:95)
at org.apache.arrow.c.jni.JniLoader.loadRemaining(JniLoader.java:76)
at org.apache.arrow.c.jni.JniLoader.ensureLoaded(JniLoader.java:60)
at org.apache.arrow.c.jni.JniWrapper.get(JniWrapper.java:27)
at org.apache.arrow.c.SchemaExporter.export(SchemaExporter.java:117)
at org.apache.arrow.c.Data.exportField(Data.java:59)
at org.apache.arrow.c.Data.exportSchema(Data.java:75)
at org.apache.gluten.utils.ArrowAbiUtil$.exportSchema(ArrowAbiUtil.scala:134)
at org.apache.gluten.execution.RowToVeloxColumnarExec$.toColumnarBatchIterator(RowToVeloxColumnarExec.scala:125)
at org.apache.gluten.execution.RowToVeloxColumnarExec.$anonfun$doExecuteColumnarInternal$2(RowToVeloxColumnarExec.scala:72)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:855)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:855)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
at org.apache.gluten.execution.ColumnarInputRDDsWrapper.$anonfun$getIterators$1(WholeStageTransformer.scala:445)
at scala.collection.immutable.List.flatMap(List.scala:366)
at org.apache.gluten.execution.ColumnarInputRDDsWrapper.getIterators(WholeStageTransformer.scala:436)
at org.apache.gluten.execution.WholeStageZippedPartitionsRDD.$anonfun$compute$1(WholeStageZippedPartitionsRDD.scala:48)
at org.apache.gluten.utils.Arm$.withResource(Arm.scala:25)
at org.apache.gluten.metrics.GlutenTimeMetric$.millis(GlutenTimeMetric.scala:37)
at org.apache.gluten.execution.WholeStageZippedPartitionsRDD.compute(WholeStageZippedPartitionsRDD.scala:46)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
at org.apache.spark.scheduler.Task.run(Task.scala:141)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
Backend
VL (Velox)
Spark version
Spark-3.5.x
Spark configurations
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Gluten-Velox Example") \
    .config("spark.plugins", "org.apache.gluten.GlutenPlugin") \
    .config("spark.jars", "/home/ray/.ivy2/jars/com.amazonaws_aws-java-sdk-bundle-1.12.262.jar,/home/ray/.ivy2/jars/org.apache.spark_spark-hadoop-cloud_2.12-3.5.0.jar,/home/ray/.ivy2/jars/org.apache.hadoop_hadoop-aws-3.3.4.jar,https://d11-icebergtable.s3.amazonaws.com/velox/velox-latest/gluten-velox-bundle-spark3.5_2.12-ubuntu_20.04_x86_64-1.2.0.jar,https://d11-icebergtable.s3.amazonaws.com/velox/velox-latest/gluten-thirdparty-lib-ubuntu-20.04-x86_64.jar") \
    .config("spark.gluten.loadLibFromJar", "true") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.memory.offHeap.size", "1G") \
    .config("spark.hadoop.fs.s3a.secret.key", "MGUBII62jZDNYFIpu117oP/MyON5WOEzq3KALdJr") \
    .config("spark.hadoop.fs.s3a.access.key", "AKIARENNYYZKYIMINT23") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .config("spark.hadoop.fs.s3a.path.style.access", "false") \
    .config("spark.hadoop.fs.s3a.endpoint", "https://s3.us-east-1.amazonaws.com") \
    .config("spark.jars.packages", "com.amazonaws:aws-java-sdk-bundle:1.12.262,org.apache.hadoop:hadoop-aws:3.3.4,org.apache.spark:spark-hadoop-cloud_2.12:3.5.0") \
    .getOrCreate()
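A workaround sketch I have not verified: the missing x86_64/libarrow_cdata_jni.so is, as far as I know, the native library that Arrow Java ships inside its arrow-c-data jar, so pulling that artifact onto the classpath explicitly via spark.jars.packages might let JniLoader find it. The Arrow version below is a guess and would need to match the Arrow version the Gluten 1.2.0 bundle was built against:

# Unverified sketch: org.apache.arrow:arrow-c-data bundles libarrow_cdata_jni.so inside the jar.
# The 15.0.0 version is an assumption; it should match Gluten's own Arrow dependency.
spark = SparkSession.builder \
    .appName("Gluten-Velox Example") \
    .config("spark.plugins", "org.apache.gluten.GlutenPlugin") \
    .config("spark.jars.packages",
            "com.amazonaws:aws-java-sdk-bundle:1.12.262,"
            "org.apache.hadoop:hadoop-aws:3.3.4,"
            "org.apache.spark:spark-hadoop-cloud_2.12:3.5.0,"
            "org.apache.arrow:arrow-c-data:15.0.0") \
    .getOrCreate()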
System information
Arch: x86_64
CPU: Intel Xeon
OS: Ubuntu
Relevant logs
No response