Closed: suckowuw closed this issue 1 month ago.
Found a solution. Sample code:
```python
import os
import sys
import time

import delta_sharing
from pyspark.sql import SparkSession

start = time.time()
table_url = f"config.share#dsh.demo_tables_ext.{sys.argv[1]}"

spark = SparkSession.builder.appName("delta-sharing") \
    .config("spark.jars.packages", "io.delta:delta-sharing-spark_2.12:0.7.5") \
    .getOrCreate()
spark.sparkContext.setLogLevel("WARN")

client = delta_sharing.SharingClient("config.share")
print(client.list_all_tables())

spark_df = delta_sharing.load_as_spark(table_url)
print(spark_df.count(), " rows")
print("Sample rows:")
spark_df.show(3)

elapsed = time.time() - start
print(elapsed, " seconds elapsed, reading.")  # time in seconds

start = time.time()
spark_df.write.option("delimiter", ";").option("header", True).csv("abfss://...")  # target path truncated in the original
elapsed = time.time() - start
print(elapsed, " seconds elapsed, writing.")  # time in seconds
```
Run with: `spark-submit --deploy-mode client --jars delta-sharing-spark_2.12-0.7.5.jar dsh_spark.py dshdelta100m`
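The key difference from the failing setup below is the Scala suffix: the delta-sharing-spark artifact must be built for the same Scala version as the cluster's Spark. A quick way to check that version from PySpark, as a sketch (it goes through the py4j `_jvm` gateway, an internal but commonly used handle):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Prints e.g. "version 2.12.10"; pick the matching artifact suffix,
# here io.delta:delta-sharing-spark_2.12:0.7.5.
print(spark.sparkContext._jvm.scala.util.Properties.versionString())
```

`spark-submit --version` prints the same information without starting a session.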
Dear all, I need help: I can't get `load_as_spark` working.

Submit command:
`spark-submit --deploy-mode client --files config.share --jars delta-core_2.13-2.1.1.jar,delta-sharing-spark_2.13-0.7.5.jar dsh_spark.py`
Script (`dsh_spark.py`):

```python
import os

import delta_sharing
from pyspark.sql import SparkSession

client = delta_sharing.SharingClient("config.share")
print(client.list_all_tables())

table_url = "config.share#dsh.demo_tables_ext.dshdelta1m"

spark = SparkSession.builder.appName("delta-sharing") \
    .config("spark.jars.packages", "io.delta:delta-sharing-spark_2.13-0.7.5") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

spark_df = delta_sharing.load_as_spark(table_url)
print(len(spark_df), " rows")
print("Sample rows:")
spark_df.show(3)
```
It fails with:

```
Traceback (most recent call last):
  File "dsh_spark.py", line 13, in <module>
  File "/app/localstorage/skylab/python-dev/lib/python3.7/site-packages/delta_sharing/delta_sharing.py", line 163, in load_as_spark
  File "/opt/cloudera/parcels/CDH-7.2.15-1.cdh7.2.15.p3.30895120/lib/spark3/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 158, in load
  File "/opt/cloudera/parcels/CDH-7.2.15-1.cdh7.2.15.p3.30895120/lib/spark3/python/lib/py4j-0.10.9.3-src.zip/py4j/java_gateway.py", line 1322, in __call__
  File "/opt/cloudera/parcels/CDH-7.2.15-1.cdh7.2.15.p3.30895120/lib/spark3/python/lib/pyspark.zip/pyspark/sql/utils.py", line 111, in deco
  File "/opt/cloudera/parcels/CDH-7.2.15-1.cdh7.2.15.p3.30895120/lib/spark3/python/lib/py4j-0.10.9.3-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o91.load.
: java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.delta.sources.DeltaDataSource could not be instantiated
Caused by: java.lang.NoClassDefFoundError: scala/collection/IterableOnce
Caused by: java.lang.ClassNotFoundException: scala.collection.IterableOnce
```
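The root cause is in the last two lines: `scala.collection.IterableOnce` exists only in the Scala 2.13 standard library, while the Spark 3 build shipped with CDH 7.2.15 is compiled against Scala 2.12, so the `_2.13` delta jars cannot be instantiated. Switching to the `_2.12` artifacts, as in the solution above, resolves it. While debugging a classpath issue like this, the share can also be read without Spark at all; a minimal sketch using the connector's pandas loader, assuming the table fits in driver memory:

```python
import delta_sharing

# Same profile file and table as above; no Spark jars are involved,
# so a Scala version mismatch cannot occur on this path.
table_url = "config.share#dsh.demo_tables_ext.dshdelta1m"

pdf = delta_sharing.load_as_pandas(table_url)
print(len(pdf), "rows")
print(pdf.head(3))
```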