Open tcluzhe opened 2 years ago
With code like the following, Spline cannot capture the Input Data Source:
Input Data Source
# Minimal reproduction script: a table is read, checkpointed, and written to a
# second table while the Spline query-execution listener is registered.
# According to the report, Spline does not capture the input data source for
# the final write when the DataFrame has been checkpointed.
from pyspark.sql import SparkSession

# Session with the Spline listener registered and Hive support enabled.
spark = (
    SparkSession.builder
    .config(
        'spark.sql.queryExecutionListeners',
        'za.co.absa.spline.harvester.listener.SplineQueryExecutionListener',
    )
    .config('spark.spline.producer.url', 'http://master-1-1:8080/producer')
    .enableHiveSupport()
    .getOrCreate()
)


def generate_data():
    """Create the source table ``test.table`` with two sample rows."""
    data = [
        ('a', 1),
        ('b', 2),
    ]
    df = spark.createDataFrame(data, ['name', 'value'])
    df.write.saveAsTable('test.table', mode='overwrite')


def test():
    """Read ``test.table``, checkpoint it, and write it to ``test.table2``."""
    # checkpoint() requires a checkpoint directory to be set beforehand.
    spark.sparkContext.setCheckpointDir('/tmp/checkpoint')
    df = spark.table('test.table')
    # The checkpoint is the step under investigation: the write below is the
    # one for which the input data source is reported missing.
    df = df.checkpoint()
    df.write.saveAsTable('test.table2', mode='overwrite')


if __name__ == '__main__':
    # Left disabled as in the original report; presumably run once beforehand
    # so that test.table exists.
    # generate_data()
    test()
+1, we would definitely appreciate this feature too (either way, we're losing a whole chunk of lineage).
With code like this, Spline cannot capture the
Input Data Source