Closed alberttwong closed 7 months ago
Answer: have you added the following options?
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._
import scala.collection.JavaConversions._
// Load the NYC green-taxi sample data and write it as a Hudi table,
// syncing the table definition into the Hive metastore (HMS mode).
//
// Fix 1: the original read used "s3://..." while basePath used "s3a://...".
// Hadoop treats these as two different filesystems; use the s3a:// scheme
// consistently so every path resolves through the same S3A connector.
// Fix 2: "fs.defaultFS" is a Hadoop configuration property, not a Hudi
// write option — passing it to DataFrameWriter.option(...) is silently
// ignored. Set it on the SparkContext's Hadoop configuration instead.
spark.sparkContext.hadoopConfiguration.set("fs.defaultFS", "s3a://huditest/")

val df = spark.read.parquet("s3a://huditest/green_tripdata_2023-01.parquet")

val databaseName = "hudi_sample"
val tableName = "hudi_coders_hive"
val basePath = "s3a://huditest/hudi_coders"

df.write.format("hudi").
  option(org.apache.hudi.config.HoodieWriteConfig.TABLE_NAME, tableName).
  // Record key and precombine field both use the pickup timestamp column.
  option(RECORDKEY_FIELD_OPT_KEY, "lpep_pickup_datetime").
  option(PRECOMBINE_FIELD_OPT_KEY, "lpep_pickup_datetime").
  // Hive sync: register/refresh the table in the Hive metastore over Thrift.
  option("hoodie.datasource.hive_sync.enable", "true").
  option("hoodie.datasource.hive_sync.mode", "hms").
  option("hoodie.datasource.hive_sync.database", databaseName).
  option("hoodie.datasource.hive_sync.table", tableName).
  option("hoodie.datasource.hive_sync.metastore.uris", "thrift://hive-metastore:9083").
  mode(Overwrite).
  save(basePath)
This example uses the NYC taxi data. Note that the bucket s3://datalake is not defined anywhere in my config files.