[SUPPORT] Hudi wants to write the database in s3://datalake

apache / hudi

Upserts, Deletes And Incremental Processing on Big Data.

Apache License 2.0

5.35k stars 2.42k forks source link

I am using the NYC green taxi trip data. I don't define the bucket s3://datalake anywhere in my config files, yet Hudi tries to write the database there.

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._
import scala.collection.JavaConversions._

// Load the source Parquet file into a DataFrame.
val df = spark.read.parquet("s3://huditest/green_tripdata_2023-01.parquet")

// Hive database/table the Hudi table is synced to, and the Hudi base path.
val databaseName = "hudi_sample"
val tableName = "hudi_coders_hive"
val basePath = "s3a://huditest/hudi_coders"

// Write the DataFrame as a Hudi table (overwriting basePath) and sync it to the
// Hive metastore in HMS mode.
// Hudi options are given by their string keys instead of the deprecated
// TABLE_NAME / *_OPT_KEY constants, consistent with the hive_sync options below.
// NOTE(review): the read path uses s3:// while basePath uses s3a:// — confirm both
// schemes are configured for this cluster.
// NOTE(review): lpep_pickup_datetime is both the record key and the precombine field;
// rows sharing a pickup timestamp presumably collapse into one record — verify intended.
// NOTE(review): none of these options names a database location; if hive sync creates
// the database, its location presumably comes from the metastore's own warehouse
// setting — check the metastore config if an unexpected bucket shows up.
// Trailing dots keep the chain pasteable line by line into spark-shell.
df.write.format("hudi").
  option("hoodie.table.name", tableName).
  option("hoodie.datasource.write.recordkey.field", "lpep_pickup_datetime").
  option("hoodie.datasource.write.precombine.field", "lpep_pickup_datetime").
  option("hoodie.datasource.hive_sync.enable", "true").
  option("hoodie.datasource.hive_sync.mode", "hms").
  option("hoodie.datasource.hive_sync.database", databaseName).
  option("hoodie.datasource.hive_sync.table", tableName).
  option("hoodie.datasource.hive_sync.metastore.uris", "thrift://hive-metastore:9083").
  mode(Overwrite).
  save(basePath)

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._
import scala.collection.JavaConversions._

// Load the source Parquet file into a DataFrame.
val df = spark.read.parquet("s3://huditest/green_tripdata_2023-01.parquet")

// Hive database/table the Hudi table is synced to, and the Hudi base path.
val databaseName = "hudi_sample"
val tableName = "hudi_coders_hive"
val basePath = "s3a://huditest/hudi_coders"

// Variant of the write that additionally passes fs.defaultFS as a writer option.
// Hudi options are given by their string keys instead of the deprecated
// TABLE_NAME / *_OPT_KEY constants, consistent with the hive_sync options below.
// NOTE(review): fs.defaultFS is handed to the DataFrameWriter here; whether it reaches
// the Hadoop configuration used by Hudi / hive sync is not visible from this script —
// confirm, or set it on the Spark session's Hadoop configuration instead.
// NOTE(review): the read path and fs.defaultFS use s3:// while basePath uses s3a:// —
// confirm both schemes are configured for this cluster.
// Trailing dots keep the chain pasteable line by line into spark-shell.
df.write.format("hudi").
  option("hoodie.table.name", tableName).
  option("hoodie.datasource.write.recordkey.field", "lpep_pickup_datetime").
  option("hoodie.datasource.write.precombine.field", "lpep_pickup_datetime").
  option("hoodie.datasource.hive_sync.enable", "true").
  option("hoodie.datasource.hive_sync.mode", "hms").
  option("hoodie.datasource.hive_sync.database", databaseName).
  option("hoodie.datasource.hive_sync.table", tableName).
  option("hoodie.datasource.hive_sync.metastore.uris", "thrift://hive-metastore:9083").
  option("fs.defaultFS", "s3://huditest/").
  mode(Overwrite).
  save(basePath)

apache / hudi

[SUPPORT] Hudi wants to write the database in s3://datalake #10695