StarRocks / demo

Apache License 2.0
85 stars 57 forks source link

datalakehouse tutorial with hudi, iceberg, delta lake external catalog with conversion done through onetable #54

Closed alberttwong closed 8 months ago

alberttwong commented 8 months ago

Using https://github.com/StarRocks/demo/tree/master/documentation-samples/datalakehouse and https://github.com/StarRocks/demo/pull/56

Create huditest bucket.

yum install -y python3
rm -f /spark-3.2.1-bin-hadoop3.2/jars/hudi-spark3-bundle_2.12-0.11.1.jar
export SPARK_VERSION=3.2
pyspark \
  --packages org.apache.hudi:hudi-spark3.2-bundle_2.12:0.14.1 \
  --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
  --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog" \
  --conf "spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension"
from pyspark.sql.types import *

# initialize the bucket
table_name = "people"
local_base_path = "s3://huditest/hudi-dataset"
databaseName = "hudi_onetable"

records = [
   (1, 'John', 25, 'NYC', '2023-09-28 00:00:00'),
   (2, 'Emily', 30, 'SFO', '2023-09-28 00:00:00'),
   (3, 'Michael', 35, 'ORD', '2023-09-28 00:00:00'),
   (4, 'Andrew', 40, 'NYC', '2023-10-28 00:00:00'),
   (5, 'Bob', 28, 'SEA', '2023-09-23 00:00:00'),
   (6, 'Charlie', 31, 'DFW', '2023-08-29 00:00:00')
]

schema = StructType([
   StructField("id", IntegerType(), True),
   StructField("name", StringType(), True),
   StructField("age", IntegerType(), True),
   StructField("city", StringType(), True),
   StructField("create_ts", StringType(), True)
])

df = spark.createDataFrame(records, schema)

hudi_options = {
   'hoodie.table.name': table_name,
   'hoodie.datasource.write.partitionpath.field': 'city',
   'hoodie.datasource.write.hive_style_partitioning': 'true',
   'hoodie.datasource.hive_sync.enable': 'true',
   'hoodie.datasource.hive_sync.mode': 'hms',
   'hoodie.datasource.hive_sync.database': databaseName,
   'hoodie.datasource.hive_sync.table': table_name,
   'hoodie.datasource.hive_sync.metastore.uris': 'thrift://hive-metastore:9083'
}

(
   df.write
   .format("hudi")
   .options(**hudi_options)
   .mode("Overwrite")
   .save(f"{local_base_path}/{table_name}")
)
CREATE EXTERNAL CATALOG hudi_catalog_hms
PROPERTIES
(
    "type" = "hudi",
    "hive.metastore.type" = "hive",
    "hive.metastore.uris" = "thrift://hive-metastore:9083",
    "aws.s3.use_instance_profile" = "false",
    "aws.s3.access_key" = "admin",
    "aws.s3.secret_key" = "password",
    "aws.s3.region" = "us-east-1",
    "aws.s3.enable_ssl" = "false",
    "aws.s3.enable_path_style_access" = "true",
    "aws.s3.endpoint" = "http://minio:9000"
);
set catalog hudi_catalog_hms;
show databases;
use hudi_onetable;
show tables;

output

StarRocks > CREATE EXTERNAL CATALOG hudi_catalog_hms
    -> PROPERTIES
    -> (
    ->     "type" = "hudi",
    ->     "hive.metastore.type" = "hive",
    ->     "hive.metastore.uris" = "thrift://hive-metastore:9083",
    ->     "aws.s3.use_instance_profile" = "false",
    ->     "aws.s3.access_key" = "admin",
    ->     "aws.s3.secret_key" = "password",
    ->     "aws.s3.region" = "us-east-1",
    ->     "aws.s3.enable_ssl" = "false",
    ->     "aws.s3.enable_path_style_access" = "true",
    ->     "aws.s3.endpoint" = "http://minio:9000"
    -> );
Query OK, 0 rows affected (0.44 sec)

StarRocks > set catalog hudi_catalog_hms;
Query OK, 0 rows affected (0.00 sec)

StarRocks > show databases;
use hudi_onetable;
show tables;
+--------------------+
| Database           |
+--------------------+
| default            |
| hudi_onetable      |
| information_schema |
+--------------------+
3 rows in set (0.23 sec)

StarRocks > use hudi_onetable;
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A

Database changed
StarRocks > show tables;
+-------------------------+
| Tables_in_hudi_onetable |
+-------------------------+
| people                  |
+-------------------------+
1 row in set (0.00 sec)

StarRocks > select * from people;
+---------------------+------------------------+-----------------------+------------------------+-------------------------------------------------------------------------+------+---------+------+------+---------------------+
| _hoodie_commit_time | _hoodie_commit_seqno   | _hoodie_record_key    | _hoodie_partition_path | _hoodie_file_name                                                       | id   | name    | age  | city | create_ts           |
+---------------------+------------------------+-----------------------+------------------------+-------------------------------------------------------------------------+------+---------+------+------+---------------------+
| 20240222213015211   | 20240222213015211_8_12 | 20240222213015211_8_0 | city=DFW               | b503b61b-b5c9-437d-81a6-3732da898e27-0_8-42-0_20240222213015211.parquet |    6 | Charlie |   31 | DFW  | 2023-08-29 00:00:00 |
| 20240222213015211   | 20240222213015211_2_7  | 20240222213015211_2_0 | city=SFO               | 467cf3fa-18fc-4aa1-a20c-8581b4abd039-0_2-36-0_20240222213015211.parquet |    2 | Emily   |   30 | SFO  | 2023-09-28 00:00:00 |
| 20240222213015211   | 20240222213015211_7_9  | 20240222213015211_7_0 | city=SEA               | fd8ec934-da12-4cb0-9b19-9aaa524b3159-0_7-41-0_20240222213015211.parquet |    5 | Bob     |   28 | SEA  | 2023-09-23 00:00:00 |
| 20240222213015211   | 20240222213015211_4_8  | 20240222213015211_4_0 | city=ORD               | e5bb037a-8141-46d8-b3a8-d4211373d354-0_4-38-0_20240222213015211.parquet |    3 | Michael |   35 | ORD  | 2023-09-28 00:00:00 |
| 20240222213015211   | 20240222213015211_5_10 | 20240222213015211_5_0 | city=NYC               | a239d9f5-ceba-4ca5-ba83-51cea9a2731e-0_5-39-0_20240222213015211.parquet |    4 | Andrew  |   40 | NYC  | 2023-10-28 00:00:00 |
| 20240222213015211   | 20240222213015211_1_11 | 20240222213015211_1_0 | city=NYC               | 294108a6-8702-4bce-a704-0bcb6196e8bf-0_1-35-0_20240222213015211.parquet |    1 | John    |   25 | NYC  | 2023-09-28 00:00:00 |
+---------------------+------------------------+-----------------------+------------------------+-------------------------------------------------------------------------+------+---------+------+------+---------------------+
6 rows in set (5.44 sec)

StarRocks >

[!NOTE]
Issue with Minio authentication. https://github.com/onetable-io/onetable/issues/327. Updated: Fixed, waiting for PR to get into main.

[!IMPORTANT]
You have to compile the onetable code right now to get the 600+ meg utilities-0.1.0-SNAPSHOT-bundled.jar file. They're working on making it smaller but right now, there is no other option.

export AWS_ACCESS_KEY_ID=admin
export AWS_SECRET_ACCESS_KEY=password
cd /spark-3.2.1-bin-hadoop3.2/auxjars
java -jar utilities-0.1.0-SNAPSHOT-bundled.jar --datasetConfig onetable.yaml
spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:1.2.1 \
--conf "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions" \
--conf "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog" \
--conf "spark.sql.catalog.spark_catalog.type=hive" \
--conf "spark.sql.catalog.hive_prod=org.apache.iceberg.spark.SparkCatalog" \
--conf "spark.sql.catalog.hive_prod.type=hive"
CREATE SCHEMA iceberg_db LOCATION 's3://warehouse/';

CALL hive_prod.system.register_table(
   table => 'hive_prod.iceberg_db.people',
   metadata_file => 's3://huditest/hudi-dataset/people/metadata/v2.metadata.json'
);
CREATE EXTERNAL CATALOG iceberg_catalog_hms
PROPERTIES
(
    "type" = "iceberg",
    "iceberg.catalog.type" = "hive",
    "hive.metastore.uris" = "thrift://hive-metastore:9083",
    "aws.s3.use_instance_profile" = "false",
    "aws.s3.access_key" = "admin",
    "aws.s3.secret_key" = "password",
    "aws.s3.region" = "us-east-1",
    "aws.s3.enable_ssl" = "false",
    "aws.s3.enable_path_style_access" = "true",
    "aws.s3.endpoint" = "http://minio:9000"
);
set catalog iceberg_catalog_hms;
show databases;
use iceberg_db;
show tables;

output

StarRocks > CREATE EXTERNAL CATALOG iceberg_catalog_hms
    -> PROPERTIES
    -> (
    ->     "type" = "iceberg",
    ->     "iceberg.catalog.type" = "hive",
    ->     "hive.metastore.uris" = "thrift://hive-metastore:9083",
    ->     "aws.s3.use_instance_profile" = "false",
    ->     "aws.s3.access_key" = "admin",
    ->     "aws.s3.secret_key" = "password",
    ->     "aws.s3.region" = "us-east-1",
    ->     "aws.s3.enable_ssl" = "false",
    ->     "aws.s3.enable_path_style_access" = "true",
    ->     "aws.s3.endpoint" = "http://minio:9000"
    -> );
Query OK, 0 rows affected (0.03 sec)

StarRocks > set catalog iceberg_catalog_hms;
Query OK, 0 rows affected (0.00 sec)

StarRocks > show databases;
+--------------------+
| Database           |
+--------------------+
| default            |
| hudi_onetable      |
| iceberg_db         |
| information_schema |
+--------------------+
4 rows in set (0.15 sec)

StarRocks > use iceberg_db;
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A

Database changed
StarRocks > show tables;
+----------------------+
| Tables_in_iceberg_db |
+----------------------+
| people               |
+----------------------+
1 row in set (0.04 sec)

StarRocks > select * from people;
+---------------------+------------------------+-----------------------+------------------------+-------------------------------------------------------------------------+------+---------+------+------+---------------------+
| _hoodie_commit_time | _hoodie_commit_seqno   | _hoodie_record_key    | _hoodie_partition_path | _hoodie_file_name                                                       | id   | name    | age  | city | create_ts           |
+---------------------+------------------------+-----------------------+------------------------+-------------------------------------------------------------------------+------+---------+------+------+---------------------+
| 20240222213015211   | 20240222213015211_8_12 | 20240222213015211_8_0 | city=DFW               | b503b61b-b5c9-437d-81a6-3732da898e27-0_8-42-0_20240222213015211.parquet |    6 | Charlie |   31 | DFW  | 2023-08-29 00:00:00 |
| 20240222213015211   | 20240222213015211_4_8  | 20240222213015211_4_0 | city=ORD               | e5bb037a-8141-46d8-b3a8-d4211373d354-0_4-38-0_20240222213015211.parquet |    3 | Michael |   35 | ORD  | 2023-09-28 00:00:00 |
| 20240222213015211   | 20240222213015211_5_10 | 20240222213015211_5_0 | city=NYC               | a239d9f5-ceba-4ca5-ba83-51cea9a2731e-0_5-39-0_20240222213015211.parquet |    4 | Andrew  |   40 | NYC  | 2023-10-28 00:00:00 |
| 20240222213015211   | 20240222213015211_1_11 | 20240222213015211_1_0 | city=NYC               | 294108a6-8702-4bce-a704-0bcb6196e8bf-0_1-35-0_20240222213015211.parquet |    1 | John    |   25 | NYC  | 2023-09-28 00:00:00 |
| 20240222213015211   | 20240222213015211_2_7  | 20240222213015211_2_0 | city=SFO               | 467cf3fa-18fc-4aa1-a20c-8581b4abd039-0_2-36-0_20240222213015211.parquet |    2 | Emily   |   30 | SFO  | 2023-09-28 00:00:00 |
| 20240222213015211   | 20240222213015211_7_9  | 20240222213015211_7_0 | city=SEA               | fd8ec934-da12-4cb0-9b19-9aaa524b3159-0_7-41-0_20240222213015211.parquet |    5 | Bob     |   28 | SEA  | 2023-09-23 00:00:00 |
+---------------------+------------------------+-----------------------+------------------------+-------------------------------------------------------------------------+------+---------+------+------+---------------------+
6 rows in set (0.24 sec)
spark-sql --packages io.delta:delta-core_2.12:2.0.0 \
--conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" \
--conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" \
--conf "spark.sql.catalogImplementation=hive"
CREATE SCHEMA delta_db LOCATION 's3://warehouse/';

CREATE TABLE delta_db.people USING DELTA LOCATION 's3://huditest/hudi-dataset/people';
CREATE EXTERNAL CATALOG deltalake_catalog_hms
PROPERTIES
(
    "type" = "deltalake",
    "hive.metastore.type" = "hive",
    "hive.metastore.uris" = "thrift://hive-metastore:9083",
    "aws.s3.use_instance_profile" = "false",
    "aws.s3.access_key" = "admin",
    "aws.s3.secret_key" = "password",
    "aws.s3.region" = "us-east-1",
    "aws.s3.enable_ssl" = "false",
    "aws.s3.enable_path_style_access" = "true",
    "aws.s3.endpoint" = "http://minio:9000"
);
set catalog deltalake_catalog_hms;
show databases;
use delta_db;
show tables;

output

StarRocks > CREATE EXTERNAL CATALOG deltalake_catalog_hms
    -> PROPERTIES
    -> (
    ->     "type" = "deltalake",
    ->     "hive.metastore.type" = "hive",
    ->     "hive.metastore.uris" = "thrift://hive-metastore:9083",
    ->     "aws.s3.use_instance_profile" = "false",
    ->     "aws.s3.access_key" = "admin",
    ->     "aws.s3.secret_key" = "password",
    ->     "aws.s3.region" = "us-east-1",
    ->     "aws.s3.enable_ssl" = "false",
    ->     "aws.s3.enable_path_style_access" = "true",
    ->     "aws.s3.endpoint" = "http://minio:9000"
    -> );
Query OK, 0 rows affected (0.06 sec)

StarRocks > set catalog deltalake_catalog_hms;
Query OK, 0 rows affected (0.01 sec)

StarRocks > show databases;
+--------------------+
| Database           |
+--------------------+
| default            |
| delta_db           |
| hudi_onetable      |
| iceberg_db         |
| information_schema |
+--------------------+
5 rows in set (0.04 sec)

StarRocks > use delta_db;
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A

Database changed
StarRocks > show tables;
+--------------------+
| Tables_in_delta_db |
+--------------------+
| people             |
+--------------------+
1 row in set (0.08 sec)

StarRocks > select * from people;
+---------------------+------------------------+-----------------------+------------------------+-------------------------------------------------------------------------+------+---------+------+------+---------------------+
| _hoodie_commit_time | _hoodie_commit_seqno   | _hoodie_record_key    | _hoodie_partition_path | _hoodie_file_name                                                       | id   | name    | age  | city | create_ts           |
+---------------------+------------------------+-----------------------+------------------------+-------------------------------------------------------------------------+------+---------+------+------+---------------------+
| 20240222213015211   | 20240222213015211_4_8  | 20240222213015211_4_0 | city=ORD               | e5bb037a-8141-46d8-b3a8-d4211373d354-0_4-38-0_20240222213015211.parquet |    3 | Michael |   35 | ORD  | 2023-09-28 00:00:00 |
| 20240222213015211   | 20240222213015211_2_7  | 20240222213015211_2_0 | city=SFO               | 467cf3fa-18fc-4aa1-a20c-8581b4abd039-0_2-36-0_20240222213015211.parquet |    2 | Emily   |   30 | SFO  | 2023-09-28 00:00:00 |
| 20240222213015211   | 20240222213015211_8_12 | 20240222213015211_8_0 | city=DFW               | b503b61b-b5c9-437d-81a6-3732da898e27-0_8-42-0_20240222213015211.parquet |    6 | Charlie |   31 | DFW  | 2023-08-29 00:00:00 |
| 20240222213015211   | 20240222213015211_7_9  | 20240222213015211_7_0 | city=SEA               | fd8ec934-da12-4cb0-9b19-9aaa524b3159-0_7-41-0_20240222213015211.parquet |    5 | Bob     |   28 | SEA  | 2023-09-23 00:00:00 |
| 20240222213015211   | 20240222213015211_1_11 | 20240222213015211_1_0 | city=NYC               | 294108a6-8702-4bce-a704-0bcb6196e8bf-0_1-35-0_20240222213015211.parquet |    1 | John    |   25 | NYC  | 2023-09-28 00:00:00 |
| 20240222213015211   | 20240222213015211_5_10 | 20240222213015211_5_0 | city=NYC               | a239d9f5-ceba-4ca5-ba83-51cea9a2731e-0_5-39-0_20240222213015211.parquet |    4 | Andrew  |   40 | NYC  | 2023-10-28 00:00:00 |
+---------------------+------------------------+-----------------------+------------------------+-------------------------------------------------------------------------+------+---------+------+------+---------------------+
6 rows in set (0.11 sec)

StarRocks >
alberttwong commented 8 months ago
CREATE SCHEMA iceberg_db LOCATION 's3://warehouse/';

CALL hive_prod.system.register_table(
   table => 'hive_prod.iceberg_db.user_behavior',
   metadata_file => 's3://huditest/hudi_ecommerce_user_behavior/metadata/v2.metadata.json'
);

CALL hive_prod.system.register_table(
   table => 'hive_prod.iceberg_db.item',
   metadata_file => 's3://huditest/hudi_ecommerce_item/metadata/v2.metadata.json'
);
CREATE SCHEMA delta_db LOCATION 's3://warehouse/';

CREATE TABLE delta_db.user_behavior USING DELTA LOCATION 's3://huditest/hudi_ecommerce_user_behavior';

CREATE TABLE delta_db.item USING DELTA LOCATION 's3://huditest/hudi_ecommerce_item';