Open manishgaurav84 opened 1 year ago
@manishgaurav84 Can you post the command which you are using for metadata indexing?
I am trying the inline indexing mode, like we have for clustering or compaction. The above options are passed in df.write.options. I assume this will kick off data upsert and then indexing.
Got it, I will try to triage this. Thanks for quick response.
try these
# Import all dependencies up front. If anything is missing we print a clear
# message and re-raise so the job fails fast, instead of silently continuing
# and hitting a confusing NameError later in the script.
try:
    import sys
    import os
    from pyspark.context import SparkContext
    from pyspark.sql.session import SparkSession
    from awsglue.context import GlueContext
    from awsglue.job import Job
    from awsglue.dynamicframe import DynamicFrame
    from pyspark.sql.functions import col, to_timestamp, monotonically_increasing_id, to_date, when
    from pyspark.sql.functions import *
    from awsglue.utils import getResolvedOptions
    from pyspark.sql.types import *
    from datetime import datetime, date
    import boto3
    from functools import reduce
    from pyspark.sql import Row
    import uuid
    from faker import Faker
except Exception as e:
    print("Modules are missing : {} ".format(e))
    raise
# Spark session tuned for Hudi on Glue: Kryo serialization, the Hudi SQL
# catalog/extension, and legacy path-option behavior. The settings are kept
# in a map and applied in insertion order, matching the chained .config calls.
_spark_settings = {
    'spark.serializer': 'org.apache.spark.serializer.KryoSerializer',
    'spark.sql.hive.convertMetastoreParquet': 'false',
    'spark.sql.catalog.spark_catalog': 'org.apache.spark.sql.hudi.catalog.HoodieCatalog',
    'spark.sql.extensions': 'org.apache.spark.sql.hudi.HoodieSparkSessionExtension',
    'spark.sql.legacy.pathOptionBehavior.enabled': 'true',
}
_builder = SparkSession.builder
for _setting_key, _setting_value in _spark_settings.items():
    _builder = _builder.config(_setting_key, _setting_value)
spark = _builder.getOrCreate()
sc = spark.sparkContext
glueContext = GlueContext(sc)
job = Job(glueContext)
logger = glueContext.get_logger()
# =================================INSERTING DATA =====================================
# Shared Faker instance used by DataGenerator to produce synthetic values.
# (A `global` statement at module scope is a no-op, so none is needed.)
faker = Faker()
class DataGenerator(object):
    """Produces synthetic employee records for exercising Hudi writes."""

    @staticmethod
    def get_data(count=10000):
        """Return *count* synthetic employee tuples (default 10000).

        Each tuple is (emp_id, employee_name, department, state, salary,
        age, bonus, ts, email, credit_card); every field is a string so
        the records map directly onto a simple string-typed schema.
        """
        return [
            (
                str(uuid.uuid4()),  # unique record key
                faker.name(),
                faker.random_element(elements=('IT', 'HR', 'Sales', 'Marketing')),
                faker.random_element(elements=('CA', 'NY', 'TX', 'FL', 'IL', 'RJ')),
                str(faker.random_int(min=10000, max=150000)),
                str(faker.random_int(min=18, max=60)),
                str(faker.random_int(min=0, max=100000)),
                str(faker.unix_time()),  # precombine/ordering field
                faker.email(),
                faker.credit_card_number(card_type='amex'),
            )
            for _ in range(count)
        ]
# ============================== Settings =======================================
# Target Glue/Hive database and table for the Hudi sync.
db_name = "hudidb"
table_name = "employees"
# Hudi record key (primary key) and precombine (ordering) fields.
recordkey = 'emp_id'
precombine = "ts"
# Column used to partition the table on storage.
PARTITION_FIELD = 'state'
# Base S3 path for the Hudi table (bucket name redacted).
path = "s3://XXXXXXXXXXX/hudi/"
# Write operation and table type; MOR tables write delta logs + base files.
method = 'upsert'
table_type = "MERGE_ON_READ"
# ====================================================================================
# Hudi write options: basic table/write settings, HMS hive-sync, and async
# metadata indexing (column stats) with an in-process lock for the
# concurrent writer + indexer. Cleaning/clustering are handled separately.
hudi_part_write_config = {
    'className': 'org.apache.hudi',
    'hoodie.table.name': table_name,
    'hoodie.datasource.write.table.type': table_type,
    'hoodie.datasource.write.operation': method,
    'hoodie.datasource.write.recordkey.field': recordkey,
    'hoodie.datasource.write.precombine.field': precombine,
    'hoodie.schema.on.read.enable': 'true',
    'hoodie.datasource.write.reconcile.schema': 'true',
    'hoodie.datasource.hive_sync.mode': 'hms',
    'hoodie.datasource.hive_sync.enable': 'true',
    'hoodie.datasource.hive_sync.use_jdbc': 'false',
    'hoodie.datasource.hive_sync.support_timestamp': 'false',
    'hoodie.datasource.hive_sync.database': db_name,
    'hoodie.datasource.hive_sync.table': table_name,
    'hoodie.clean.automatic': 'false',
    'hoodie.clean.async': 'false',
    'hoodie.clustering.async.enabled': 'true',
    'hoodie.metadata.enable': 'true',
    'hoodie.metadata.index.async': 'true',
    'hoodie.metadata.index.column.stats.enable': 'true',
    'hoodie.metadata.index.check.timeout.seconds': '60',
    'hoodie.write.concurrency.mode': 'optimistic_concurrency_control',
    'hoodie.write.lock.provider': 'org.apache.hudi.client.transaction.lock.InProcessLockProvider',
}
# Write five batches of fresh synthetic data to the Hudi table so the async
# metadata indexer has several commits to work against.
# The column list is loop-invariant, so build it once up front.
columns = ["emp_id", "employee_name", "department", "state", "salary", "age", "bonus", "ts", "email", "credit_card"]
for _ in range(5):
    data = DataGenerator.get_data()
    spark_df = spark.createDataFrame(data=data, schema=columns)
    spark_df.write.format("hudi").options(**hudi_part_write_config).mode("append").save(path)
@manishgaurav84 Did you get a chance to try out the code shared by @soumilshah1995? Are you still facing this issue?
I had tested it; it should work :D
@manishgaurav84 Did it work? Feel free to close this issue if you are good.
I am using the Glue 3.0 version to execute metadata indexing for a Hudi table.
# COW-table write options with hive-style partitioning and metadata-based
# BLOOM indexing (bloom filter + column stats partitions in the MDT).
non_global_bloom_indexing = {
    'hoodie.table.name': tbl_name,
    'hoodie.datasource.write.precombine.field': precombine,
    'hoodie.datasource.write.recordkey.field': primary_key,
    'hoodie.datasource.write.table.type': 'COPY_ON_WRITE',
    'hoodie.datasource.write.hive_style_partitioning': 'true',
    'hoodie.datasource.write.operation': 'upsert',
    'hoodie.datasource.hive_sync.enable': 'true',
    'hoodie.datasource.hive_sync.table': tbl_name,
    'hoodie.datasource.hive_sync.database': db_name,
    'hoodie.datasource.hive_sync.partition_fields': partition_fields,
    'hoodie.datasource.hive_sync.mode': 'hms',
    'hoodie.datasource.hive_sync.partition_extractor_class': 'org.apache.hudi.hive.HiveStylePartitionValueExtractor',
    'hoodie.datasource.write.keygenerator.class': 'org.apache.hudi.keygen.SimpleKeyGenerator',
    'hoodie.datasource.write.partitionpath.field': partition_fields,
    'hoodie.upsert.shuffle.parallelism': 189,
    'hoodie.metadata.enable': 'true',
    'hoodie.metadata.index.bloom.filter.enable': 'true',
    'hoodie.metadata.index.column.stats.enable': 'true',
    'hoodie.metadata.index.column.stats.column.list': partition_fields,
    'hoodie.enable.data.skipping': 'true',
    'hoodie.index.type': 'BLOOM',
    'hoodie.bloom.index.use.metadata': 'true',
    'hoodie.bloom.index.filter.type': 'DYNAMIC_V0',
    'hoodie.bloom.index.use.caching': 'true',
}
As expected, the following partition folders are created inside `.hoodie/metadata/`: `bloom_filters/`, `column_stats/`, and `files/`.
However, the timeline files `.indexing.requested`, `indexing.inflight`, and `indexing` are not created.