@soumilshah1995 I am able to run clustering successfully with your code. The third block, for show_clustering, fails as expected because it tries to resolve the table by name while we are passing the path.
Can you clarify when you are seeing this error: java.util.NoSuchElementException: No value present in Option? I didn't hit it with Glue 4.0.
Code - https://gist.github.com/ad1happy2go/4929bf4d4247c320431f147d9f1407f2
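For reference, a minimal sketch of the two call styles for the clustering procedures (assuming a Hudi release where show_clustering accepts either a table or a path named argument; the table form requires the table to be registered in the catalog, e.g. via hive_sync, and parameter support can vary across Hudi versions):

# Hypothetical standalone example; `spark` is an active SparkSession with the Hudi extensions enabled
table_fqn = "default.issue_8919"
base_path = "s3://soumilshah-hudi-demos/output/issue_8919"

# Catalog lookup by table name -- fails if the table is not synced/registered in the metastore
spark.sql(f"call show_clustering(table => '{table_fqn}')").show()

# Direct lookup by base path -- no catalog entry needed
spark.sql(f"call show_clustering(path => '{base_path}')").show()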
let me try today and get back to you :D
ALL SET
try:
    import sys, os, uuid
    from pyspark.context import SparkContext
    from pyspark.sql.session import SparkSession
    from awsglue.context import GlueContext
    from awsglue.job import Job
    from awsglue.utils import getResolvedOptions
    from pyspark.sql.types import *
    from faker import Faker
except Exception as e:
    print("Modules are missing: {}".format(e))
# Get command-line arguments
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
# Create a Spark session and Glue context
spark = (SparkSession.builder.config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
         .config('spark.sql.hive.convertMetastoreParquet', 'false')
         .config('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.hudi.catalog.HoodieCatalog')
         .config('spark.sql.extensions', 'org.apache.spark.sql.hudi.HoodieSparkSessionExtension')
         .config('spark.sql.legacy.pathOptionBehavior.enabled', 'true')
         .getOrCreate())
sc = spark.sparkContext
glueContext = GlueContext(sc)
job = Job(glueContext)
logger = glueContext.get_logger()
job.init(args['JOB_NAME'], args)
# =================================INSERTING DATA =====================================
global faker
faker = Faker()
class DataGenerator(object):
    @staticmethod
    def get_data():
        return [
            (
                uuid.uuid4().__str__(),
                faker.name(),
                faker.random_element(elements=('IT', 'HR', 'Sales', 'Marketing')),
                faker.random_element(elements=('CA', 'NY', 'TX', 'FL', 'IL', 'RJ')),
                str(faker.random_int(min=10000, max=150000)),
                str(faker.random_int(min=18, max=60)),
                str(faker.random_int(min=0, max=100000)),
                str(faker.unix_time()),
                faker.email(),
                faker.credit_card_number(card_type='amex'),
            ) for x in range(100)
        ]
data = DataGenerator.get_data()
columns = ["emp_id", "employee_name", "department", "state", "salary", "age", "bonus", "ts", "email", "credit_card"]
spark_df = spark.createDataFrame(data=data, schema=columns)
# ============================== Settings =======================================
db_name = "default"
table_name = "issue_8919"
recordkey = 'emp_id'
precombine = "ts"
PARTITION_FIELD = 'state'
path = "s3://soumilshah-hudi-demos/output/" + table_name
method = 'upsert'
table_type = "COPY_ON_WRITE"
# ====================================================================================
hudi_part_write_config = {
    'className': 'org.apache.hudi',
    "hoodie.schema.on.read.enable": "true",
    "hoodie.datasource.write.reconcile.schema": "true",
    "hoodie.avro.schema.external.transformation": "true",
    'hoodie.avro.schema.validate': "true",
    "hoodie.datasource.write.schema.allow.auto.evolution.column.drop": "true",
    'hoodie.table.name': table_name,
    'hoodie.datasource.write.table.type': table_type,
    'hoodie.datasource.write.operation': method,
    'hoodie.datasource.write.recordkey.field': recordkey,
    'hoodie.datasource.write.precombine.field': precombine,
    'hoodie.datasource.hive_sync.mode': 'hms',
    'hoodie.datasource.hive_sync.use_jdbc': 'false',
    'hoodie.datasource.hive_sync.enable': 'true'
}
spark_df.write.format("hudi").options(**hudi_part_write_config).mode("append").save(path)
query_show_commits = f"call show_commits('{db_name}.{table_name}', 5)"
spark_df_commits = spark.sql(query_show_commits)
commits = list(map(lambda row: row[0], spark_df_commits.collect()))
spark_df_commits.show()
try:
    print("Trying clustering 1..")
    query_run_clustering = f"call run_clustering('{db_name}.{table_name}')"
    spark_df_clusterings = spark.sql(query_run_clustering)
    spark_df_clusterings.show()
    print(" clustering 1 complete ")
except Exception as e:
    print("Error 1", e)
    raise e
try:
    print("Try show clustering 2")
    query = f"call show_clustering('{db_name}.{table_name}')"
    result_df = spark.sql(query)
    result_df.show()
    print("Complete show clustering 2 ")
except Exception as e:
    print("Error show clustering 2", e)
    raise e
+-----------------+-------------------+-----------------+-------------------+------------------------+---------------------+----------------------------+------------+
|      commit_time|total_bytes_written|total_files_added|total_files_updated|total_partitions_written|total_records_written|total_update_records_written|total_errors|
+-----------------+-------------------+-----------------+-------------------+------------------------+---------------------+----------------------------+------------+
|20230704125519577|             445969|                1|                  0|                       1|                  100|                           0|           0|
+-----------------+-------------------+-----------------+-------------------+------------------------+---------------------+----------------------------+------------+
Trying clustering 1..
+-----------------+----------------+---------+-------------------+
|        timestamp|input_group_size|    state|involved_partitions|
+-----------------+----------------+---------+-------------------+
|20230704125731917|               1|COMPLETED|                  *|
+-----------------+----------------+---------+-------------------+
clustering 1 complete
Try show clustering 2
+-----------------+----------------+---------+-------------------+
|        timestamp|input_group_size|    state|involved_partitions|
+-----------------+----------------+---------+-------------------+
|20230704125731917|               1|COMPLETED|                  *|
+-----------------+----------------+---------+-------------------+
Hello, hope all is well. While playing with clustering, I tried a few ways to perform clustering using stored procedures, and I thought I would create an issue so it can be tracked.
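For reference, the stored-procedure call variants for clustering look roughly like this (a sketch only; the path and order parameters, and which options are available, depend on the Hudi version bundled with your Glue release, so treat the exact argument names as assumptions to verify against your version's docs):

# Hypothetical sketch; `spark` is an active SparkSession with the Hudi extensions enabled
# Schedule and execute clustering in one call, addressed by table name (requires a catalog entry)
spark.sql("call run_clustering(table => 'default.issue_8919')").show()

# The same, addressed by base path (no catalog registration required)
spark.sql("call run_clustering(path => 's3://soumilshah-hudi-demos/output/issue_8919')").show()

# Optionally sort the rewritten files by a column, if your Hudi version supports the order parameter
spark.sql("call run_clustering(table => 'default.issue_8919', order => 'state')").show()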