When I use PyDeequ in an AWS Glue job, the job keeps running after it has printed the results instead of terminating.
code:
# Standard library.
import sys
import os

# AWS Glue / Spark runtime. The wildcard on awsglue.transforms was missing
# (bare "from ... import" is a syntax error); restored here.
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import SparkSession, Row

# PyDeequ. The wildcard on pydeequ.checks was likewise missing; the script
# relies on it for Check and CheckLevel.
import pydeequ
from pydeequ.checks import *
from pydeequ.verification import *
Team,
When I use PyDeequ in an AWS Glue job, the job keeps running after it has printed the results instead of terminating.
code:
# AWS Glue job that runs a small PyDeequ verification suite on an in-memory
# DataFrame and prints the check results.
import sys
import os

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import SparkSession, Row

import pydeequ
from pydeequ.checks import *
from pydeequ.verification import *

# Resolve the job name passed in by the Glue runtime.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Standard Glue bootstrap: Spark context -> Glue context -> Spark session.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

try:
    # Three-row sample dataset; column "c" contains one null.
    df = spark.sparkContext.parallelize([
        Row(a="foo", b=1, c=5),
        Row(a="bar", b=2, c=6),
        Row(a="baz", b=3, c=None),
    ]).toDF()

    check = Check(spark, CheckLevel.Warning, "Review Check")

    # Build the constraint chain once, then hand it to the suite.
    constraints = (
        check.hasSize(lambda x: x >= 3)
        .hasMin("b", lambda x: x == 0)
        .isComplete("c")
        .isUnique("a")
        .isContainedIn("a", ["foo", "bar", "baz"])
        .isNonNegative("b")
    )

    checkResult = (
        VerificationSuite(spark)
        .onData(df)
        .addCheck(constraints)
        .run()
    )

    checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
    checkResult_df.show()

    # Only mark the Glue job as committed when the checks ran to completion.
    job.commit()
finally:
    # Fix for "the job keeps running": the PyDeequ documentation says to shut
    # down the py4j callback server and stop the Spark session once the
    # PyDeequ work is done, to prevent hanging processes. Done in `finally`
    # so the job also terminates when a step above raises.
    spark.sparkContext._gateway.shutdown_callback_server()
    spark.stop()
Logs: