awslabs / aws-glue-libs

AWS Glue Libraries are additions and enhancements to Spark for ETL operations.
Other
635 stars 300 forks source link

Unable to get the same data returned by Athena using PySpark GlueContext. #122

Open SARAVANA1501 opened 2 years ago

SARAVANA1501 commented 2 years ago

We are doing a POC on creating a PySpark job that can read data from S3 using a Glue table. I am using the following code, and it is not reading the data properly.

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import col, size, desc
from pyspark.sql.functions import length

## @params: [JOB_NAME]
# Resolve the job arguments passed by the Glue runtime on the command line.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Standard Glue job bootstrap: Spark context -> Glue context -> Spark session.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# The table has a million records with almost 30 columns; since the data is
# unstructured, not every row has a value in every column.
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="db_name",
    table_name="table_name",
    transformation_ctx="datasource0",
)

# Only the 'categories' column is of interest; among the million records only
# about 30 rows have non-null values in it.
# NOTE(review): the mapping declares 'categories' as a string on the source
# side. If the catalog/inferred type is something else (e.g. array, struct or
# a choice type), the mapped values may not be what is expected -- confirm
# with datasource0.printSchema().
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=[
        ("categories", "string", "categories", "string"),
        ("partition_0", "string", "partition_0", "string"),
    ],
    transformation_ctx="applymapping1",
)

# Column count and row count match the Athena query results exactly:
#   select count(*) from table_name                                  -> 1,000,000
#   select count(*) from table_name where length("categories") > 5   -> 30
# The sanity check is done on the unmapped source frame so that the column
# count reflects the full table.
source_df = datasource0.toDF()
print((source_df.count(), len(source_df.columns)))

# Expecting 3 rows with valid categories, but receiving null in all rows.
# Without ApplyMapping.apply the line below throws a "column not found"
# exception, so the filter must run on the mapped frame; previously the
# mapped frame was built but never used.
df = applymapping1.toDF()
df.where(length(col("categories")) > 5) \
     .select(col("categories")).show(3, vertical=True)