awslabs / aws-glue-libs

AWS Glue Libraries are additions and enhancements to Spark for ETL operations.
635 stars 300 forks source link

Unable to get the same data returned by Athena using PySpark GlueContext. #122

Open SARAVANA1501 opened 2 years ago

SARAVANA1501 commented 2 years ago

We are doing a POC on creating a PySpark job that reads data from S3 through a Glue table. I am using the following code, and it is not reading the data properly:

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import col, size, desc
from pyspark.sql.functions import length

## @params: [JOB_NAME]
# Resolve the job name from the command-line arguments.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Create the Spark and Glue contexts, then initialise the job with its arguments.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Load the catalog table as a DynamicFrame.
# Reported by the author: the table holds about a million records and almost
# 30 columns; the data is unstructured, so not every row has a value in every column.
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="db_name",
    table_name="table_name",
    transformation_ctx="datasource0",
)

# Only the 'categories' column is of interest; per the author, just 30 of the
# million rows have a non-null value in it.
column_mappings = [
    ("categories", "string", "categories", "string"),
    ("partition_0", "string", "partition_0", "string"),
]
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=column_mappings,
    transformation_ctx="applymapping1",
)

# Print (row count, column count) of the unmapped source frame.
# Reported by the author: both numbers match the Athena results, where
#   select count(*) from table_name                                  returns 1,000,000
#   select count(*) from table_name where length("categories") > 5   returns 30
# NOTE(review): df is built from datasource0, so applymapping1 is not used by
# anything below - confirm which frame was intended.
df = datasource0.toDF()
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))

# Expected: 3 rows with valid categories. Observed by the author: null in every row.
# Reported by the author: without the ApplyMapping.apply call above, the
# statement below throws a "column not found" exception.
categories = col("categories")
long_categories = df.where(length(categories) > 5)
long_categories.select(categories).show(3, vertical=True)