We are doing a POC on creating a PySpark job that can read data from S3 via a Glue catalog table. I am using the following code, but it is not reading the data properly.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import col, size, desc
from pyspark.sql.functions import length
## @params: [JOB_NAME]
# Standard Glue job bootstrap: resolve the job name and initialize the
# Spark/Glue contexts and the Job wrapper (needed for bookmarks/commit).
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Source table: ~1M rows, ~30 columns of sparse/unstructured data, so most
# columns are null on most rows. Athena confirms the expected counts:
#   SELECT count(*) FROM table_name                              -> 1,000,000
#   SELECT count(*) FROM table_name WHERE length("categories")>5 ->        30
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="db_name",
    table_name="table_name",
    transformation_ctx="datasource0",
)

# Project down to the two columns of interest and pin their types.
# ApplyMapping is what makes 'categories' addressable as a string column on
# the resulting frame — without it, selecting the column raises
# "column not found" (see the original report).
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=[
        ("categories", "string", "categories", "string"),
        ("partition_0", "string", "partition_0", "string"),
    ],
    transformation_ctx="applymapping1",
)

# BUG FIX: convert the MAPPED frame, not the raw datasource0.
# The original code did `df = datasource0.toDF()`, which discards the
# ApplyMapping projection entirely — that is why 'categories' came back
# null on every row even though the mapping was defined above.
df = applymapping1.toDF()
print((df.count(), len(df.columns)))

# Expecting the ~30 rows whose 'categories' value is longer than 5 chars.
df.where(length(col("categories")) > 5) \
  .select(col("categories")) \
  .show(3, vertical=True)

# Commit so Glue job bookmarks / transformation contexts advance;
# without this, bookmark-enabled jobs re-read or skip data unpredictably.
job.commit()
We are doing a POC on creating a PySpark job that can read data from S3 via a Glue catalog table. I am using the following code, but it is not reading the data properly.