Describe the bug
We are trying to run PyDeequ on an already-running EMR cluster using `spark-submit --deploy-mode cluster --py-files s3://path/pydeeque.zip --jars s3://path/deequ-2.0.1-spark-3.2.jar python_file.py`. However, we run into the following exception related to pandas:
"""
Traceback (most recent call last):
File "python_file.py", line 5, in <module>
from pydeequ.verification import *
File "/mnt/yarn/usercache/hadoop/appcache/application_1674190090846_0001/container_1674190090846_0001_02_000001/pydeequ.zip/pydeequ/__init__.py", line 19, in <module>
File "/mnt/yarn/usercache/hadoop/appcache/application_1674190090846_0001/container_1674190090846_0001_02_000001/pydeequ.zip/pydeequ/analyzers.py", line 9, in <module>
File "/mnt/yarn/usercache/hadoop/appcache/application_1674190090846_0001/container_1674190090846_0001_02_000001/pydeequ.zip/pydeequ/pandas_utils.py", line 8, in <module>
ModuleNotFoundError: No module named 'pandas'
"""
We also tried to zip pandas and supply it with `spark-submit --deploy-mode cluster --py-files s3://path/pydeeque.zip,s3://path/pandas.zip --jars s3://path/deequ-2.0.1-spark-3.2.jar python_file.py`, but ran into the following error:
'''
Traceback (most recent call last):
File "python_file.py", line 5, in <module>
from pydeequ.verification import *
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pydeequ/__init__.py", line 19, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pydeequ/analyzers.py", line 9, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pydeequ/pandas_utils.py", line 8, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pandas/__init__.py", line 22, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pandas/compat/__init__.py", line 15, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pandas/compat/numpy/__init__.py", line 7, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pandas/util/__init__.py", line 1, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pandas/util/_decorators.py", line 14, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pandas/_libs/__init__.py", line 13, in <module>
ModuleNotFoundError: No module named 'pandas._libs.interval'
'''
@prashant6700 I would suggest you first try to get a simpler program that imports pandas working on EMR. This seems to be an environment setup issue rather than something specific to PyDeequ.
Describe the bug
We are trying to run PyDeequ on an already-running EMR cluster using `spark-submit --deploy-mode cluster --py-files s3://path/pydeeque.zip --jars s3://path/deequ-2.0.1-spark-3.2.jar python_file.py`. However, we run into the following exception related to pandas:
"""
Traceback (most recent call last):
File "python_file.py", line 5, in <module>
from pydeequ.verification import *
File "/mnt/yarn/usercache/hadoop/appcache/application_1674190090846_0001/container_1674190090846_0001_02_000001/pydeequ.zip/pydeequ/__init__.py", line 19, in <module>
File "/mnt/yarn/usercache/hadoop/appcache/application_1674190090846_0001/container_1674190090846_0001_02_000001/pydeequ.zip/pydeequ/analyzers.py", line 9, in <module>
File "/mnt/yarn/usercache/hadoop/appcache/application_1674190090846_0001/container_1674190090846_0001_02_000001/pydeequ.zip/pydeequ/pandas_utils.py", line 8, in <module>
ModuleNotFoundError: No module named 'pandas'
"""
We also tried to zip pandas and supply it with `spark-submit --deploy-mode cluster --py-files s3://path/pydeeque.zip,s3://path/pandas.zip --jars s3://path/deequ-2.0.1-spark-3.2.jar python_file.py`, but ran into the following error:
'''
Traceback (most recent call last):
File "python_file.py", line 5, in <module>
from pydeequ.verification import *
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pydeequ/__init__.py", line 19, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pydeequ/analyzers.py", line 9, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pydeequ/pandas_utils.py", line 8, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pandas/__init__.py", line 22, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pandas/compat/__init__.py", line 15, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pandas/compat/numpy/__init__.py", line 7, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pandas/util/__init__.py", line 1, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pandas/util/_decorators.py", line 14, in <module>
File "/mnt1/yarn/usercache/hadoop/appcache/application_1674190090846_0002/container_1674190090846_0002_01_000001/pandas.zip/pandas/_libs/__init__.py", line 13, in <module>
ModuleNotFoundError: No module named 'pandas._libs.interval'
'''
To Reproduce
EMR configuration:
- Master: m5.xlarge, 1 instance
- Core: m5.xlarge, 1 instance