File ~/opt/anaconda3/envs/p39/lib/python3.9/site-packages/ydata_profiling/profile_report.py:560, in ProfileReport.compare(self, other, config)
544 """Compare this report with another ProfileReport
545 Alias for:
546 ```
(...)
556 Comparison ProfileReport
557 """
558 from ydata_profiling.compare_reports import compare
--> 560 return compare([self, other], config if config is not None else self.config)
File ~/opt/anaconda3/envs/p39/lib/python3.9/site-packages/ydata_profiling/compare_reports.py:302, in compare(reports, config, compute)
300 for report in reports[1:]:
301 cols_2_compare = [col for col in base_features if col in report.df.columns] # type: ignore
--> 302 report.df = report.df.loc[:, cols_2_compare] # type: ignore
303 reports = [r for r in reports if not r.df.empty] # type: ignore
304 if len(reports) == 1:
File ~/opt/anaconda3/envs/p39/lib/python3.9/site-packages/pyspark/sql/dataframe.py:1988, in DataFrame.__getattr__(self, name)
1978 """Returns the :class:`Column` denoted by ``name``.
1979
1980 .. versionadded:: 1.3.0
(...)
1985 [Row(age=2), Row(age=5)]
1986 """
1987 if name not in self.columns:
-> 1988 raise AttributeError(
1989 "'%s' object has no attribute '%s'" % (self.__class__.__name__, name)
1990 )
1991 jc = self._jdf.apply(name)
1992 return Column(jc)
AttributeError: 'DataFrame' object has no attribute 'loc'
Expected Behaviour
I expect to be able to compare reports related to spark dataframes
Data Description
def generate_X(seed, n_features=7, n_samples=5000):
    """Build a synthetic feature table with scikit-learn's make_classification.

    Args:
        seed: Passed to make_classification as ``random_state``, so the same
            seed reproduces the same data.
        n_features: Number of feature columns to generate.
        n_samples: Number of rows to generate.

    Returns:
        A pandas DataFrame whose columns are named ``f_1`` .. ``f_<n_features>``.
    """
    # Only the feature matrix is used; the generated labels are discarded.
    X, _ = make_classification(
        n_samples=n_samples, n_features=n_features, random_state=seed
    )
    # Derive the labels from n_features: a fixed range(1, 8) only matches the
    # matrix width when n_features == 7 and makes pd.DataFrame raise otherwise.
    columns = [f"f_{col_nber}" for col_nber in range(1, n_features + 1)]
    return pd.DataFrame(X, columns=columns)
Code that reproduces the bug
import pandas as pd
from pyspark.sql import SparkSession
from ydata_profiling import ProfileReport
from sklearn.datasets import make_classification
def generate_X(seed, n_features=7, n_samples=5000):
    """Build a synthetic feature table with scikit-learn's make_classification.

    Args:
        seed: Passed to make_classification as ``random_state``, so the same
            seed reproduces the same data.
        n_features: Number of feature columns to generate.
        n_samples: Number of rows to generate.

    Returns:
        A pandas DataFrame whose columns are named ``f_1`` .. ``f_<n_features>``.
    """
    # Only the feature matrix is used; the generated labels are discarded.
    X, _ = make_classification(
        n_samples=n_samples, n_features=n_features, random_state=seed
    )
    # Derive the labels from n_features: a fixed range(1, 8) only matches the
    # matrix width when n_features == 7 and makes pd.DataFrame raise otherwise.
    columns = [f"f_{col_nber}" for col_nber in range(1, n_features + 1)]
    return pd.DataFrame(X, columns=columns)
# Two synthetic pandas datasets that differ only in their random seed.
train_X = generate_X(seed=1914)
test_X = generate_X(seed=1918)

# Obtain a Spark session named 'Test' and mirror both datasets as Spark DataFrames.
spark = SparkSession.builder.appName('Test').getOrCreate()
spark_train = spark.createDataFrame(train_X)
spark_test = spark.createDataFrame(test_X)


def _spark_profile(df, title):
    """Profile a Spark DataFrame with the settings shared by both reports."""
    # Fresh dicts are built on every call so the two reports never share
    # a mutable settings object.
    correlations = {
        method: {"calculate": False}
        for method in ("auto", "pearson", "spearman")
    }
    return ProfileReport(
        df,
        title=title,
        infer_dtypes=False,
        interactions=None,
        missing_diagrams=None,
        correlations=correlations,
    )


spark_train_report = _spark_profile(spark_train, 'Train Data Spark')
spark_test_report = _spark_profile(spark_test, 'Test Data Spark')

# This is the failing call: per the traceback it raises
# AttributeError: 'DataFrame' object has no attribute 'loc'.
spark_comparison_report = spark_train_report.compare(spark_test_report)
pandas-profiling version
4.6.4
Dependencies
pyspark==3.4.0
pandas==2.0.2
scikit-learn==1.2.2
Also failed with
pyspark==3.5.0
OS
Mac OS
Checklist
[X] There is not yet another bug report for this issue in the issue tracker
[X] The problem is reproducible from this bug report. This guide can help to craft a minimal bug report.
[X] The issue has not been resolved by the entries listed under Common Issues.
Current Behaviour
When running ydata-profiling, I can generate a report for a Spark DataFrame, but the compare method raises an error.
AttributeError Traceback (most recent call last) Cell In[56], line 1 ----> 1 spark_comparison_report = spark_train_report.compare(spark_test_report)
File ~/opt/anaconda3/envs/p39/lib/python3.9/site-packages/ydata_profiling/profile_report.py:560, in ProfileReport.compare(self, other, config) 544 """Compare this report with another ProfileReport 545 Alias for: 546 ``` (...) 556 Comparison ProfileReport 557 """ 558 from ydata_profiling.compare_reports import compare --> 560 return compare([self, other], config if config is not None else self.config)
File ~/opt/anaconda3/envs/p39/lib/python3.9/site-packages/ydata_profiling/compare_reports.py:302, in compare(reports, config, compute) 300 for report in reports[1:]: 301 cols_2_compare = [col for col in base_features if col in report.df.columns] # type: ignore --> 302 report.df = report.df.loc[:, cols_2_compare] # type: ignore 303 reports = [r for r in reports if not r.df.empty] # type: ignore 304 if len(reports) == 1:
File ~/opt/anaconda3/envs/p39/lib/python3.9/site-packages/pyspark/sql/dataframe.py:1988, in DataFrame.__getattr__(self, name) 1978 """Returns the :class:`Column` denoted by ``name``. 1979 1980 .. versionadded:: 1.3.0 (...) 1985 [Row(age=2), Row(age=5)] 1986 """ 1987 if name not in self.columns: -> 1988 raise AttributeError( 1989 "'%s' object has no attribute '%s'" % (self.__class__.__name__, name) 1990 ) 1991 jc = self._jdf.apply(name) 1992 return Column(jc)
AttributeError: 'DataFrame' object has no attribute 'loc'
Expected Behaviour
I expect to be able to compare reports related to spark dataframes
Data Description
Code that reproduces the bug
pandas-profiling version
4.6.4
Dependencies
OS
Mac OS
Checklist