DataResponsibly / DataSynthesizer

MIT License
257 stars 85 forks source link

`bool` dtype not correctly supported #38

Open simone-mangiante opened 1 year ago

simone-mangiante commented 1 year ago

Description

DataDescriber does not handle bool dtypes well in the source dataset. When the CSV file has columns with only TRUE and FALSE as values, pandas reads such columns as bool dtype (not object), and, when inferring types, the code ends up checking them as dates and fails.

What I Did

The source dataset is the telco-customer-churn dataset from Kaggle, after being imported in Google BigQuery and exported back to CSV, generating those TRUE and FALSE values instead of Yes and No. Below is my code:

from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network
import pandas as pd

# Input dataset. This CSV file has columns containing only TRUE and FALSE,
# which pandas reads as bool dtype (not object) -- the trigger for this issue.
input_data = "./out/from_bq.csv"

mode = 'correlated_attribute_mode'

# Locations of the two output files.
description_file = f'./out/{mode}/description.json'
synthetic_data = f'./out/{mode}/synthetic_data.csv'

# An attribute is categorical if its domain size is less than this threshold.
threshold_value = 20

# List of discrete (categorical) columns and the primary key.
categorical_columns = ["gender", "Partner", "Dependents", "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod", "TotalCharges", "Churn"]
primary_key_column = "customerID"

# Specify categorical attributes (every listed column is categorical).
categorical_attributes = {column: True for column in categorical_columns}

# Specify which attributes are candidate keys of the input dataset.
candidate_keys = {primary_key_column: True}

# A parameter in Differential Privacy. It roughly means that removing a row in the input dataset will not
# change the probability of getting the same output more than a multiplicative difference of exp(epsilon).
# Increase epsilon value to reduce the injected noises. Set epsilon=0 to turn off differential privacy.
epsilon = 0

# The maximum number of parents in the Bayesian network, i.e., the maximum number of incoming edges.
degree_of_bayesian_network = 2

# Number of tuples generated in the synthetic dataset.
num_tuples_to_generate = 1000

# Build the Bayesian network.
describer = DataDescriber(category_threshold=threshold_value)
# Reuse input_data instead of repeating the path literal, so the dataset
# location is defined in exactly one place.
describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data,
                                                        epsilon=epsilon,
                                                        k=degree_of_bayesian_network,
                                                        attribute_to_is_categorical=categorical_attributes,
                                                        attribute_to_is_candidate_key=candidate_keys)

# Save the output.
describer.save_dataset_description_to_file(description_file)

Here is the output:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/tmp/ipykernel_4364/1321006366.py in <module>
     46                                                         k=degree_of_bayesian_network,
     47                                                         attribute_to_is_categorical=categorical_attributes,
---> 48                                                         attribute_to_is_candidate_key=candidate_keys)
     49 
     50 # save the output

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in describe_dataset_in_correlated_attribute_mode(self, dataset_file, k, epsilon, attribute_to_datatype, attribute_to_is_categorical, attribute_to_is_candidate_key, categorical_attribute_domain_file, numerical_attribute_ranges, seed)
    170                                                             categorical_attribute_domain_file,
    171                                                             numerical_attribute_ranges,
--> 172                                                             seed)
    173         self.df_encoded = self.encode_dataset_into_binning_indices()
    174         if self.df_encoded.shape[1] < 2:

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in describe_dataset_in_independent_attribute_mode(self, dataset_file, epsilon, attribute_to_datatype, attribute_to_is_categorical, attribute_to_is_candidate_key, categorical_attribute_domain_file, numerical_attribute_ranges, seed)
    118                                              categorical_attribute_domain_file,
    119                                              numerical_attribute_ranges,
--> 120                                              seed=seed)
    121 
    122         for column in self.attr_to_column.values():

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in describe_dataset_in_random_mode(self, dataset_file, attribute_to_datatype, attribute_to_is_categorical, attribute_to_is_candidate_key, categorical_attribute_domain_file, numerical_attribute_ranges, seed)
     85         self.attr_to_is_candidate_key = attribute_to_is_candidate_key
     86         self.read_dataset_from_csv(dataset_file)
---> 87         self.infer_attribute_data_types()
     88         self.analyze_dataset_meta()
     89         self.represent_input_dataset_by_columns()

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/DataDescriber.py in infer_attribute_data_types(self)
    213                 # Sample 20 values to test its data_type.
    214                 samples = column_dropna.sample(20, replace=True)
--> 215                 if all(samples.map(is_datetime)):
    216                     self.attr_to_datatype[attr] = DataType.DATETIME
    217                 else:

/opt/conda/lib/python3.7/site-packages/pandas/core/series.py in map(self, arg, na_action)
   3980         dtype: object
   3981         """
-> 3982         new_values = super()._map_values(arg, na_action=na_action)
   3983         return self._constructor(new_values, index=self.index).__finalize__(
   3984             self, method="map"

/opt/conda/lib/python3.7/site-packages/pandas/core/base.py in _map_values(self, mapper, na_action)
   1158 
   1159         # mapper is a function
-> 1160         new_values = map_f(values, mapper)
   1161 
   1162         return new_values

pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()

/opt/conda/lib/python3.7/site-packages/DataSynthesizer/datatypes/DateTimeAttribute.py in is_datetime(value)
     19               'dec', 'december'}
     20 
---> 21     value_lower = value.lower()
     22     if (value_lower in weekdays) or (value_lower in months):
     23         return False

AttributeError: 'bool' object has no attribute 'lower'
haoyueping commented 1 year ago

Hi, @simone-mangiante, I could not replicate this error with either DataSynthesizer 0.1.10 or 0.1.11. I tested your script with the telco-customer-churn dataset from Kaggle, and Python 3.7, on Pop!_OS 22.04.

Please check out DataSynthesizer 0.1.12 as well.