Closed ETA444 closed 6 months ago
`df` Parameter
if not isinstance(df, pd.DataFrame):
raise TypeError("transform_num(): The 'df' parameter must be a pandas DataFrame.")
`numerical_variables` Parameter
`numerical_variables` is a list of column names.
if not isinstance(numerical_variables, list):
raise TypeError("transform_num(): The 'numerical_variables' parameter must be a list of column names.")
else:
if not all(isinstance(var, str) for var in numerical_variables):
raise TypeError("transform_num(): All elements in the 'numerical_variables' list must be strings representing column names.")
`method` Parameter
`method` is a string.
if not isinstance(method, str):
raise TypeError("transform_num(): The 'method' parameter must be a string.")
`output_distribution` Parameter
`output_distribution` is a string.
if not isinstance(output_distribution, str):
raise TypeError("transform_num(): The 'output_distribution' parameter must be a string.")
`n_quantiles` Parameter
`n_quantiles` is an integer.
if not isinstance(n_quantiles, int):
raise TypeError("transform_num(): The 'n_quantiles' parameter must be an integer.")
`random_state` Parameter
`random_state` is an integer.
if not isinstance(random_state, int):
raise TypeError("transform_num(): The 'random_state' parameter must be an integer.")
`with_centering` Parameter
`with_centering` is a boolean.
if not isinstance(with_centering, bool):
raise TypeError("transform_num(): The 'with_centering' parameter must be a boolean.")
`quantile_range` Parameter
`quantile_range` is a tuple containing two float values.
if not (isinstance(quantile_range, tuple) and len(quantile_range) == 2 and all(isinstance(num, float) for num in quantile_range)):
raise TypeError("transform_num(): The 'quantile_range' parameter must be a tuple containing two float values.")
`power` Parameter
`power` is `None`, an integer, or a float.
if power is not None and not isinstance(power, (int, float)):
raise TypeError("transform_num(): The 'power' parameter must be a float, integer, or None.")
`power_map` Parameter
`power_map` is `None` or a dictionary.
if power_map is not None and not isinstance(power_map, dict):
raise TypeError("transform_num(): The 'power_map' parameter must be a dictionary or None.")
`lower_percentile` and `upper_percentile` Parameters
`lower_percentile` and `upper_percentile` are floats.
if not isinstance(lower_percentile, float) or not isinstance(upper_percentile, float):
raise TypeError("transform_num(): The 'lower_percentile' and 'upper_percentile' parameters must be floats.")
`winsorization_map` Parameter
`winsorization_map` is `None` or a dictionary.
if winsorization_map is not None and not isinstance(winsorization_map, dict):
raise TypeError("transform_num(): The 'winsorization_map' parameter must be a dictionary or None.")
`interaction_pairs` Parameter
`interaction_pairs` is `None` or a list of tuples.
if interaction_pairs is not None:
if not (isinstance(interaction_pairs, list) and all(isinstance(pair, tuple) and len(pair) == 2 for pair in interaction_pairs)):
raise TypeError("transform_num(): The 'interaction_pairs' parameter must be a list of tuples or None.")
`degree` Parameter
`degree` is `None` or an integer.
if degree is not None and not isinstance(degree, int):
raise TypeError("transform_num(): The 'degree' parameter must be an integer or None.")
`degree_map` Parameter
`degree_map` is `None` or a dictionary.
if degree_map is not None and not isinstance(degree_map, dict):
raise TypeError("transform_num(): The 'degree_map' parameter must be a dictionary or None.")
`bins` Parameter
`bins` is `None` or an integer.
if bins is not None and not isinstance(bins, int):
raise TypeError("transform_num(): The 'bins' parameter must be an integer or None.")
`bin_map` Parameter
`bin_map` is `None` or a dictionary.
if bin_map is not None and not isinstance(bin_map, dict):
raise TypeError("transform_num(): The 'bin_map' parameter must be a dictionary or None.")
if df.empty:
raise ValueError("transform_num(): The input DataFrame is empty.")
`numerical_variables` List
`numerical_variables` contains at least one column name.
if len(numerical_variables) == 0:
raise ValueError("transform_num(): The 'numerical_variables' list must contain at least one column name.")
numerical_types = evaluate_dtype(df, numerical_variables, output='list_n')
if not all(numerical_types):
raise ValueError(f"transform_num(): The 'numerical_variables' list must contain only names of numerical variables.")
missing_vars = [var for var in numerical_variables if var not in df.columns]
if missing_vars:
raise ValueError(f"transform_num(): The following numerical variables were not found in the DataFrame: {', '.join(missing_vars)}")
valid_methods = ['standardize', 'log', 'normalize', 'quantile', 'robust', 'boxcox', 'yeojohnson', 'power', 'winsorization', 'interaction', 'polynomial', 'bin']
if method.lower() not in valid_methods:
raise ValueError(f"transform_num(): Invalid method '{method}'. Valid options are: {', '.join(valid_methods)}")
`output_distribution` for `quantile` Method
`output_distribution` is either 'normal' or 'uniform' for the `quantile` method.
if method.lower() == 'quantile':
if output_distribution not in ['normal', 'uniform']:
raise ValueError("transform_num(): Invalid 'output_distribution' for 'quantile' method. Choose 'normal' or 'uniform'.")
if not isinstance(n_quantiles, int) or n_quantiles <= 0:
raise ValueError("transform_num(): The 'n_quantiles' must be a positive integer.")
if not isinstance(random_state, int):
raise ValueError("transform_num(): The 'random_state' must be an integer.")
`quantile_range` for `robust` Method
`quantile_range` is a tuple of two float values for the `robust` method.
if method.lower() == 'robust':
if not isinstance(with_centering, bool):
raise ValueError("transform_num(): The 'with_centering' parameter must be a boolean (True or False).")
if not (isinstance(quantile_range, tuple) and len(quantile_range) == 2 and all(isinstance(num, float) for num in quantile_range)):
raise ValueError("transform_num(): The 'quantile_range' must be a tuple of two float values.")
`power` or `power_map` for `power` Method
`power` is a float, integer, or `None` for the `power` method, and `power_map` is a dictionary or `None`.
if method.lower() == 'power':
if power is not None and not isinstance(power, (float, int)):
raise ValueError("transform_num(): The 'power' parameter must be a float, integer, or None.")
if power_map is not None and not isinstance(power_map, dict):
raise ValueError("transform_num(): The 'power_map' must be a dictionary mapping variables to powers or None.")
`winsorization` Method
`lower_percentile` and `upper_percentile` are valid for the `winsorization` method.
if method.lower() == 'winsorization':
if not (isinstance(lower_percentile, float) and 0 <= lower_percentile < 1):
raise ValueError("transform_num(): The 'lower_percentile' must be a float between 0 and 1.")
if not (isinstance(upper_percentile, float) and 0 < upper_percentile <= 1):
raise ValueError("transform_num(): The 'upper_percentile' must be a float between 0 and 1.")
if lower_percentile >= upper_percentile:
raise ValueError("transform_num(): The 'lower_percentile' must be less than 'upper_percentile'.")
`degree` or `degree_map` for `polynomial` Method
`degree` is a positive integer or `None` for the `polynomial` method, and `degree_map` is a dictionary or `None`.
if method.lower() == 'polynomial':
if degree is not None and not (isinstance(degree, int) and degree > 0):
raise ValueError("transform_num(): The 'degree' must be a positive integer or None.")
if degree_map is not None and not isinstance(degree_map, dict):
raise ValueError("transform_num(): The 'degree_map' must be a dictionary mapping variables to degrees or None.")
`bins` or `bin_map` for `bin` Method
`bins` is a positive integer or `None` for the `bin` method, and `bin_map` is a dictionary or `None`.
if method.lower() == 'bin':
if bins is not None and not (isinstance(bins, int) and bins > 0):
raise ValueError("transform_num(): The 'bins' must be a positive integer or None.")
if bin_map is not None and not isinstance(bin_map, dict):
raise ValueError("transform_num(): The 'bin_map' must be a dictionary specifying binning criteria or None.")
`interaction_pairs`
`interaction_pairs` is a list of tuples specifying pairs of variables or `None`.
if method.lower() == 'interaction':
if interaction_pairs is not None:
if not (isinstance(interaction_pairs, list) and all(isinstance(pair, tuple) and len(pair) == 2 for pair in interaction_pairs)):
raise ValueError("transform_num(): The 'interaction_pairs' must be a list of tuples specifying pairs of variables or None.")
missing_pairs = [pair for pair in interaction_pairs if pair[0] not in df.columns or pair[1] not in df.columns]
if missing_pairs:
raise ValueError(f"transform_num(): The following variable pairs in 'interaction_pairs' were not found in the DataFrame: {missing_pairs}")
if method.lower() in ['log', 'boxcox', 'yeojohnson']:
if df[numerical_variables].isnull().values.any() or np.isinf(df[numerical_variables].values).any():
raise ValueError(f"transform_num(): The 'numerical_variables' contain NaN or infinite values, which are not compatible with the '{method}' method.")
if power_map or winsorization_map or degree_map or bin_map:
for mapping, map_name in zip([power_map, winsorization_map, degree_map, bin_map], ['power_map', 'winsorization_map', 'degree_map', 'bin_map']):
if mapping:
invalid_keys = [key for key in mapping.keys() if key not in df.columns]
if invalid_keys:
raise ValueError(f"transform_num(): The following keys in '{map_name}' were not found in the DataFrame columns: {', '.join(invalid_keys)}")
Implement error handling for each user input of the function.