ETA444 / datasafari

DataSafari simplifies complex data science tasks into straightforward, powerful one-liners.
https://datasafari.dev
GNU General Public License v3.0
2 stars 0 forks source link

Implement error handling for transform_num() #43

Closed ETA444 closed 6 months ago

ETA444 commented 9 months ago

Implement error handling for each user input of the function.

ETA444 commented 6 months ago

TypeErrors: Checking Parameter Types

1. df Parameter

# Reject any 'df' argument that is not a pandas DataFrame.
if not isinstance(df, pd.DataFrame):
    raise TypeError(
        "transform_num(): The 'df' parameter must be a pandas DataFrame."
    )

2. numerical_variables Parameter

# The container must be a list, and every entry in it must be a string.
if not isinstance(numerical_variables, list):
    raise TypeError(
        "transform_num(): The 'numerical_variables' parameter must be a list of column names."
    )
elif any(not isinstance(name, str) for name in numerical_variables):
    raise TypeError(
        "transform_num(): All elements in the 'numerical_variables' list must be strings representing column names."
    )

3. method Parameter

# 'method' is later lower-cased and compared against names, so it must be a str.
if not isinstance(method, str):
    raise TypeError(
        "transform_num(): The 'method' parameter must be a string."
    )

4. output_distribution Parameter

# Reject a non-string 'output_distribution'.
if not isinstance(output_distribution, str):
    raise TypeError(
        "transform_num(): The 'output_distribution' parameter must be a string."
    )

5. n_quantiles Parameter

# Reject a non-integer 'n_quantiles'.
# NOTE(review): bool is a subclass of int, so True/False pass this check.
if not isinstance(n_quantiles, int):
    raise TypeError(
        "transform_num(): The 'n_quantiles' parameter must be an integer."
    )

6. random_state Parameter

# Reject a non-integer 'random_state'.
# NOTE(review): None is rejected here too — confirm that matches the function's default.
if not isinstance(random_state, int):
    raise TypeError(
        "transform_num(): The 'random_state' parameter must be an integer."
    )

7. with_centering Parameter

# Only a real bool is accepted for 'with_centering'.
if not isinstance(with_centering, bool):
    raise TypeError(
        "transform_num(): The 'with_centering' parameter must be a boolean."
    )

8. quantile_range Parameter

# Require a tuple of exactly two floats. The conditions are ordered so the
# length and element tests only run once the tuple test has passed.
if (
    not isinstance(quantile_range, tuple)
    or len(quantile_range) != 2
    or any(not isinstance(bound, float) for bound in quantile_range)
):
    raise TypeError(
        "transform_num(): The 'quantile_range' parameter must be a tuple containing two float values."
    )

9. power Parameter

# 'power' is optional; when supplied it must be an int or a float.
if not (power is None or isinstance(power, (int, float))):
    raise TypeError(
        "transform_num(): The 'power' parameter must be a float, integer, or None."
    )

10. power_map Parameter

# 'power_map' is optional; when supplied it must be a dict.
if not (power_map is None or isinstance(power_map, dict)):
    raise TypeError(
        "transform_num(): The 'power_map' parameter must be a dictionary or None."
    )

11. lower_percentile and upper_percentile Parameters

# Both percentile bounds must be floats; a single message covers either failure.
if not (isinstance(lower_percentile, float) and isinstance(upper_percentile, float)):
    raise TypeError(
        "transform_num(): The 'lower_percentile' and 'upper_percentile' parameters must be floats."
    )

12. winsorization_map Parameter

# 'winsorization_map' is optional; when supplied it must be a dict.
if not (winsorization_map is None or isinstance(winsorization_map, dict)):
    raise TypeError(
        "transform_num(): The 'winsorization_map' parameter must be a dictionary or None."
    )

13. interaction_pairs Parameter

# 'interaction_pairs' is optional; when supplied it must be a list whose
# items are all 2-tuples.
if interaction_pairs is not None and (
    not isinstance(interaction_pairs, list)
    or any(not isinstance(item, tuple) or len(item) != 2 for item in interaction_pairs)
):
    raise TypeError(
        "transform_num(): The 'interaction_pairs' parameter must be a list of tuples or None."
    )

14. degree Parameter

# 'degree' is optional; when supplied it must be an int.
if not (degree is None or isinstance(degree, int)):
    raise TypeError(
        "transform_num(): The 'degree' parameter must be an integer or None."
    )

15. degree_map Parameter

# 'degree_map' is optional; when supplied it must be a dict.
if not (degree_map is None or isinstance(degree_map, dict)):
    raise TypeError(
        "transform_num(): The 'degree_map' parameter must be a dictionary or None."
    )

16. bins Parameter

# 'bins' is optional; when supplied it must be an int.
if not (bins is None or isinstance(bins, int)):
    raise TypeError(
        "transform_num(): The 'bins' parameter must be an integer or None."
    )

17. bin_map Parameter

# 'bin_map' is optional; when supplied it must be a dict.
if not (bin_map is None or isinstance(bin_map, dict)):
    raise TypeError(
        "transform_num(): The 'bin_map' parameter must be a dictionary or None."
    )

ValueErrors: Checking Parameter Values

1. Empty DataFrame

# Nothing can be transformed when the DataFrame reports itself as empty.
if df.empty:
    raise ValueError(
        "transform_num(): The input DataFrame is empty."
    )

2. Empty numerical_variables List

# At least one column name has to be supplied.
if len(numerical_variables) == 0:
    raise ValueError(
        "transform_num(): The 'numerical_variables' list must contain at least one column name."
    )

3. Non-Numerical Variables

# Ask evaluate_dtype() about the listed columns and fail if any result is falsy.
# Presumably 'list_n' yields one truthy/falsy flag per column — confirm against evaluate_dtype().
numerical_types = evaluate_dtype(df, numerical_variables, output='list_n')
if any(not flag for flag in numerical_types):
    raise ValueError(
        "transform_num(): The 'numerical_variables' list must contain only names of numerical variables."
    )

4. Variables Not in DataFrame

# Collect every requested name that is not a column of df and report them together.
missing_vars = [
    name for name in numerical_variables
    if name not in df.columns
]
if missing_vars:
    raise ValueError(
        f"transform_num(): The following numerical variables were not found in the DataFrame: {', '.join(missing_vars)}"
    )

5. Invalid Method

# Supported method names; the comparison is case-insensitive on the user's side.
valid_methods = [
    'standardize', 'log', 'normalize', 'quantile', 'robust', 'boxcox',
    'yeojohnson', 'power', 'winsorization', 'interaction', 'polynomial', 'bin',
]
if method.lower() not in valid_methods:
    raise ValueError(
        f"transform_num(): Invalid method '{method}'. Valid options are: {', '.join(valid_methods)}"
    )

6. Invalid output_distribution, n_quantiles, or random_state for quantile Method

# These checks apply only when the 'quantile' method is selected.
if method.lower() == 'quantile':
    # Only the two named target distributions are accepted.
    if output_distribution not in ('normal', 'uniform'):
        raise ValueError(
            "transform_num(): Invalid 'output_distribution' for 'quantile' method. Choose 'normal' or 'uniform'."
        )
    # The positivity test only runs once the int test has passed.
    if not (isinstance(n_quantiles, int) and n_quantiles > 0):
        raise ValueError(
            "transform_num(): The 'n_quantiles' must be a positive integer."
        )
    if not isinstance(random_state, int):
        raise ValueError(
            "transform_num(): The 'random_state' must be an integer."
        )

7. Invalid with_centering or quantile_range for robust Method

# These checks apply only when the 'robust' method is selected.
if method.lower() == 'robust':
    if not isinstance(with_centering, bool):
        raise ValueError(
            "transform_num(): The 'with_centering' parameter must be a boolean (True or False)."
        )
    # Tuple test first, so len() and iteration only run on a tuple.
    if (
        not isinstance(quantile_range, tuple)
        or len(quantile_range) != 2
        or any(not isinstance(bound, float) for bound in quantile_range)
    ):
        raise ValueError(
            "transform_num(): The 'quantile_range' must be a tuple of two float values."
        )

8. Invalid power or power_map for power Method

# These checks apply only when the 'power' method is selected.
if method.lower() == 'power':
    # 'power' is optional; when supplied it must be an int or a float.
    if power is not None and not isinstance(power, (float, int)):
        # The message is a single literal: a quoted string split across lines is a SyntaxError.
        raise ValueError("transform_num(): The 'power' parameter must be a float, integer, or None.")
    # 'power_map' is optional; when supplied it must be a dict.
    if power_map is not None and not isinstance(power_map, dict):
        raise ValueError("transform_num(): The 'power_map' must be a dictionary mapping variables to powers or None.")

9. Invalid Percentiles for winsorization Method

# These checks apply only when the 'winsorization' method is selected.
if method.lower() == 'winsorization':
    # Lower bound: a float in the half-open interval [0, 1).
    if not isinstance(lower_percentile, float) or not (0 <= lower_percentile < 1):
        raise ValueError(
            "transform_num(): The 'lower_percentile' must be a float between 0 and 1."
        )
    # Upper bound: a float in the half-open interval (0, 1].
    if not isinstance(upper_percentile, float) or not (0 < upper_percentile <= 1):
        raise ValueError(
            "transform_num(): The 'upper_percentile' must be a float between 0 and 1."
        )
    # The bounds must be strictly ordered.
    if upper_percentile <= lower_percentile:
        raise ValueError(
            "transform_num(): The 'lower_percentile' must be less than 'upper_percentile'."
        )

10. Invalid degree or degree_map for polynomial Method

# These checks apply only when the 'polynomial' method is selected.
if method.lower() == 'polynomial':
    # 'degree' is optional; when supplied it must be an int greater than zero.
    if degree is not None and (not isinstance(degree, int) or degree <= 0):
        raise ValueError(
            "transform_num(): The 'degree' must be a positive integer or None."
        )
    # 'degree_map' is optional; when supplied it must be a dict.
    if not (degree_map is None or isinstance(degree_map, dict)):
        raise ValueError(
            "transform_num(): The 'degree_map' must be a dictionary mapping variables to degrees or None."
        )

11. Invalid bins or bin_map for bin Method

# These checks apply only when the 'bin' method is selected.
if method.lower() == 'bin':
    # 'bins' is optional; when supplied it must be an int greater than zero.
    if bins is not None and (not isinstance(bins, int) or bins <= 0):
        raise ValueError(
            "transform_num(): The 'bins' must be a positive integer or None."
        )
    # 'bin_map' is optional; when supplied it must be a dict.
    if not (bin_map is None or isinstance(bin_map, dict)):
        raise ValueError(
            "transform_num(): The 'bin_map' must be a dictionary specifying binning criteria or None."
        )

12. Invalid interaction_pairs

# These checks apply only when the 'interaction' method is selected and pairs were given.
if method.lower() == 'interaction' and interaction_pairs is not None:
    # Shape check: a list whose items are all 2-tuples.
    if not isinstance(interaction_pairs, list) or any(
        not isinstance(item, tuple) or len(item) != 2 for item in interaction_pairs
    ):
        raise ValueError(
            "transform_num(): The 'interaction_pairs' must be a list of tuples specifying pairs of variables or None."
        )
    # A pair is reported when either of its members is not a column of df.
    missing_pairs = [
        item for item in interaction_pairs
        if any(member not in df.columns for member in item)
    ]
    if missing_pairs:
        raise ValueError(
            f"transform_num(): The following variable pairs in 'interaction_pairs' were not found in the DataFrame: {missing_pairs}"
        )

13. Handling NaN or Infinite Values

# For these three methods, refuse input whose selected columns hold NaN or infinite values.
if method.lower() in ('log', 'boxcox', 'yeojohnson'):
    if (
        df[numerical_variables].isnull().values.any()
        or np.isinf(df[numerical_variables].values).any()
    ):
        raise ValueError(
            f"transform_num(): The 'numerical_variables' contain NaN or infinite values, which are not compatible with the '{method}' method."
        )

14. Mapping Dictionary Keys

# Every key of each supplied mapping must be a column of df.
# No outer "any mapping given?" test is needed: the per-mapping truthiness
# check below already skips None and empty dictionaries.
for mapping, map_name in zip(
    [power_map, winsorization_map, degree_map, bin_map],
    ['power_map', 'winsorization_map', 'degree_map', 'bin_map'],
):
    if mapping:
        invalid_keys = [key for key in mapping if key not in df.columns]
        if invalid_keys:
            # Convert keys with str() before joining: str.join raises TypeError on
            # non-string items (e.g. an int key), which would mask this ValueError.
            raise ValueError(f"transform_num(): The following keys in '{map_name}' were not found in the DataFrame columns: {', '.join(map(str, invalid_keys))}")

Full Code Implementation for transform_num()