Axel-Bravo / 18_project_auto-ml

Automatic: Machine Learning project
GNU General Public License v3.0
2 stars 0 forks source link

Add auto_eda #11

Open Axel-Bravo opened 6 years ago

Axel-Bravo commented 6 years ago

Add auto_eda class

This class will group all EDA techniques in one place, making dataset preparation more efficient.

include:

``import pandas as pd

class EDA:
    """Collection of exploratory-data-analysis (EDA) helpers for pandas.

    The class groups several small utilities that are handy at the start of
    a machine learning project so that they can be imported and used from a
    single, convenient place. Every helper is a static method, so no
    instance state is required.
    """

    @staticmethod
    def _missing_row_positions(df):
        """Return the positional indices of the rows holding a missing value."""
        # ``Series.nonzero()`` was removed from pandas and ``any`` no longer
        # accepts ``axis`` positionally, so go through the NumPy array.
        row_has_null = df.isnull().any(axis=1).values.astype(bool)
        return row_has_null.nonzero()[0].tolist()

    @staticmethod
    def describe_dataframe(df=None):
        """Print descriptive statistics about a dataframe.

        The report contains the shape, column names and dtypes, information
        about missing values, the output of ``DataFrame.info`` and
        ``DataFrame.describe`` and the first five rows.

        Args:
            df (dataframe): the dataframe to be analyzed. When omitted, an
                empty dataframe is described.
        Returns:
            None

        """
        if df is None:
            df = pd.DataFrame()

        # ``display`` only exists inside IPython / Jupyter sessions; fall back
        # to ``print`` so that the helper also works in plain scripts.
        try:
            show = display  # noqa: F821
        except NameError:
            show = print

        column_has_null = df.isnull().any().values.astype(bool)
        missing_rows = EDA._missing_row_positions(df)

        print("\n\n")
        print("*" * 30)
        print("About the Data")
        print("*" * 30)

        print("Number of rows::", df.shape[0])
        print("Number of columns::", df.shape[1])
        print("\n")

        print("Column Names::", df.columns.values.tolist())
        print("\n")

        print("Column Data Types::\n", df.dtypes)
        print("\n")

        print("Columns with Missing Values::",
              df.columns[column_has_null].tolist())
        print("\n")

        print("Number of rows with Missing Values::", len(missing_rows))
        print("\n")

        print("Sample Indices with missing data::", missing_rows[0:5])
        print("\n")

        print("General Stats::")
        # ``info`` writes its report itself and returns None, so wrapping it
        # in ``print`` would only add a stray "None" line.
        df.info()
        print("\n")

        print("Summary Stats::")
        if df.shape[1]:
            print(df.describe())
        else:
            # ``describe`` raises ValueError on a dataframe without columns.
            print("(no columns to summarise)")
        print("\n")

        print("Dataframe Sample Rows::")
        show(df.head(5))

    @staticmethod
    def cleanup_column_names(df, rename_dict=None, do_inplace=True):
        """Rename the columns of a pandas dataframe.

        When ``rename_dict`` is empty or not passed, every string column name
        is lower-cased and its spaces are replaced by underscores; labels
        that are not strings are left untouched.

        Args:
            df (dataframe): the dataframe whose columns are renamed
            rename_dict (dict): keys represent old column names and values
                                point to newer ones
            do_inplace (bool): flag to update existing dataframe or return a
                               new one
        Returns:
            pandas dataframe if do_inplace is set to False, None otherwise

        """
        if not rename_dict:
            rename_dict = {
                col: col.lower().replace(' ', '_')
                for col in df.columns
                if isinstance(col, str)
            }
        return df.rename(columns=rename_dict, inplace=do_inplace)

``