business-science / pytimetk

Time series easier, faster, more fun. Pytimetk.
https://business-science.github.io/pytimetk/
MIT License
683 stars 58 forks source link

Correlation Funnel #277

Closed mdancho84 closed 9 months ago

mdancho84 commented 10 months ago

Starter code from Jared and Alex


import pandas as pd
import numpy as np

# Seed NumPy's global random generator so the fake data below is reproducible
np.random.seed(0)

# Number of rows in the example DataFrame
num_rows = 200

# Build fake customer data column by column. The random draws happen in the
# order the keys are listed, so reordering keys changes the generated values.
data = {
    'Age': np.random.randint(18, 65, size=num_rows),
    'Gender': np.random.choice(['Male', 'Female'], size=num_rows),
    'Marital_Status': np.random.choice(['Single', 'Married', 'Divorced'], size=num_rows),
    'City': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami'], size=num_rows),
    'Years_Playing': np.random.randint(0, 30, size=num_rows),
    'Average_Income': np.random.randint(20000, 100000, size=num_rows),
    'Member_Status': np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], size=num_rows),
    'Number_Children': np.random.randint(0, 5, size=num_rows),
    'Own_House_Flag': np.random.choice([True, False], size=num_rows),
    'Own_Car_Count': np.random.randint(0, 3, size=num_rows),
    'PersonId': range(1, num_rows + 1),  # Sequential identifier: 1..num_rows
    'Client': np.random.choice(['A', 'B'], size=num_rows)  # Random client label, 'A' or 'B'
}

# Assemble the columns into a DataFrame (one row per fake person)
df = pd.DataFrame(data)

def correlate(data, target, method='pearson'):
    """Correlate every column of ``data`` with the ``target`` column.

    Args:
        data: A ``pd.DataFrame`` whose columns are correlated with ``target``.
        target: Name of the column in ``data`` to correlate against.
        method: One of ``'pearson'``, ``'kendall'`` or ``'spearman'``.

    Returns:
        A DataFrame with the columns ``feature`` and ``correlation``, ordered
        by absolute correlation from largest to smallest.

    Raises:
        ValueError: If ``data`` is not a DataFrame, ``target`` is not one of
            its columns, or ``method`` is not a supported name.
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError("Error correlate(): Object is not of class `pd.DataFrame`.")
    if target not in data.columns:
        raise ValueError(f"Error in correlate(): '{target}' not found in the DataFrame columns.")
    supported_methods = ['pearson', 'kendall', 'spearman']
    if method not in supported_methods:
        raise ValueError("Invalid correlation method. Choose from 'pearson', 'kendall', or 'spearman'.")

    # One correlation per column, turned into a two-column (feature, correlation) table
    scores = data.corrwith(data[target], method=method)
    table = scores.reset_index().set_axis(['feature', 'correlation'], axis=1)

    # Strongest relationships (positive or negative) come first
    return table.sort_values(by='correlation', key=abs, ascending=False)

def binarize(data, n_bins=4, thresh_infreq=0.01, name_infreq="-OTHER", one_hot=True):
    """Validate and prepare a DataFrame, then hand it to ``create_recipe``.

    The caller's DataFrame is left untouched; all conversions are applied to
    a copy.

    Args:
        data: Input ``pd.DataFrame``.
        n_bins: Passed to ``create_recipe``; numeric columns with at most
            ``n_bins + 3`` unique values are converted to categorical first.
        thresh_infreq: Passed through to ``create_recipe``.
        name_infreq: Passed through to ``create_recipe``.
        one_hot: Passed through to ``create_recipe``.

    Returns:
        The DataFrame returned by ``create_recipe``.

    Raises:
        ValueError: If ``data`` is not a DataFrame, or if ``check_data_type``
            or ``check_missing`` reject it.
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError("Error binarize(): Object is not of class `pd.DataFrame`.")

    # CHECKS ----
    # Run the checks on the untouched input: the object conversion below would
    # otherwise turn datetime/timedelta columns into `object` and hide them.
    # check_data_type compares against str(dtype), so the unit-qualified
    # spellings are listed explicitly next to the bare names.
    classes_not_allowed = [
        'datetime64', 'datetime64[ns]',
        'timedelta64', 'timedelta64[ns]',
        'complex64', 'complex128',
    ]
    check_data_type(data, classes_not_allowed, "binarize")
    check_missing(data, "binarize")

    # FIXES ----
    # Copy before any assignment so the caller's frame is never mutated.
    data = data.copy()

    # Columns that are neither numeric nor boolean are treated as plain objects
    non_numeric_columns = data.select_dtypes(exclude=['number', 'bool']).columns.tolist()
    if non_numeric_columns:
        data[non_numeric_columns] = data[non_numeric_columns].astype('object')

    # Booleans become 0/1 integers
    data = logical_to_integer(data)

    # NON-BINARY DATA ----
    if len(data.select_dtypes(include=['number']).columns) > 0:
        data = fix_low_cardinality_numeric(data, thresh=n_bins + 3)

        # Check & fix skewed data
        data = fix_high_skew_numeric_data(data, unique_limit=2)

    # TRANSFORMATION STEPS ----
    # Always build the recipe; previously a frame without numeric columns fell
    # through to the return statement with no result assigned.
    return create_recipe(data, n_bins, thresh_infreq, name_infreq, one_hot)

def check_data_type(data, classes_not_allowed, fun_name=None):
    """Raise if any column of ``data`` has a disallowed dtype.

    A column is rejected when ``str(dtype)`` equals an entry of
    ``classes_not_allowed`` or is that entry followed by a bracketed
    qualifier, so ``'datetime64'`` also rejects ``'datetime64[ns]'``.

    Args:
        data: DataFrame whose column dtypes are inspected.
        classes_not_allowed: Iterable of dtype names to reject.
        fun_name: Name of the calling function, used in the error message.

    Raises:
        ValueError: Listing every offending column.
    """
    blocked_names = list(classes_not_allowed)

    def _is_blocked(dtype):
        # Exact name, or the bare name with a unit/timezone qualifier
        dtype_name = str(dtype)
        return any(
            dtype_name == blocked or dtype_name.startswith(f"{blocked}[")
            for blocked in blocked_names
        )

    # Column labels are stringified so non-string labels can be joined below
    invalid_cols = [str(col) for col, dtype in data.dtypes.items() if _is_blocked(dtype)]
    if invalid_cols:
        msg = f"Error {fun_name}(): The following columns have invalid data types: {', '.join(invalid_cols)}"
        raise ValueError(msg)

def check_missing(data, fun_name=None):
    """Raise if any column of ``data`` contains missing values.

    Args:
        data: DataFrame to inspect.
        fun_name: Name of the calling function, used in the error message.

    Raises:
        ValueError: Listing every column that has at least one null value.
    """
    missing_cols = data.columns[data.isnull().any()]
    if not missing_cols.empty:
        # Labels may be non-strings (e.g. integers), so stringify before joining
        listed = ', '.join(map(str, missing_cols))
        msg = f"Error {fun_name}(): The following columns contain missing values: {listed}"
        raise ValueError(msg)

def fix_low_cardinality_numeric(data, thresh):
    """Turn numeric columns with few distinct values into categoricals.

    A numeric column is converted to ``category`` dtype when its number of
    unique values is at most ``thresh``. ``data`` is modified in place and
    also returned.
    """
    numeric_part = data.select_dtypes(include=['number'])
    # Distinct values per numeric column (a missing value counts as one value)
    distinct_counts = numeric_part.nunique(dropna=False)
    for name in distinct_counts.index[distinct_counts <= thresh]:
        data[name] = data[name].astype('category')
    return data

def fix_high_skew_numeric_data(data, unique_limit):
    """Turn numeric columns with too few distinct quantiles into categoricals.

    For each numeric column the 0/20/40/60/80/100 % quantiles are computed;
    when they contain at most ``unique_limit`` distinct values the column is
    converted to ``category`` dtype. ``data`` is modified in place and also
    returned.
    """
    probabilities = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
    for name in data.select_dtypes(include=['number']).columns:
        distinct_quantiles = np.unique(np.quantile(data[name], probabilities))
        if distinct_quantiles.size > unique_limit:
            continue  # enough spread: leave the column numeric
        data[name] = data[name].astype('category')
    return data

def create_recipe(data, n_bins, thresh_infreq, name_infreq, one_hot):
    """Bin numeric columns and dummy-encode all categorical columns.

    Args:
        data: Input DataFrame. It is not modified.
        n_bins: ``bins`` argument for ``pd.cut`` applied to each numeric column.
        thresh_infreq: Categorical levels whose relative frequency is below
            this value are replaced by ``name_infreq``. A threshold of 0
            replaces nothing, since no frequency is below 0.
        name_infreq: Replacement label for infrequent levels.
        one_hot: When true, every level/bin gets its own column. When false,
            the first level/bin of each feature is dropped (dummy encoding).

    Returns:
        A DataFrame holding, in order: untouched columns that are neither
        numeric nor object/category, one indicator column per bin named
        ``<col>__<lower>_<upper>`` (edges rounded to one decimal), and the
        indicator columns of the object/category features named
        ``<col>__<level>``.
    """
    numeric_cols = data.select_dtypes(include=['number']).columns
    nominal_cols = data.select_dtypes(include=['object', 'category']).columns
    drop_first = not one_hot

    # New frame without the numeric columns; the caller's frame stays untouched
    remainder = data.drop(columns=numeric_cols)

    # Convert continuous features to binned indicator columns
    binned_frames = []
    for col in numeric_cols:
        codes, edges = pd.cut(data[col], bins=n_bins, retbins=True, labels=False, right=False)
        edges = edges.tolist()
        names = [f"{col}__{round(lo, 1)}_{round(hi, 1)}" for lo, hi in zip(edges[:-1], edges[1:])]
        # Declare every bin as a category so empty bins still get a column and
        # the column count always matches the number of names.
        levels = pd.Categorical(codes, categories=list(range(len(names))))
        indicators = pd.get_dummies(pd.Series(levels, index=data.index), drop_first=drop_first)
        indicators.columns = names[1:] if drop_first else names
        binned_frames.append(indicators)

    # Reduce cardinality of infrequent categorical levels
    for col in nominal_cols:
        frequencies = remainder[col].value_counts(normalize=True)
        infrequent_values = frequencies[frequencies < thresh_infreq].index
        if len(infrequent_values) > 0:
            # Plain assignment instead of chained in-place replace, which is
            # not guaranteed to write back into the frame.
            as_object = remainder[col].astype(object)
            remainder[col] = as_object.where(~as_object.isin(infrequent_values), name_infreq)

    combined = pd.concat([remainder, *binned_frames], axis=1)
    if len(nominal_cols) == 0:
        # Nothing left to encode: the binned columns are the result
        return combined

    # Convert categorical features to binary features
    return pd.get_dummies(combined, prefix_sep='__', drop_first=drop_first)

def logical_to_integer(data):
    """Cast every boolean column of ``data`` to integers (True -> 1, False -> 0).

    ``data`` is modified in place and also returned.
    """
    for name in data.select_dtypes(include=['bool']).columns:
        data[name] = data[name].astype(int)
    return data

# Binarize the example DataFrame, then correlate the result with one target column.
# Adjust n_bins, thresh_infreq, name_infreq, and one_hot to suit your data.
# The target is the dummy column produced for Member_Status == 'Platinum'.
target_column = 'Member_Status__Platinum'
processed_data = binarize(df, n_bins=5, thresh_infreq=0.01, name_infreq="-OTHER", one_hot=True)
correlation_results = correlate(data=processed_data, target=target_column, method='pearson')
mdancho84 commented 10 months ago

Plot correlation funnel


import pandas as pd
import numpy as np
from plotnine import ggplot, aes, geom_vline, geom_point, geom_text, labs, theme_minimal, theme, element_text

def plot_correlation_funnel(data, interactive=False, limits=(-1, 1), alpha=1):
    """Build a plotnine scatter of correlation (x) against feature (y).

    Args:
        data: DataFrame with the columns ``feature`` and ``correlation``.
            It is not modified.
        interactive: When true, a ``label_text`` column is computed and mapped
            to the ``text`` aesthetic and no text layer is drawn. When false,
            each point is labelled with its feature name.
        limits: Lower and upper bound of the x axis.
            NOTE(review): plotnine's default scale handling drops points
            outside the limits - confirm this is acceptable.
        alpha: Point opacity.

    Returns:
        The plotnine ``ggplot`` object.

    Raises:
        ValueError: If ``data`` is not a DataFrame.
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError("plot_correlation_funnel(): Object is not of class `pd.DataFrame`.")

    # Imported here because the file-level plotnine import does not include it
    from plotnine import scale_x_continuous

    if interactive:
        # Copy first so the helper column never leaks into the caller's frame
        data = data.copy()
        data['label_text'] = data.apply(lambda row: f"{row['feature']}\nCorrelation: {row['correlation']:.3f}", axis=1)
        mapping = aes(x='correlation', y='feature', text='label_text')
    else:
        mapping = aes(x='correlation', y='feature', label='feature')

    # Layers shared by both modes
    p = (
        ggplot(data)
        + mapping
        + geom_vline(xintercept=0, linetype='dashed', color='red')
        + geom_point(color='#2c3e50', alpha=alpha)
    )

    if not interactive:
        p = p + geom_text(size=12, color='#2c3e50')

    p = (
        p
        + scale_x_continuous(limits=limits)  # honour the `limits` argument
        + labs(title='Correlation Funnel')
        + theme_minimal()
        + theme(axis_text_x=element_text(size=12))
    )

    return p

# Example usage
#data = pd.read_csv('your_data.csv')  # Replace 'your_data.csv' with your dataset file
#interactive_plot = plot_correlation_funnel(data, interactive=True)
#print(interactive_plot)

# For a non-interactive plot
# plot_correlation_funnel(data, interactive=False).draw()
mdancho84 commented 10 months ago

Plotly


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

def plot_correlation_funnel(data, interactive=False, limits=(-1, 1), alpha=1):
    """Plot correlation (x) against feature (y) with plotly or matplotlib.

    Args:
        data: DataFrame with the columns ``feature`` and ``correlation``.
            It is not modified.
        interactive: When true, build and return a plotly figure. When false,
            draw a matplotlib figure and display it with ``plt.show()``.
        limits: Lower and upper bound of the x axis.
        alpha: Point opacity.

    Returns:
        The plotly figure when ``interactive`` is true; otherwise the result
        of ``plt.show()``.

    Raises:
        ValueError: If ``data`` is not a DataFrame.
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError("plot_correlation_funnel(): Object is not of class `pd.DataFrame`.")

    if interactive:
        # Copy first so the helper column never leaks into the caller's frame
        data = data.copy()
        data['label_text'] = data.apply(lambda row: f"{row['feature']}\nCorrelation: {row['correlation']:.3f}", axis=1)

        fig = px.scatter(
            data,
            x='correlation',
            y='feature',
            text='label_text',
            range_x=list(limits),
            title='Correlation Funnel'
        )

        # No selector: with `text` set the trace mode is not plain 'markers',
        # so a mode='markers' selector would skip the trace entirely.
        fig.update_traces(marker=dict(color='#2c3e50', opacity=alpha))
        # Dashed vertical reference line at zero correlation
        fig.update_layout(shapes=[dict(type='line', x0=0, x1=0, y0=0, y1=1, yref='paper', line=dict(color='red', dash='dash'))])
        fig.update_xaxes(title_text="Correlation")
        fig.update_yaxes(title_text="Feature")
        fig.update_layout(showlegend=False)

        return fig

    fig, ax = plt.subplots()
    ax.scatter(data['correlation'], data['feature'], c='#2c3e50', alpha=alpha)

    # Label each point with its feature name; Axes.text needs the string as
    # its third positional argument.
    for _, row in data.iterrows():
        ax.text(row['correlation'], row['feature'], str(row['feature']), size=12, color='#2c3e50')

    ax.axvline(x=0, linestyle='--', color='red')
    ax.set_xlim(limits)
    ax.set_xlabel('Correlation')
    ax.set_ylabel('Feature')
    ax.set_title('Correlation Funnel')

    return plt.show()

# Example usage
#data = pd.read_csv('your_data.csv')  # Replace 'your_data.csv' with your dataset file
#interactive_plot = plot_correlation_funnel(data, interactive=True)
# To display the interactive plot, you can use interactive_plot.show()

# For a non-interactive plot
# plot_correlation_funnel(data, interactive=False)
mdancho84 commented 9 months ago

I have a basic example working.

   # NON-TIMESERIES EXAMPLE ----
   # Same fake customer data as the starter code, binarized through the
   # DataFrame method `binarize` instead of a free function.

    import pandas as pd
    import numpy as np
    import pytimetk as tk

    # Seed NumPy's global random generator so the fake data is reproducible
    np.random.seed(0)

    # Number of rows in the example DataFrame
    num_rows = 200

    # Build fake data column by column; the random draws happen in key order
    data = {
        'Age': np.random.randint(18, 65, size=num_rows),
        'Gender': np.random.choice(['Male', 'Female'], size=num_rows),
        'Marital_Status': np.random.choice(['Single', 'Married', 'Divorced'], size=num_rows),
        'City': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami'], size=num_rows),
        'Years_Playing': np.random.randint(0, 30, size=num_rows),
        'Average_Income': np.random.randint(20000, 100000, size=num_rows),
        'Member_Status': np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], size=num_rows),
        'Number_Children': np.random.randint(0, 5, size=num_rows),
        'Own_House_Flag': np.random.choice([True, False], size=num_rows),
        'Own_Car_Count': np.random.randint(0, 3, size=num_rows),
        'PersonId': range(1, num_rows + 1),  # Sequential identifier: 1..num_rows
        'Client': np.random.choice(['A', 'B'], size=num_rows)  # Random client label, 'A' or 'B'
    }

    # Assemble the columns into a DataFrame
    df = pd.DataFrame(data)

    # Binarize the data
    # NOTE(review): `binarize` is presumably attached to DataFrame by the
    # pytimetk import above - confirm against the pytimetk documentation.
    df_binarized = df.binarize(n_bins=4, thresh_infreq=0.01, name_infreq="-OTHER", one_hot=True)

    # Inspect the result
    df_binarized.glimpse()    
``` {python}
# Compute correlations against the 'Member_Status__Platinum' target column
df_correlated = df_binarized.correlate(target='Member_Status__Platinum')
# Show the first 10 rows of the result
df_correlated.head(10)
```

``` {python}
# Interactive correlation funnel (the output is shown under the Plotly heading below)
df_correlated.plot_correlation_funnel(
    interactive=True, 
    height=600
)
```

``` {python}
# Static correlation funnel (the output is shown under the Plotnine heading below)
df_correlated.plot_correlation_funnel(
    interactive=False, 
    height = 900
)


```

## Plotly
![image](https://github.com/business-science/pytimetk/assets/13734662/756f4b10-962f-4c79-862e-ccb4d56d62af)

## Plotnine
For some reason the arrows are showing up very thick with the `plotnine` `adjust_text` integration.
![image](https://github.com/business-science/pytimetk/assets/13734662/d33c8a6a-3155-4333-ba48-9588c8edfa52)