raphaelvallat / pingouin

Statistical package in Python based on Pandas
https://pingouin-stats.org/
GNU General Public License v3.0
1.64k stars 139 forks source link

Partial Correlation gives unexpected output for toy example #435

Open PascalIversen opened 2 months ago

PascalIversen commented 2 months ago

Hi, thanks for this great library! I am getting perfect correlation for the following toy example. I expected ~zero correlation as in the regression approach.

import numpy as np
import pandas as pd
import statsmodels.api as sm
import pingouin as pg
from scipy.stats import pearsonr

n = 10000
y = list(range(1, n+1))
x = y + np.random.normal(size=n)*0.1
z = y 
df = pd.DataFrame({'x': x, 'y': y, 'z': z})
print(pg.partial_corr(data=df, x='x', y='y', covar=['z']))

# Regress x on z and u and get residuals
X_with_const = sm.add_constant(np.column_stack([z]))  # Add a constant and include both z and u
model_X = sm.OLS(x, X_with_const).fit()
residuals_X = model_X.resid

# Regress y on z and u and get residuals
model_Y = sm.OLS(y, X_with_const).fit()
residuals_Y = model_Y.resid

#  Compute correlation of residuals
residual_corr, p = pearsonr(residuals_X, residuals_Y)
print(f'Partial correlation using statsmodels: {residual_corr}, {p}')

Output:

             n    r       CI95%  p-val
pearson  10000  1.0  [1.0, 1.0]    0.0
Partial correlation using statsmodels: 0.0012024407422241278, 0.9043016773480718
 pingouin.__version__
'0.5.4'
PascalIversen commented 2 months ago

Here is another case with two categorical variables.

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import statsmodels.api as sm
from pingouin import partial_corr

# Step 1: Create synthetic data
n_samples = 1000  # Number of samples
n_categorical_var1 = 10  # Number of categories for variable 1
n_categorical_var2 = 3  # Number of categories for variable 2

# Random assignment of categories to samples
categorical_var1 = np.random.randint(0, n_categorical_var1, n_samples)
categorical_var2 = np.random.randint(0, n_categorical_var2, n_samples)

# True response values (random for this case)
true_response = np.random.normal(loc=categorical_var2+categorical_var1, scale=0.1, size=n_samples) 

# Create a DataFrame
df = pd.DataFrame({'categorical_variable1': categorical_var1, 'categorical_variable2': categorical_var2})
df['True_Response'] = true_response

# Calculate the mean response for each category as predicted values
cat_var1_mean_prediction = df.groupby('categorical_variable1')['True_Response'].transform('mean')
cat_var2_mean_prediction = df.groupby('categorical_variable2')['True_Response'].transform('mean')

# Use one of the mean predictions as the predicted response
df['Predicted_Response'] = cat_var1_mean_prediction 
print(df)
# Encode categorical variables
df_encoded = pd.get_dummies(df, columns=["categorical_variable1", "categorical_variable2"], dtype=int)
df_encoded = df_encoded.drop(columns=["categorical_variable1_0", "categorical_variable2_0"])

# Step 3: Calculate partial correlation
# We regress True_Response and Predicted_Response on the covariates (categorical variables)
X_cov = sm.add_constant(df_encoded.drop(['True_Response', 'Predicted_Response'], axis=1))

# Regress true response on covariates
model_true = sm.OLS(df_encoded['True_Response'], X_cov).fit()
residuals_true = model_true.resid

# Regress predicted response (constant) on covariates
model_pred = sm.OLS(df_encoded['Predicted_Response'], X_cov).fit()
residuals_pred = model_pred.resid

# Compute partial correlation as the correlation of the residuals
partial_corr_residuals, _ = pearsonr(residuals_true, residuals_pred)

print(f'Partial Correlation between True Response and Predicted Response: {partial_corr_residuals:.4f}')

# Step 4: Use Pingouin to calculate partial correlation directly
covariates = list(df_encoded.drop(columns=["True_Response", "Predicted_Response"]).columns)
result = partial_corr(
    data=df_encoded,
    x="Predicted_Response",
    y="True_Response",
    covar=covariates,
    method="pearson",
)

# Extract the results from Pingouin
r = result["r"].iloc[0]
p = result["p-val"].iloc[0]
print(f"Partial Correlation (Pingouin): r={r:.4f}, p={p:.4e}")
Partial Correlation between True Response and Predicted Response:(LinReg): 0.0009
Partial Correlation (Pingouin): r=0.9986, p=0.0000e+00
raphaelvallat commented 2 months ago

Hi @PascalIversen,

Thanks for opening the issue. One approach that has been discussed in the past is to raise an error if the covariate(s) is identical to x or y: https://github.com/raphaelvallat/pingouin/issues/375

Alternatively, we could switch back to a regression-based approach in this particular corner case.

PascalIversen commented 2 months ago

Hey @raphaelvallat, thank you very much for your fast response.

In the second example the covariates are not identical to x or y. (The covariates are categorical.) I think that this is not a corner case, it came up during my analysis of a predictor which is solely based on knowledge about the identity of a category.

I can't resort to the regression as that would be too slow. I guess for now I'll be adding a negligible amount of noise to x (df_encoded['Predicted_Response'] = cat_var1_mean_prediction +np.random.normal(loc=0, scale=0.00001, size=n_samples)) which recovers a reasonable (insignificant) p value.

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import statsmodels.api as sm
from pingouin import partial_corr

# Step 1: Create synthetic data
n_samples = 1000  # Number of samples
n_categorical_var1 = 10  # Number of categories for variable 1
n_categorical_var2 = 3  # Number of categories for variable 2

# Random assignment of categories to samples
categorical_var1 = np.random.randint(0, n_categorical_var1, n_samples)
categorical_var2 = np.random.randint(0, n_categorical_var2, n_samples)

# True response values (random for this case)
true_response = np.random.normal(loc=categorical_var2+categorical_var1, scale=0.1, size=n_samples) 

# Create a DataFrame
df = pd.DataFrame({'categorical_variable1': categorical_var1, 'categorical_variable2': categorical_var2})
df['True_Response'] = true_response

# Calculate the mean response for each category as predicted values
cat_var1_mean_prediction = df.groupby('categorical_variable1')['True_Response'].transform('mean')
cat_var2_mean_prediction = df.groupby('categorical_variable2')['True_Response'].transform('mean')

# Use one of the mean predictions as the predicted response
df['Predicted_Response'] = cat_var1_mean_prediction 
# Encode categorical variables
df_encoded = pd.get_dummies(df, columns=["categorical_variable1", "categorical_variable2"], dtype=int)
df_encoded = df_encoded.drop(columns=["categorical_variable1_0", "categorical_variable2_0"])

# Step 3: Calculate partial correlation
# We regress True_Response and Predicted_Response on the covariates (categorical variables)
X_cov = sm.add_constant(df_encoded.drop(['True_Response', 'Predicted_Response'], axis=1))

# Regress true response on covariates
model_true = sm.OLS(df_encoded['True_Response'], X_cov).fit()
residuals_true = model_true.resid

# Regress predicted response (constant) on covariates
model_pred = sm.OLS(df_encoded['Predicted_Response'], X_cov).fit()
residuals_pred = model_pred.resid

# Compute partial correlation as the correlation of the residuals
partial_corr_residuals, _ = pearsonr(residuals_true, residuals_pred)

print(f'Partial Correlation between True Response and Predicted Response: {partial_corr_residuals:.4f}')
df_encoded['Predicted_Response'] = cat_var1_mean_prediction +np.random.normal(loc=0, scale=0.00001, size=n_samples)

# Step 4: Use Pingouin to calculate partial correlation directly
covariates = list(df_encoded.drop(columns=["True_Response", "Predicted_Response"]).columns)
result = partial_corr(
    data=df_encoded,
    x="Predicted_Response",
    y="True_Response",
    covar=covariates,
    method="pearson",
)

# Extract the results from Pingouin
r = result["r"].iloc[0]
p = result["p-val"].iloc[0]
print(f"Partial Correlation (Pingouin): r={r:.4f}, p={p:.4e}")

Partial Correlation between True Response and Predicted Response: 0.0013 Partial Correlation (Pingouin): r=-0.0250, p=4.3229e-01

raphaelvallat commented 2 months ago

Thanks for the additional details.

I think that the issue in the categorical example is that the resulting covariance matrix is rank-deficient (unless you add some random noise). This is because your input data has very strong multi-collinearity.

data = df_encoded.copy()
# data["Predicted_Response"] += np.random.normal(loc=0, scale=0.00001, size=n_samples)
V = data.cov(numeric_only=True)
print(np.linalg.matrix_rank(V), data.shape[1])
12 13

As a general solution, we could perhaps raise a warning or error if the covariance matrix does not have full rank