mbruhns opened 11 months ago
Another published approach for this would be this one. It should be fairly easy to reimplement in Python; here's a draft:
```python
import numpy as np
import scanpy as sc
from sklearn.metrics import silhouette_samples
from scipy.stats import bootstrap

# Load and preprocess data (replace with your data file)
# adata = sc.read('your_data_file.h5ad')
# For illustration, let's use a sample dataset
adata = sc.datasets.pbmc68k_reduced()

# Normalize and log-transform data
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata)

# Function for repeated clustering
def repeated_clustering(adata, n_repeats=100, subset_fraction=0.8, resolution=1.0, n_neighbors=15):
    rng = np.random.default_rng(42)  # for reproducibility
    original_indices = np.arange(adata.n_obs)
    n_subset = int(adata.n_obs * subset_fraction)
    co_cluster_counts = np.zeros((adata.n_obs, adata.n_obs))
    pair_counts = np.zeros((adata.n_obs, adata.n_obs))  # times each pair was sampled together
    for _ in range(n_repeats):
        subset_indices = rng.choice(original_indices, n_subset, replace=False)
        subset_adata = adata[subset_indices].copy()  # copy: neighbors/leiden write to the object
        sc.pp.neighbors(subset_adata, n_neighbors=n_neighbors)
        sc.tl.leiden(subset_adata, resolution=resolution)
        # Compare labels positionally within the subset, then map back to the original
        # indices (indexing obs['leiden'] with the original indices was a bug)
        labels = subset_adata.obs['leiden'].to_numpy()
        same_cluster = labels[:, None] == labels[None, :]
        idx = np.ix_(subset_indices, subset_indices)
        co_cluster_counts[idx] += same_cluster
        pair_counts[idx] += 1
    # Normalize by how often each pair was actually co-sampled rather than by
    # n_repeats, so pairs that were rarely drawn together are not underestimated
    return np.divide(co_cluster_counts, pair_counts,
                     out=np.zeros_like(co_cluster_counts), where=pair_counts > 0)

# Function to calculate per-cluster silhouette scores on the consensus distances
def calculate_silhouette_scores(adata, distance_matrix):
    labels = adata.obs['leiden']
    silhouette_scores = silhouette_samples(distance_matrix, labels, metric='precomputed')
    return {cluster: np.mean(silhouette_scores[(labels == cluster).to_numpy()])
            for cluster in labels.unique()}

# Function to explore parameter values
def explore_parameter_values(adata, resolution_range, neighbors_range, n_repeats=100, subset_fraction=0.8):
    results = {}
    for resolution in resolution_range:
        for n_neighbors in neighbors_range:
            # Cluster the full data once to obtain the reference labels
            sc.pp.neighbors(adata, n_neighbors=n_neighbors)
            sc.tl.leiden(adata, resolution=resolution)
            co_cluster_matrix = repeated_clustering(adata, n_repeats, subset_fraction,
                                                    resolution=resolution, n_neighbors=n_neighbors)
            distance_matrix = 1 - co_cluster_matrix
            np.fill_diagonal(distance_matrix, 0)  # precomputed distances need a zero diagonal
            cluster_silhouette_scores = calculate_silhouette_scores(adata, distance_matrix)
            # Keep the per-cluster scores (not just their mean) so we can bootstrap a CI later
            results[(resolution, n_neighbors)] = list(cluster_silhouette_scores.values())
    return results

# Function to calculate a bootstrap confidence interval for the median score
def calculate_confidence_interval(scores):
    res = bootstrap((scores,), np.median, confidence_level=0.95,
                    n_resamples=25000, method='BCa')
    # BootstrapResult exposes .confidence_interval (not .conf_interval)
    return res.confidence_interval.low, res.confidence_interval.high

# Function to determine the optimal parameter combination: among combinations whose
# upper CI bound reaches the best lower bound, pick the one with the best lower bound
def determine_optimal_parameters(ci_results):
    decision_threshold = max(ci_lower for ci_lower, _ in ci_results.values())
    return max(ci_results.keys(),
               key=lambda x: ci_results[x][0] if ci_results[x][1] >= decision_threshold
               else float('-inf'))

# Main execution
resolution_range = np.linspace(0.1, 2.0, 10)  # example range of resolution parameters
neighbors_range = np.arange(10, 50, 10)       # example range of n_neighbors parameters
exploration_results = explore_parameter_values(adata, resolution_range, neighbors_range)

# Bootstrap a confidence interval over the per-cluster scores of each combination
# (bootstrapping the single mean score, as before, would be a no-op)
ci_results = {params: calculate_confidence_interval(scores)
              for params, scores in exploration_results.items()}

# Determine the near-optimal parameter combination
optimal_params = determine_optimal_parameters(ci_results)
print(f"Optimal clustering parameters: resolution={optimal_params[0]}, n_neighbors={optimal_params[1]}")
```
There are several published methods that claim to find more stable clusters by evaluating different parameter combinations (aka consensus clustering). One example is SC3S. We might want to use this or something similar instead of finding metrics for `n_neighbors` and `resolution`.
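For reference, a minimal sketch of what SC3S usage could look like, assuming the `sc3s` package and its `sc3s.tl.consensus` entry point (this is taken from my reading of the project README and not tested here):

```python
import scanpy as sc
import sc3s  # assumed package name: pip install sc3s

adata = sc.datasets.pbmc68k_reduced()
sc.pp.pca(adata)  # SC3S clusters on a PCA embedding

# Assumed API: run stochastic consensus clustering for several candidate
# cluster counts; labels should land in adata.obs (e.g. adata.obs['sc3s_3'])
sc3s.tl.consensus(adata, n_clusters=[3, 5, 10])
```

That would sidestep scanning `resolution` entirely, at the cost of having to supply candidate cluster counts instead.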