claassenlab / MORESCA

This repository provides a template and some resources on standardized scRNA-seq analysis using Python.
GNU Affero General Public License v3.0
0 stars 0 forks source link

Stable clustering alternative #34

Open mbruhns opened 11 months ago

mbruhns commented 11 months ago

There are several published methods that claim to find more stable clusters by evaluating different parameter combinations (aka consensus clustering). One example is SC3S. We might want to use this or something similar instead of finding metrics for n_neighbors and resolution.

mbruhns commented 11 months ago

Another published approach for this would be this. It should be fairly easy to reimplement in Python, here's a draft:

import numpy as np
import scanpy as sc
from sklearn.metrics import silhouette_samples
from scipy.stats import bootstrap

# Load and preprocess data (replace with your data file)
# adata = sc.read('your_data_file.h5ad')
# For illustration, let's use a sample dataset
# NOTE(review): pbmc68k_reduced ships already preprocessed (normalized and
# log-transformed) — re-applying normalize_total/log1p below may
# double-transform this particular demo dataset; verify before reuse.
adata = sc.datasets.pbmc68k_reduced()

# Normalize and log-transform data
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata)

# Function for repeated clustering
def repeated_clustering(adata, n_repeats=100, subset_fraction=0.8, resolution=1.0, n_neighbors=15, random_state=42):
    """Build a consensus (co-clustering) matrix by repeatedly clustering subsamples.

    Each round draws a random fraction of cells without replacement, builds a
    kNN graph and runs Leiden on that subset.  The consensus value for a pair
    of cells is the fraction of rounds that sampled BOTH cells in which they
    landed in the same cluster.

    Parameters
    ----------
    adata : AnnData
        Preprocessed dataset (assumed normalized/log-transformed upstream).
    n_repeats : int
        Number of subsampling/clustering rounds.
    subset_fraction : float
        Fraction of cells drawn per round.
    resolution : float
        Leiden resolution parameter.
    n_neighbors : int
        Number of neighbors for the kNN graph.
    random_state : int or None
        Seed for a local RNG (avoids reseeding NumPy's global state).

    Returns
    -------
    numpy.ndarray
        (n_obs, n_obs) matrix of co-clustering frequencies in [0, 1]; pairs
        never sampled together are 0.
    """
    rng = np.random.default_rng(random_state)
    n_obs = adata.n_obs
    original_indices = np.arange(n_obs)
    subset_size = int(n_obs * subset_fraction)

    co_clustered = np.zeros((n_obs, n_obs))
    co_sampled = np.zeros((n_obs, n_obs))

    for _ in range(n_repeats):
        subset_indices = rng.choice(original_indices, subset_size, replace=False)
        # Explicit copy: neighbors/leiden write to .obs/.obsp, which must not
        # happen on an AnnData view.
        subset_adata = adata[subset_indices].copy()
        sc.pp.neighbors(subset_adata, n_neighbors=n_neighbors)
        sc.tl.leiden(subset_adata, resolution=resolution)

        # BUG FIX: the original indexed subset_adata.obs['leiden'] with the
        # ORIGINAL cell indices, which do not correspond to positions within
        # the subset.  Align labels positionally, compare all pairs at once,
        # and scatter the result back into the full-size matrices.
        labels = subset_adata.obs['leiden'].to_numpy()
        same_cluster = labels[:, None] == labels[None, :]
        grid = np.ix_(subset_indices, subset_indices)
        co_clustered[grid] += same_cluster
        co_sampled[grid] += 1.0

    # Normalize by how often each pair was actually drawn together; dividing
    # by n_repeats (as the original did) biases rarely co-sampled pairs to 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        consensus = np.where(co_sampled > 0, co_clustered / co_sampled, 0.0)
    return consensus

# Function to calculate silhouette scores
def calculate_silhouette_scores(adata, distance_matrix):
    """Compute the mean silhouette width of each Leiden cluster.

    Parameters
    ----------
    adata : AnnData
        Must carry cluster labels in ``adata.obs['leiden']``.
    distance_matrix : numpy.ndarray
        Precomputed pairwise distances (e.g. 1 - consensus matrix).

    Returns
    -------
    dict
        Maps each cluster label to the mean silhouette score of its cells.
    """
    labels = adata.obs['leiden']
    per_cell = silhouette_samples(distance_matrix, labels, metric='precomputed')

    scores = {}
    for cluster in labels.unique():
        scores[cluster] = np.mean(per_cell[labels == cluster])
    return scores

# Function to explore parameter values
def explore_parameter_values(adata, resolution_range, neighbors_range, n_repeats=100, subset_fraction=0.8):
    """Grid-search (resolution, n_neighbors) combinations by consensus stability.

    For every combination, the full dataset is clustered (so the labels used
    for scoring reflect that setting), a consensus matrix is built from
    repeated subsample clusterings, and the mean per-cluster silhouette score
    on the derived distance matrix (1 - consensus) is recorded.

    Returns
    -------
    dict
        Maps (resolution, n_neighbors) to its mean silhouette score.
    """
    def _score_combination(res, k):
        # Cluster all cells so adata.obs['leiden'] matches this setting.
        sc.pp.neighbors(adata, n_neighbors=k)
        sc.tl.leiden(adata, resolution=res)
        consensus = repeated_clustering(adata, n_repeats, subset_fraction, resolution=res, n_neighbors=k)
        per_cluster = calculate_silhouette_scores(adata, 1 - consensus)
        return np.mean(list(per_cluster.values()))

    return {
        (res, k): _score_combination(res, k)
        for res in resolution_range
        for k in neighbors_range
    }

# Function to calculate confidence interval
def calculate_confidence_interval(scores, confidence_level=0.95):
    """Bootstrap a BCa confidence interval for the median of *scores*.

    Parameters
    ----------
    scores : sequence of float
        Sample of scores (must contain more than one distinct observation
        for a meaningful interval).
    confidence_level : float
        Confidence level of the interval (default 0.95).

    Returns
    -------
    tuple of (float, float)
        (ci_lower, ci_upper) for the median.
    """
    result = bootstrap(
        (scores,),
        np.median,
        confidence_level=confidence_level,
        n_resamples=25000,
        method='bca',
    )
    # BUG FIX: scipy's BootstrapResult exposes `confidence_interval`
    # (a ConfidenceInterval(low, high) namedtuple), not `conf_interval`.
    ci = result.confidence_interval
    return ci.low, ci.high

# Function to determine optimal parameter combination
def determine_optimal_parameters(results):
    """Select the parameter combination with the best defensible stability.

    Parameters
    ----------
    results : dict
        Maps a parameter key (e.g. ``(resolution, n_neighbors)``) to a
        ``(ci_lower, ci_upper)`` confidence interval of its score.

    Returns
    -------
    The key of the chosen combination.

    Notes
    -----
    Decision rule: the threshold is the best lower bound across all
    combinations; any combination whose upper bound reaches that threshold is
    statistically competitive, and among those the one with the highest lower
    bound wins.
    """
    # BUG FIX: the original unpacked (lower, upper) as (_, ci_lower) — i.e.
    # it used the UPPER bound as the threshold and then compared tuple
    # elements with mismatched meaning.
    decision_threshold = max(lower for lower, _ in results.values())
    eligible = [params for params, (_, upper) in results.items() if upper >= decision_threshold]
    return max(eligible, key=lambda params: results[params][0])

# Main execution
resolution_range = np.linspace(0.1, 2.0, 10)  # Example range of resolution parameters
neighbors_range = np.arange(10, 50, 10)  # Example range of n_neighbors parameters

# Grid-search every (resolution, n_neighbors) pair; each value is a single
# scalar mean consensus-silhouette score for that combination.
exploration_results = explore_parameter_values(adata, resolution_range, neighbors_range)

# Calculate confidence intervals for each parameter combination
# NOTE(review): each bootstrap call below receives a SINGLE scalar wrapped in
# a list, so the resulting "interval" is degenerate.  For a meaningful CI the
# per-cluster silhouette scores (not just their mean) would need to be passed
# through from explore_parameter_values — verify intent.
ci_results = {params: calculate_confidence_interval([exploration_results[params]]) for params in exploration_results.keys()}

# Determine the near-optimal parameter combination
optimal_params = determine_optimal_parameters(ci_results)

print(f"Optimal clustering parameters: Resolution - {optimal_params[0]}, n_neighbors - {optimal_params[1]}")