Preprocessing of single cell reference data

Ceyhun-Alar commented 4 months ago

Can you please share the code for processing reference data to obtain sc_references/sc_breast.csv, sc_breast_markers_pos.csv, and sc_breast_markers_neg.csv?

diala-ar commented 3 months ago

I second @Ceyhun-Alar request, as I was not able to reproduce the positive and negative markers for the Breast dataset. I really appreciate if you could share your code to produce those markers. Thanks

xhelenfu commented 3 months ago

Hi, please try this code to get the positive and negative marker .csv files from the reference data

import numpy as np
import argparse
import pandas as pd
import natsort

def main(config):

    ref_df = pd.read_csv(config.fp_ref, index_col=0)
    n_genes = ref_df.shape[1] - 3
    print("Ref data shape", ref_df.shape)

    cell_types = ref_df["cell_type"].tolist()
    cell_types = natsort.natsorted(list(set(cell_types)))
    print(cell_types)
    n_cell_types = len(cell_types)

    ref_expr = ref_df.iloc[:, :n_genes].to_numpy()
    gene_names = ref_df.columns[:n_genes]

    # Find genes with expressions in bottom 10% percentile for every ref cell type
    pct_10 = np.percentile(ref_expr, 10, axis=1, keepdims=True)
    pct_10 = np.tile(pct_10, (1, n_genes))
    low_expr_true = np.zeros(pct_10.shape)
    low_expr_true[ref_expr <= pct_10] = 1

    # Find overlap for different ref samples of the same cell type
    ct_idx = ref_df["ct_idx"].to_numpy()
    low_expr_true_agg = np.zeros((n_cell_types, n_genes))
    for ct in range(n_cell_types):
        rows = np.where(ct_idx == ct)[0]
        low_expr_true_ct = low_expr_true[rows]
        low_expr_true_agg[ct, :] = np.prod(low_expr_true_ct, axis=0)

    # Set overlaps to 0
    overlaps = np.sum(low_expr_true_agg, 0)
    too_many = np.where(overlaps > config.max_overlaps_neg)[0]
    low_expr_true_agg[:, too_many] = 0

    # print("num neg genes per cell type")
    # print(np.sum(low_expr_true_agg, 1))

    df_neg = pd.DataFrame(low_expr_true_agg, index=cell_types, columns=gene_names)

    # Find genes with expressions in top 90% percentile for every ref cell type
    pct_90 = np.percentile(ref_expr, 90, axis=1, keepdims=True)
    pct_90 = np.tile(pct_90, (1, n_genes))
    high_expr_true = np.zeros(pct_90.shape)
    high_expr_true[ref_expr >= pct_90] = 1

    # Find overlap for different ref samples of the same cell type
    ct_idx = ref_df["ct_idx"].to_numpy()
    high_expr_true_agg = np.zeros((n_cell_types, n_genes))
    for ct in range(n_cell_types):
        rows = np.where(ct_idx == ct)[0]
        high_expr_true_ct = high_expr_true[rows]
        high_expr_true_agg[ct, :] = np.prod(high_expr_true_ct, axis=0)

    # print("num pos genes per cell type")
    # print(np.sum(high_expr_true_agg, 1))

    # Set overlaps to 0
    overlaps = np.sum(high_expr_true_agg, 0)
    too_many = np.where(overlaps > config.max_overlaps_pos)[0]
    high_expr_true_agg[:, too_many] = 0

    df_pos = pd.DataFrame(high_expr_true_agg, index=cell_types, columns=gene_names)

    df_pos.to_csv(config.fp_pos)
    df_neg.to_csv(config.fp_neg)

    print("Done")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--fp_ref",
        default="data/sc_references/sc_breast.csv",
        type=str,
        help="ref data",
    )

    parser.add_argument(
        "--max_overlaps_pos",
        default=4,
        type=int,
        help="no more than n cell types can have the same markers",
    )

    parser.add_argument(
        "--max_overlaps_neg",
        default=15,
        type=int,
        help="no more than n cell types can have the same markers",
    )

    parser.add_argument(
        "--fp_pos", default="markers_pos.csv", type=str, help="positive markers"
    )
    parser.add_argument(
        "--fp_neg", default="markers_neg.csv", type=str, help="negative markers"
    )

    config = parser.parse_args()
    main(config)

diala-ar commented 3 months ago

Hi @xhelenfu,

Thanks so much for sharing the code, it worked!

However, I wrote an R code to generate the sc_breast.csv file from the averages gene expression BRCA_xxx_expression_Celltype_majorlineage.txt files downloaded from the TISCH2 website, but I was not able to regenerate it.

Briefly the code: 1) renames the CD8T and CD8Tex cell types to CD8T/CD8Tex; 2) renames CD4 and Treg cell types to CD4Tconv/Treg and 3) excludes the Tprolif and Oligodendrocyte cell types. The correlation between my generated numbers and yours are pretty high (>0.9). Below is my code:

in_dir = "resources/Xenium_data/output-sample1/"
features = read.delim(file.path(in_dir, 'cell_feature_matrix', 'features.tsv.gz'), header = F)
names(features) = c('Ensembl', 'gene_symbol', 'gene_category')
genes = features$gene_symbol[features$gene_category=='Gene Expression']

### 02. generate average expression of reference studies of BRCA cancer
tumor_subtype = 'BRCA'
in_dir = file.path('resources', 'from_TISCH2', 'avg_expr')
in_dirs = list.dirs(in_dir, recursive=F)
in_dirs = in_dirs[grep(tumor_subtype, in_dirs)]
all_avg_expr = NULL
for (i in 1:length(in_dirs)) {
  in_dir_i = in_dirs[i]

  atlas = sub(paste0(tumor_subtype, '_(.+)_Expression'), '\\1', basename(in_dir_i))

  file_names = list.files(in_dir_i)
  file_name = grep('Celltype_majorlineage', file_names, value=T)
  avg_expr = read.delim(file.path(in_dir_i, file_name))
  avg_expr = data.frame(t(avg_expr))

  found_genes   = intersect(genes, names(avg_expr))
  missing_genes = setdiff(genes, names(avg_expr))
  missing_genes_expr = data.frame(matrix(0, nrow=nrow(avg_expr), ncol=length(missing_genes), 
                                  dimnames=list(NULL, missing_genes)))
  avg_expr = cbind(avg_expr[, found_genes], missing_genes_expr)
  avg_expr = avg_expr[, order(names(avg_expr))]
  avg_expr$cell_type = row.names(avg_expr)

  avg_expr[, (ncol(avg_expr)-3):ncol(avg_expr)]

  CD8_celltypes = grep('CD8', avg_expr$cell_type)
  avg_expr$cell_type[CD8_celltypes] = 'CD8T/CD8Tex'

  CD4_or_Treg_celltypes = grep('CD4|Treg', avg_expr$cell_type)
  avg_expr$cell_type[CD4_or_Treg_celltypes] = 'CD4T/Treg'
  avg_expr$atlas = atlas

  avg_expr[, (ncol(avg_expr)-3):ncol(avg_expr)]
  all_avg_expr = rbind(all_avg_expr, avg_expr)
}

## some cleaning
# remove columns having an expression of 0 for all cell types
rm_col = which(apply(all_avg_expr[, -which(names(all_avg_expr) %in% c("cell_type", 'atlas'))], 2, sum)==0)
if ( length(rm_col) > 0 )
  all_avg_expr = all_avg_expr[, -rm_col]

# remove rows with T profil
all_avg_expr = all_avg_expr[!all_avg_expr$cell_type %in% c('Tprolif', 'TProlif', 'Oligodendrocyte'), ]
all_avg_expr[all_avg_expr$cell_type=='Mono.Macro', 'cell_type'] = 'Mono/Macro'
row.names(all_avg_expr) = NULL

## add the cell_type index column ct_idx
cell_types = sort(unique(all_avg_expr$cell_type))
ct_idx     = 0:(length(cell_types)-1)
names(ct_idx) = cell_types
all_avg_expr = data.frame(all_avg_expr[, 1:(ncol(all_avg_expr)-2)],
                          ct_idx = sapply(all_avg_expr$cell_type, function(x) ct_idx[x]),
                          cell_type = all_avg_expr$cell_type,
                          atlas = all_avg_expr$atlas)
all_avg_expr = all_avg_expr[order(all_avg_expr$atlas, all_avg_expr$cell_type), ]
all_avg_expr[, (ncol(all_avg_expr)-3):ncol(all_avg_expr)]
write.csv(all_avg_expr, file=file.path('results', 'bidcell', 'input', 'sc_BRCA.csv'))

I highly appreciate if you could share your code to generate sc_breast.csv.

Thanks.

SydneyBioX / BIDCell

Preprocessing of single cell reference data #9