tanaylab / metacell

Metacell - Single-cell mRNA Analysis
https://tanaylab.github.io/metacell
Other
108 stars 30 forks source link

import .h5ad or .loom formats #57

Open AAleotti opened 3 years ago

AAleotti commented 3 years ago

Hello, is there a way to import a dataset that is saved in .h5ad or in .loom format into metacell? Thanks! Alessandra

onionpork commented 3 years ago

I also have the same problem. Please let me know if anyone know how to fix it.

ukleiner commented 1 year ago

If you don't want to save to disk the h5 content I wrote a custom function that accepts a UMIs matrix and gene list from memory. It is a modified version of mcell_import_scmat_tsv.

  1. It expects a UMI matrix that contains ONLY UMIs (no gene_id in the first column)
  2. It expectes a pre-ordered list of genes matching the row order in the UMIs matrix The downside is that you need to load the h5 into memory, thus nullifying its advantages.
mcell_import_scmat_memory <- function(mat_nm, umis, genes, meta_fn = NULL, dset_nm=NULL,
                                   force = FALSE, paralogs_policy = "sum") 
  {
  # genes already ordered by the umis rows
    if (!scdb_is_valid()) {
      stop("MCERR - scdb is not initialized, cannot import")
    }
    if (!force & !is.null(scdb_mat(mat_nm))) {
      return(TRUE)
    }
    if (any(is.na(umis)) | is.null(umis)) {
      stop("MCERR - umi matrix empty")
    }
    if (!is.null(genes)) {
      # Remove rows without gene name
      umis = umis[!is.na(genes), ]
      # remove NA gene names
      genes = genes[!is.na(genes)]
      unique_genes = names(which(table(genes) == 1))
      # no need to remove first column, clean matrix
      umis1 = umis[genes %in% unique_genes, ]
      rownames(umis1) = genes[genes %in% unique_genes]
      if (paralogs_policy == "remove") {
        umis = umis1
      }
      else if (paralogs_policy == "sum") {
        non_unique_genes = setdiff(genes, unique_genes)
        umis2 = as.matrix(umis[genes %in% non_unique_genes, ])
        message(sprintf("summing up total of %d paralog genes into %d unique genes", 
                        nrow(umis2), length(non_unique_genes)))
        dup_genes = genes[genes %in% non_unique_genes]
        umis2s = apply(umis2, 2, function(x) {
          tapply(x, INDEX = dup_genes, FUN = sum)
        })
        umis = rbind(umis1, umis2s)
        rownames(umis) = c(rownames(umis1), rownames(umis2s))
      }
    }
    else {
      stop("MCERR - genes list empty")
    }
    if (!is.null(meta_fn)) {
      # Not sure how this is relevant to the PerturbSeq, not touching
      md = fread(meta_fn, sep = "\t")
      mandatory = c("Cell.ID", "Amp.Batch.ID", "Seq.Batch.ID", 
                    "Batch.Set.ID")
      miss_f = setdiff(mandatory, colnames(md))
      if (length(miss_f) > 0) {
        stop("MC-ERR: missing fields in matrix metadata : ", 
             miss_f)
      }
      rownames(md) = md$Cell.ID
      md = md %>% select(-Cell.ID)
    }
    else {
      if (is.null(dset_nm)) {
        stop("either provide a meta data file, or supply a dataset name (parameter dset_nm) and we'll fake it for you...")
      }
      dsets = data.frame(batch_set_id = dset_nm, amp_batch_id = dset_nm, 
                         seq_batch_id = dset_nm)
      md = matrix(c("import", dset_nm, dset_nm, dset_nm), nrow = ncol(umis), 
                  ncol = 4, byrow = T, dimnames = list(colnames(umis), 
                                                       c("type", "batch_set_id", "amp_batch_id", "seq_batch_id")))
      md = as.data.frame(md)
    }
    sparse_m = Matrix::Matrix(as.matrix(umis))
    rownames(sparse_m) = rownames(umis)
    scmat = scm_new_matrix(sparse_m, md)
    scdb_add_mat(mat_nm, scmat)
    return(TRUE)
}