kharchenkolab / numbat

Haplotype-aware CNV analysis from single-cell RNA-seq
https://kharchenkolab.github.io/numbat/
Other
169 stars 23 forks source link

Mouse data. "subscript out of bounds" error. #157

Open zackb9 opened 10 months ago

zackb9 commented 10 months ago

Getting an error when running numbat on mouse data. The same script works with different mouse samples, but not with new set of samples. Have used reference generated from the sample itself, with no success. It is for some reason filtering most of the genes (there are ~20,000 in the original seurat object, only 75 are left by numbat). Full code and error below. Is this an issue with our data? Thanks for your help!!!

Approximating initial clusters using smoothed expression .. Mem used: 2.46Gb number of genes left: 75 Error in h(simpleError(msg, call)) : error in evaluating the argument 'x' in selecting a method for function 't': subscript out of bounds

library(dplyr)
library(data.table)
library(numbat)
library(stringr)
library(glue)
library(Seurat)
library(Matrix)

pat = '20'
pu_dir = paste0(pat,'/pileup')

vcf_pu = fread(glue('{pu_dir}/cellSNP.base.vcf'), skip = '#CHROM') %>% 
  rename(CHROM = `#CHROM`) %>%
  mutate(snp_id = paste(CHROM, POS, REF, ALT, sep = '_')) %>%
  mutate(CHROM = str_remove(CHROM, 'chr')) %>%
  mutate(CHROM = factor(CHROM, unique(CHROM))) %>%
  filter(CHROM != 'X')

vcf_phased = vcf_pu %>% mutate(GT = '1|0', cM = 0)

# pileup count matrices
AD = readMM(glue('{pu_dir}/cellSNP.tag.AD.mtx'))
DP = readMM(glue('{pu_dir}/cellSNP.tag.DP.mtx'))
barcodes = fread(glue('{pu_dir}/cellSNP.samples.tsv'), header = F)$V1

# convert to dataframe
DP = as.data.frame(Matrix::summary(DP)) %>%
  mutate(
    cell = barcodes[j],
    snp_id = vcf_pu$snp_id[i]
  ) %>%
  select(-i, -j) %>%
  rename(DP = x) %>%
  select(cell, snp_id, DP)

AD = as.data.frame(Matrix::summary(AD)) %>%
  mutate(
    cell = barcodes[j],
    snp_id = vcf_pu$snp_id[i]
  ) %>%
  select(-i, -j) %>%
  rename(AD = x) %>%
  select(cell, snp_id, AD)

df_allele = DP %>% left_join(AD, by = c("cell", "snp_id")) %>%
  mutate(AD = ifelse(is.na(AD), 0, AD))

# attach genotype info
df_allele = df_allele %>% inner_join(
  vcf_phased %>% select(snp_id, CHROM, POS, REF, ALT, GT, cM),
  by = 'snp_id')

## sample matrix

seu <- readRDS(paste0(pat,'/seu_processed.rds'))
count_matrix <- seu@assays$RNA@counts
output_file <- "count_matrixseu.mtx.gz"
writeMM(count_matrix, file = output_file)
count_mat = readMM('count_matrixseu.mtx.gz')
colnames(count_mat) = colnames(seu)
rownames(count_mat) = rownames(seu@assays$RNA@counts)
count_mat = as.matrix(count_mat)
count_mat = rowsum(count_mat, rownames(count_mat))
count_mat = as(count_mat, "dgCMatrix")
count_mat_tumor = count_mat

## expression reference

#refseu <- subset(seu, idents = 'Naive B Cells')
count_matrix <- refseu@assays$RNA@counts
output_file <- "count_matrix.mtx.gz"
writeMM(count_matrix, file = output_file)
count_mat = readMM('count_matrix.mtx.gz')
colnames(count_mat) = colnames(refseu)
rownames(count_mat) = rownames(refseu@assays$RNA@counts)
count_mat = as.matrix(count_mat)
count_mat = rowsum(count_mat, rownames(count_mat))
count_mat = as(count_mat, "dgCMatrix")
count_mat_ref = count_mat

p2 = pagoda2::basicP2proc(count_mat_ref, n.cores = 30)

clusters = p2$clusters$PCA$multilevel

ref_annot = data.frame(
  cell = names(clusters),
  group = unname(clusters)
)

ref_pancreas = numbat::aggregate_counts(
  count_mat_ref,
  ref_annot %>% group_by(group) %>% filter(n() > 100)
)

## numbat

run_numbat(
  count_mat_tumor, 
  ref_pancreas, 
  df_allele,
  t = 1e-5,
  ncores = 20,
  skip_nj = TRUE,
  min_LLR = 30,
  out_dir = './results',
  # mouse specific settings
  genome = "mm10",
  nu = 0
)
teng-gao commented 10 months ago

Hmm, not entirely sure what's happening here without the data but here's the function for filtering genes:

https://github.com/kharchenkolab/numbat/blob/45d578a812748a3db970a025005ec9daa35827df/R/utils.R#L131-L167

You can test it outside of the main loop by giving this function ref_pancreas and count_mat_tumor to see where exactly these genes are filtered out.