kharchenkolab / numbat

Haplotype-aware CNV analysis from single-cell RNA-seq
https://kharchenkolab.github.io/numbat/
Other
163 stars 23 forks source link

converting existing FACETS CNV calls from WGS to Numbat format for input to "segs_consensus_fix" #201

Closed ahwanpandey closed 2 weeks ago

ahwanpandey commented 1 month ago

Hello,

I was just wondering if you could take a look at whether I am correctly converting integer CNV calls from FACETS for WGS data into the correct cnv_states for Numbat segs_consensus_fix? The cytoband_file input to the function is just to make the segment annotation a bit more meaningful. You don't have to go through the code, just thought I'd put it here in case someone finds it useful.

Thanks! Ahwan

INTEGER CNV values TO NUMBAT STATES, nA = major copy number, nB = minor copy number

, , cnv_state = bdel

   nB
nA   0
  0 13

, , cnv_state = neu

   nB
nA   1
  1 20

, , cnv_state = del

   nB
nA  0
  1 7

, , cnv_state = bamp

   nB
nA  2 3
  2 8 0
  3 0 6

, , cnv_state = amp

    nB
nA    0  1  2  3
  2   0 27  0  0
  3   6 10 17  0
  4   0  0  0  1
  5   0  4  4  0
  6   0  3  1  0
  7   0  0  2  0
  8   0  1  1  0
  9   0  2  0  0
  11  0  2  0  0
  13  0  0  2  0
  14  0  0  2  0
  16  0  1  0  0

, , cnv_state = loh

   nB
nA   0
  2 17

THE FUNCTION

suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(dplyr))
options(scipen = 999)

# cytoband info
makeCNVinputFromFACETS <- function(cnv_segment_file, cytoband_file){

  # read segment table
  segmentTable = fread(cnv_segment_file, sep = "\t", stringsAsFactors = FALSE)

  # only keep autosomes
  segmentTable <- segmentTable %>% dplyr::filter(chr %in% as.character(c(1:22)))

  # define states
  cnv_state <- rep(NA,nrow(segmentTable))
  cnv_state[segmentTable$tcn == 0] <- "bdel"
  cnv_state[segmentTable$tcn == 1 & segmentTable$nB == 0] <- "del"
  cnv_state[segmentTable$tcn == 2 & segmentTable$nB == 0] <- "loh"
  cnv_state[segmentTable$tcn == 2 & segmentTable$nB == 1] <- "neu"
  cnv_state[segmentTable$tcn > 2 & segmentTable$nA == segmentTable$nB] <- "bamp"
  cnv_state[segmentTable$tcn > 2 & segmentTable$nA != segmentTable$nB] <- "amp"

  segmentTable$cnv_state = cnv_state

  # # check
  # segmentTable %>% dplyr::filter(cnv_state == "bdel")
  # segmentTable %>% dplyr::filter(cnv_state == "del")
  # segmentTable %>% dplyr::filter(cnv_state == "loh")
  # segmentTable %>% dplyr::filter(cnv_state == "neu")
  # segmentTable %>% dplyr::filter(cnv_state == "bamp")
  # segmentTable %>% dplyr::filter(cnv_state == "amp")
  # segmentTable %>% dplyr::select(nA, nB, cnv_state) %>% table()

  # read cytoband file
  cytobands_dt = fread(cytoband_file, sep = "\t", stringsAsFactors = FALSE, header = FALSE)
  cytobands_dt$V6 <- as.character(substring(cytobands_dt$V4, 1,1))
  cytobands_dt$V1 <- gsub("chr","",cytobands_dt$V1)
  colnames(cytobands_dt) <- c("chrom", "start.pos", "end.pos", "region","whatever", "arm")

  # merge the segmentation data and the cytoband
  setkey(cytobands_dt, chrom, start.pos, end.pos)
  tmp = foverlaps(
    segmentTable, cytobands_dt, type="any", mult="first", nomatch=NA, 
    by.x = c("chr", "startpos", "endpos"),
    by.y = c("chrom","start.pos","end.pos")
  )

  # final table for numbat
  segmentTable_numbat <- data.frame(
    CHROM = tmp$chr,
    seg = paste(
      paste("seg", 1:nrow(tmp), sep = ''),
      paste("chr",segmentTable$chr, sep = ""),
      tmp$region,
      sep = "_"
    ),
    seg_start = tmp$startpos,
    seg_end = tmp$endpos,
    cnv_state = tmp$cnv_state,
    stringsAsFactors = FALSE
  )

  return(list(segmentTable_original = segmentTable, 
              segmentTable_numbat = segmentTable_numbat,
              cytobands_dt = cytobands_dt))
}

RUNNING THE FUNCTION

# input data ---
cnv_segment_file = "/researchers/ahwan.pandey/Projects/WholeGenome/Project_DG/ICGC/analysis_08_10_2018_Somatic/AN_T_AOCS-080-1-9_N_AOCS-080-5-3/CNV_CnvFacets/Somatic/cv50_1000.nb500.dp15_5k/AN_T_AOCS-080-1-9_N_AOCS-080-5-3.cv50_1000.nb500.dp15_5k.for_hrd.txt"
cytoband_file = "/researchers/ahwan.pandey/Projects/snRNAseq/Project_NB/PrimaryTumour_WGD/analysis_30_06_2023/GRCh38_Numbat/scripts/resources/cytoBand_hg19.txt"

# peek at the inputs
fread(cnv_segment_file, sep = "\t") %>% head()
         sample    chr startpos    endpos nProbes   tcn    nA    nB   ploidy cellularity
         <char> <char>    <int>     <int>   <int> <int> <int> <int>    <num>       <num>
1: AOCS-080-1-9      1    46700     92875      57     0     0     0 3.021462   0.8461613
2: AOCS-080-1-9      1    94000   1390144    1656     2     1     1 3.021462   0.8461613
3: AOCS-080-1-9      1  1390886  47014408   88467     2     1     1 3.021462   0.8461613
4: AOCS-080-1-9      1 47015073  83650075   73068     2     1     1 3.021462   0.8461613
5: AOCS-080-1-9      1 83650450  83806878     310     1     1     0 3.021462   0.8461613
6: AOCS-080-1-9      1 83807778 142796500   73973     2     1     1 3.021462   0.8461613

# peek at the inputs
fread(cytoband_file, sep = "\t") %>% head()
       V1       V2       V3     V4     V5
   <char>    <int>    <int> <char> <char>
1:   chr1        0  2300000 p36.33   gneg
2:   chr1  2300000  5400000 p36.32 gpos25
3:   chr1  5400000  7200000 p36.31   gneg
4:   chr1  7200000  9200000 p36.23 gpos25
5:   chr1  9200000 12700000 p36.22   gneg
6:   chr1 12700000 16200000 p36.21 gpos50

# get the states
test_res = makeCNVinputFromFACETS(cnv_segment_file, cytoband_file)

# peek at the results
test_res$segmentTable_original %>% head()
         sample    chr startpos    endpos nProbes   tcn    nA    nB   ploidy cellularity cnv_state
         <char> <char>    <int>     <int>   <int> <int> <int> <int>    <num>       <num>    <char>
1: AOCS-080-1-9      1    46700     92875      57     0     0     0 3.021462   0.8461613      bdel
2: AOCS-080-1-9      1    94000   1390144    1656     2     1     1 3.021462   0.8461613       neu
3: AOCS-080-1-9      1  1390886  47014408   88467     2     1     1 3.021462   0.8461613       neu
4: AOCS-080-1-9      1 47015073  83650075   73068     2     1     1 3.021462   0.8461613       neu
5: AOCS-080-1-9      1 83650450  83806878     310     1     1     0 3.021462   0.8461613       del
6: AOCS-080-1-9      1 83807778 142796500   73973     2     1     1 3.021462   0.8461613       neu

# peek at the results ( this will be the input to Numbat segs_consensus_fix)
test_res$segmentTable_numbat %>% head()
  CHROM              seg seg_start   seg_end cnv_state
1     1 seg1_chr1_p36.33     46700     92875      bdel
2     1 seg2_chr1_p36.33     94000   1390144       neu
3     1 seg3_chr1_p36.33   1390886  47014408       neu
4     1    seg4_chr1_p33  47015073  83650075       neu
5     1  seg5_chr1_p31.1  83650450  83806878       del
6     1  seg6_chr1_p31.1  83807778 142796500       neu

# print out conversion
for (state_ in unique(test_res$segmentTable_original$cnv_state)){
  test_res$segmentTable_original %>% 
    dplyr::select(nA, nB, cnv_state) %>%
    dplyr::filter(cnv_state == state_) %>%
    table() %>%
    print()
}
teng-gao commented 2 weeks ago

Looks good. Thanks for sharing