更好地支持小突变数据？

ShixiangWang commented 4 years ago

纳入或整合 SigMA 一些算法实现。

@lihm1

ShixiangWang commented 4 years ago

HRD

ShixiangWang commented 4 years ago

NNLS, COSINE 是找到最相似的 signature，NNLS 是计算 exposure？结果值是什么样的？怎么合并，如 SigMA score？

怎么降采样，怎么评估不同采样下的效果？

怎么广泛应用到不同的 cancer 不同的 signatures 中去？

lihm1 commented 4 years ago

#' Calculates likelihood of the genome with respect to the available 
#' signature probability distributions
#' 
#' @param spectrum is the mutational spectrum
#' @param signatures is the reference signature catalog with the 
#' probability distributions
#' @param counts is the number of cases in each cluster that is
#' represented in the catalog. They are used as weights for each
#' signature in the catalog
#' @param normalize is true by default only for when it is used
#' together with NNLS in the match_to_catalog function it is not 
#' normalized here but outside of the function

calc_llh <- function(spectrum, 
                     signatures, 
                     counts = NULL, 
                     normalize = T){

  eps <- 0.00001
  probs <- apply(signatures, 2,
                 function(x, pow){
                   if(sum(is.na(x)) > 0) return(0)
                   prob <- 0
                   for(i in 1:length(x)){
                     if(x[[i]] == 0) x[[i]] <- eps
                     if(pow[[i]] > 0){
                       prob <- prob + pow[[i]] * log(x[[i]])
                     }
                   }
                   return(prob)
                 },
                 pow = spectrum)

  if(normalize){ # normalizes the total prob for all clusters to be 1
    if(is.null(counts) | length(counts) != dim(signatures)[[2]]){
      mean_probs <- mean(probs)

      q1_probs <- mean(probs[probs < mean(probs)])
      max_probs <- max(probs)

      # often having too small values leads to infinities so
      # only the probabilities in the first quartile are kept
      # to be nonzero
      inds_keep <- which(probs >= q1_probs)
      inds_rm <- which(probs < q1_probs)
      probs[inds_rm] <- 0

      probs[inds_keep] <- exp(probs[inds_keep] - max_probs)
      probs[inds_keep] <- probs[inds_keep]/sum(probs[inds_keep])
    }
    else{
      mean_probs <- mean(probs)

      q1_probs <- mean(probs[probs < mean(probs)])
      max_probs <- max(probs)

      # often having too small values leads to infinities so
      # only the probabilities in the first quartile are kept
      # to be nonzero
      inds_keep <- which(probs >= q1_probs)
      inds_rm <- which(probs < q1_probs)
      probs[inds_rm] <- 0
      probs[inds_keep] <- exp(probs[inds_keep] - max_probs)
      probs <- as.numeric(unlist(probs))*as.numeric(unlist(counts))
      probs[inds_keep] <- probs[inds_keep]/sum(probs[inds_keep])
    }
  }

  ind_max <- which(max(probs) == probs)

  # signature with highest likelihood
  if(length(ind_max) > 0) ind_max <- ind_max[[1]]
  sig_max <- colnames(signatures)[[ind_max]]
  max_val <- probs[[ind_max]]

  names(probs) <- colnames(signatures)

  return(list(probs = probs, ind_max = ind_max, sig_max = sig_max, max_val = max_val))
}

#' Decomposes the mutational spectrum of a genome in terms of
#' tumor type specific signatures that were calculated through analysis
#' of public WGS samples from ICGC and TCGA consortia, and contained
#' as a list in the package. Non-negative-least squares algorithm
#' is used and the number of signatures to be considered in the 
#' decomposition is increased gradually, first all pairs from among the
#' available signatures are considered and minimal error pair is kept. 
#' Then all 3-signature combinations, 4-signature combinations and so on
#' are considered. The result is updated if the error is smaller with
#' larger number of signatures 
#'
#' @param spect composite spectrum that is being decomposed
#' @param signatures a data.frame that contains the signatures in its
#' columns
#' @param data sequencing platform that as in run(), used for setting
#' the maximum number of signatures that is allowed in the decomposition

decompose <- function(spect, signatures, data, nloop_user = NULL, delta=0){
  # calculates frobenius error
  error <- function(spect, signatures, exposures){
    reco <- (as.matrix(signatures) %*% exposures)
    error_frac <- sqrt(sum(diag((spect - reco) %*% t(spect - reco))))/sqrt(sum(diag(spect %*% t(spect))))
    return(error_frac)
  }

  dim2 <- dim(signatures)[[2]]
  if(dim2 == 2){
    exps <- coef(nnls::nnls(as.matrix(signatures), spect))
    inds <- which(exps != 0)
    exps <- exps[inds]
    sigs <- colnames(signatures)[inds]

    error <- error(spect, signatures[, sigs], exps)
    return(list(signatures = sigs,
                exposures = exps,
                error = error))
  }

  min_error <- 10000000
  min_indices <- NULL
  min_exposures <- NULL

  if(!is.null(nloop_user)) nloop <- min(dim(signatures)[[2]], nloop_user)
  else{
    nloop <- min(dim(signatures)[[2]], 8) 
    # increasing nloop to too large values causes overfitting
    # maximum 5 signatures are considered for panels and WES 
    if(data == "wgs_pancan"){ 
       nloop <- 6
    }
    else if(data == "tcga_mc3"){
      if(dim(signatures)[[2]] > 10){ 
        nloop <- 8
      }
      else{
        nloop <- min(dim(signatures)[[2]], 7)
      }
    }
    else if(data == "seqcap" | data == "seqcap_probe"){
      nloop <- min(dim(signatures)[[2]], 5)
    }
    else if(data == "msk" | data == "fo" | data == "op")
      nloop <- min(nloop, 5)
  }

  if(nloop < 2)
    stop('linear decomposition requires at least 2 signatures')

  for(i in 1:(dim2 - nloop + 1)){
    for(j in (i+1):(dim2 - nloop + 2)){
      if(nloop >= 3){
        for(k in (j+1):(dim2 - nloop + 3)){
          if(nloop >= 4){
            for(l in (k+1):(dim2 - nloop + 4)){
              if(nloop >= 5){
                for(m in (l+1):(dim2 - nloop + 5)){
                  if(nloop >= 6){
                    for(n in (m+1):(dim2 - nloop + 6)){
                      if(nloop >= 7){
                        for(o in (n+1):(dim2 - nloop + 7)){
                          if(nloop >= 8){
                            for(p in (o+1):(dim2 - nloop + 8)){
                              if(nloop >= 9){
                                for(r in (p+1):(dim2 - nloop + 9)){
                                  exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k, l, m, n, o, p, r)]), spect))
                                  error_this <- error(spect, as.matrix(signatures[, c(i, j, k, l, m, n, o, p, r)]), exposures)
                                  if(error_this < min_error - delta){
                                    min_error <- error_this
                                    min_indices <- c(i, j, k, l, m, n, o, p, r)
                                    min_exposures <- exposures
                                  }
                                }
                              }
                              else{
                              exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k, l, m, n, o, p)]), spect))
                              error_this <- error(spect, as.matrix(signatures[, c(i, j, k, l, m, n, o, p)]), exposures)
                              if(error_this < min_error - delta){
                                min_error <- error_this
                                min_indices <- c(i, j, k, l, m, n, o, p)
                                min_exposures <- exposures
                              }
                              }
                            }
                          }
                          else{
                            exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k, l, m, n, o)]), spect))
                            error_this <- error(spect, as.matrix(signatures[, c(i, j, k, l, m, n, o)]), exposures)
                            if(error_this < min_error - delta){
                              min_error <- error_this
                              min_indices <- c(i, j, k, l, m, n, o)
                              min_exposures <- exposures
                            }
                          }
                        }
                      }else{
                        exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k, l, m, n)]), spect))
                        error_this <- error(spect, as.matrix(signatures[, c(i, j, k, l, m, n)]), exposures)
                        if(error_this < min_error - delta){
                          min_error <- error_this
                          min_indices <- c(i, j, k, l, m, n)
                          min_exposures <- exposures
                        }
                      }
                    }
                  }else{
                    exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k, l, m)]), spect))
                    error_this <- error(spect, as.matrix(signatures[, c(i, j, k, l, m)]), exposures)
                    if(error_this < min_error - delta){
                      min_error <- error_this
                      min_indices <- c(i, j, k, l, m)
                      min_exposures <- exposures
                    }
                  }
                }
              }else{
                exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k, l)]), spect))
                error_this <- error(spect, as.matrix(signatures[, c(i, j, k, l)]), exposures)
                if(error_this < min_error - delta){
                  min_error <- error_this
                  min_indices <- c(i, j, k, l)
                  min_exposures <- exposures
                }
              }
            }
          }else{
            exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k)]), spect))
            error_this <- error(spect, as.matrix(signatures[, c(i, j, k)]), exposures)
            if(error_this < min_error - delta){
              min_error <- error_this
              min_indices <- c(i, j, k)
              min_exposures <- exposures
            }
          }
        }
      }else{
        exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j)]), spect))
        error_this <- error(spect, as.matrix(signatures[, c(i, j)]), exposures)
        if(error_this < min_error - delta){
          min_error <- error_this
          min_indices <- c(i, j)
          min_exposures <- exposures
        }
      }
    }
  }
  return(list(signatures = colnames(signatures)[min_indices],
              exposures = min_exposures,
              error = min_error))
}

ShixiangWang commented 4 years ago

/remind me to design and add valuable thoughts in sigMa to sigminer in a week

reminders[bot] commented 4 years ago

@ShixiangWang set a reminder for Sep 1st 2020

ShixiangWang commented 4 years ago

@taoziyu97 参考 signatures 表格有做好吗？

taoziyu97 commented 4 years ago

@taoziyu97 参考 signatures 表格有做好吗？

按照不同数据来源（cosmic，nonPCAWG，PCAWG，TCGA）整理好了不同cancer type含有的signature。

ShixiangWang commented 4 years ago

@taoziyu97 你把这 4 个表放到一个列表下面，用 saveRDS() 保存为 rds 文件放到 https://github.com/ShixiangWang/sigminer/tree/master/inst/extdata 这个目录下，然后提交一个 PR。

reminders[bot] commented 4 years ago

:wave: @ShixiangWang, design and add valuable thoughts in sigMa to sigminer

ShixiangWang commented 3 years ago

/remind me to close this in 2 week

ShixiangWang commented 3 years ago

/remind me to close this in a week

ShixiangWang commented 3 years ago

目前感觉没有特别的意义

ShixiangWang / sigminer

更好地支持小突变数据？ #262