Closed ShixiangWang closed 3 years ago
NNLS, COSINE 是找到最相似的 signature,NNLS 是计算 exposure?结果值是什么样的?怎么合并,如 SigMA score?
怎么降采样,怎么评估不同采样下的效果?
怎么广泛应用到不同的 cancer 不同的 signatures 中去?
#' Calculates likelihood of the genome with respect to the available
#' signature probability distributions
#'
#' @param spectrum is the mutational spectrum
#' @param signatures is the reference signature catalog with the
#' probability distributions
#' @param counts is the number of cases in each cluster that is
#' represented in the catalog. They are used as weights for each
#' signature in the catalog
#' @param normalize is true by default only for when it is used
#' together with NNLS in the match_to_catalog function it is not
#' normalized here but outside of the function
calc_llh <- function(spectrum,
signatures,
counts = NULL,
normalize = T){
eps <- 0.00001
probs <- apply(signatures, 2,
function(x, pow){
if(sum(is.na(x)) > 0) return(0)
prob <- 0
for(i in 1:length(x)){
if(x[[i]] == 0) x[[i]] <- eps
if(pow[[i]] > 0){
prob <- prob + pow[[i]] * log(x[[i]])
}
}
return(prob)
},
pow = spectrum)
if(normalize){ # normalizes the total prob for all clusters to be 1
if(is.null(counts) | length(counts) != dim(signatures)[[2]]){
mean_probs <- mean(probs)
q1_probs <- mean(probs[probs < mean(probs)])
max_probs <- max(probs)
# often having too small values leads to infinities so
# only the probabilities in the first quartile are kept
# to be nonzero
inds_keep <- which(probs >= q1_probs)
inds_rm <- which(probs < q1_probs)
probs[inds_rm] <- 0
probs[inds_keep] <- exp(probs[inds_keep] - max_probs)
probs[inds_keep] <- probs[inds_keep]/sum(probs[inds_keep])
}
else{
mean_probs <- mean(probs)
q1_probs <- mean(probs[probs < mean(probs)])
max_probs <- max(probs)
# often having too small values leads to infinities so
# only the probabilities in the first quartile are kept
# to be nonzero
inds_keep <- which(probs >= q1_probs)
inds_rm <- which(probs < q1_probs)
probs[inds_rm] <- 0
probs[inds_keep] <- exp(probs[inds_keep] - max_probs)
probs <- as.numeric(unlist(probs))*as.numeric(unlist(counts))
probs[inds_keep] <- probs[inds_keep]/sum(probs[inds_keep])
}
}
ind_max <- which(max(probs) == probs)
# signature with highest likelihood
if(length(ind_max) > 0) ind_max <- ind_max[[1]]
sig_max <- colnames(signatures)[[ind_max]]
max_val <- probs[[ind_max]]
names(probs) <- colnames(signatures)
return(list(probs = probs, ind_max = ind_max, sig_max = sig_max, max_val = max_val))
}
#' Decomposes the mutational spectrum of a genome in terms of
#' tumor type specific signatures that were calculated through analysis
#' of public WGS samples from ICGC and TCGA consortia, and contained
#' as a list in the package. Non-negative-least squares algorithm
#' is used and the number of signatures to be considered in the
#' decomposition is increased gradually, first all pairs from among the
#' available signatures are considered and minimal error pair is kept.
#' Then all 3-signature combinations, 4-signature combinations and so on
#' are considered. The result is updated if the error is smaller with
#' larger number of signatures
#'
#' @param spect composite spectrum that is being decomposed
#' @param signatures a data.frame that contains the signatures in its
#' columns
#' @param data sequencing platform that as in run(), used for setting
#' the maximum number of signatures that is allowed in the decomposition
decompose <- function(spect, signatures, data, nloop_user = NULL, delta=0){
# calculates frobenius error
error <- function(spect, signatures, exposures){
reco <- (as.matrix(signatures) %*% exposures)
error_frac <- sqrt(sum(diag((spect - reco) %*% t(spect - reco))))/sqrt(sum(diag(spect %*% t(spect))))
return(error_frac)
}
dim2 <- dim(signatures)[[2]]
if(dim2 == 2){
exps <- coef(nnls::nnls(as.matrix(signatures), spect))
inds <- which(exps != 0)
exps <- exps[inds]
sigs <- colnames(signatures)[inds]
error <- error(spect, signatures[, sigs], exps)
return(list(signatures = sigs,
exposures = exps,
error = error))
}
min_error <- 10000000
min_indices <- NULL
min_exposures <- NULL
if(!is.null(nloop_user)) nloop <- min(dim(signatures)[[2]], nloop_user)
else{
nloop <- min(dim(signatures)[[2]], 8)
# increasing nloop to too large values causes overfitting
# maximum 5 signatures are considered for panels and WES
if(data == "wgs_pancan"){
nloop <- 6
}
else if(data == "tcga_mc3"){
if(dim(signatures)[[2]] > 10){
nloop <- 8
}
else{
nloop <- min(dim(signatures)[[2]], 7)
}
}
else if(data == "seqcap" | data == "seqcap_probe"){
nloop <- min(dim(signatures)[[2]], 5)
}
else if(data == "msk" | data == "fo" | data == "op")
nloop <- min(nloop, 5)
}
if(nloop < 2)
stop('linear decomposition requires at least 2 signatures')
for(i in 1:(dim2 - nloop + 1)){
for(j in (i+1):(dim2 - nloop + 2)){
if(nloop >= 3){
for(k in (j+1):(dim2 - nloop + 3)){
if(nloop >= 4){
for(l in (k+1):(dim2 - nloop + 4)){
if(nloop >= 5){
for(m in (l+1):(dim2 - nloop + 5)){
if(nloop >= 6){
for(n in (m+1):(dim2 - nloop + 6)){
if(nloop >= 7){
for(o in (n+1):(dim2 - nloop + 7)){
if(nloop >= 8){
for(p in (o+1):(dim2 - nloop + 8)){
if(nloop >= 9){
for(r in (p+1):(dim2 - nloop + 9)){
exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k, l, m, n, o, p, r)]), spect))
error_this <- error(spect, as.matrix(signatures[, c(i, j, k, l, m, n, o, p, r)]), exposures)
if(error_this < min_error - delta){
min_error <- error_this
min_indices <- c(i, j, k, l, m, n, o, p, r)
min_exposures <- exposures
}
}
}
else{
exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k, l, m, n, o, p)]), spect))
error_this <- error(spect, as.matrix(signatures[, c(i, j, k, l, m, n, o, p)]), exposures)
if(error_this < min_error - delta){
min_error <- error_this
min_indices <- c(i, j, k, l, m, n, o, p)
min_exposures <- exposures
}
}
}
}
else{
exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k, l, m, n, o)]), spect))
error_this <- error(spect, as.matrix(signatures[, c(i, j, k, l, m, n, o)]), exposures)
if(error_this < min_error - delta){
min_error <- error_this
min_indices <- c(i, j, k, l, m, n, o)
min_exposures <- exposures
}
}
}
}else{
exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k, l, m, n)]), spect))
error_this <- error(spect, as.matrix(signatures[, c(i, j, k, l, m, n)]), exposures)
if(error_this < min_error - delta){
min_error <- error_this
min_indices <- c(i, j, k, l, m, n)
min_exposures <- exposures
}
}
}
}else{
exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k, l, m)]), spect))
error_this <- error(spect, as.matrix(signatures[, c(i, j, k, l, m)]), exposures)
if(error_this < min_error - delta){
min_error <- error_this
min_indices <- c(i, j, k, l, m)
min_exposures <- exposures
}
}
}
}else{
exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k, l)]), spect))
error_this <- error(spect, as.matrix(signatures[, c(i, j, k, l)]), exposures)
if(error_this < min_error - delta){
min_error <- error_this
min_indices <- c(i, j, k, l)
min_exposures <- exposures
}
}
}
}else{
exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j, k)]), spect))
error_this <- error(spect, as.matrix(signatures[, c(i, j, k)]), exposures)
if(error_this < min_error - delta){
min_error <- error_this
min_indices <- c(i, j, k)
min_exposures <- exposures
}
}
}
}else{
exposures <- coef(nnls::nnls(as.matrix(signatures[,c(i, j)]), spect))
error_this <- error(spect, as.matrix(signatures[, c(i, j)]), exposures)
if(error_this < min_error - delta){
min_error <- error_this
min_indices <- c(i, j)
min_exposures <- exposures
}
}
}
}
return(list(signatures = colnames(signatures)[min_indices],
exposures = min_exposures,
error = min_error))
}
/remind me to design and add valuable thoughts in sigMa to sigminer in a week
@ShixiangWang set a reminder for Sep 1st 2020
@taoziyu97 参考 signatures 表格有做好吗?
@taoziyu97 参考 signatures 表格有做好吗?
按照不同数据来源(cosmic,nonPCAWG,PCAWG,TCGA)整理好了不同cancer type含有的signature。
@taoziyu97 你把这 4 个表放到一个列表下面,用 saveRDS()
保存为 rds
文件放到 https://github.com/ShixiangWang/sigminer/tree/master/inst/extdata 这个目录下,然后提交一个 PR。
:wave: @ShixiangWang, design and add valuable thoughts in sigMa to sigminer
/remind me to close this in 2 week
/remind me to close this in a week
目前感觉没有特别的意义
纳入或整合 SigMA 一些算法实现。
@lihm1