Open nikou123456 opened 4 years ago
Hello Haimei,
Thank you for your contacting. It seems that there has an issue with your input GMAP file with gff3 format. AStrap uses function "trimmGMAP" to treat redundant sequence alignment and remove the invalid sequence alignments. You can run the code from "trimmGMAP" (AStrap/R/AStrap.R) to figure out what's wrong with your GMAP document. You can also share your GMAP file to me if that 's work for you.
A typical GMAP file with gff3 looks like this:
##gff-version 3
# Generated by GMAP version 2016-06-09 using call: gmap.avx2 -D ./index -d index -t 10 --min-intronlength 9 -z sense_force --min-trimmed-coverage 0.7 --min-identity 0.7 -f 2 ./example_TRsequence.fasta
AMTR4 index gene 1 1028 . + . ID=AMTR4.path1;Name=AMTR4
AMTR4 index mRNA 1 1028 . + . ID=AMTR4.mrna1;Name=AMTR4;Parent=AMTR4.path1;coverage=100.0;identity=100.0;matches=1028;mismatches=0;indels=0;unknowns=0
AMTR4 index exon 1 1028 100 + . ID=AMTR4.mrna1.exon1;Name=AMTR4;Parent=AMTR4.mrna1;Target=AMTR4 1 1028 +
AMTR4 index CDS 166 690 100 + 0 ID=AMTR4.mrna1.cds1;Name=AMTR4;Parent=AMTR4.mrna1;Target=AMTR4 166 690 +
Should you have any further questions please do not hesitate to ask. Best, Wenbin Ye
Hello,
I have some problems when I used function readGMAP, could you help me?
cluster.align <- readGMAP(gmap.path, raw.cluster, recluster = TRUE, recluster.identity = 0.7, recluster.coverage = 0.7)
Error in if (gmap.align$Alinum[m] > gmap.align$Alinum[k]) { : missing value where TRUE/FALSE needed
Best, Haimei Chen
Hello Wenbin,
Thanks a lot! I have fix the isssue when I used the gff3 file generated by GMAP version 2016-06-30 otherwise GMAP version 2020-06-30. The attachment file is my gff3 file generated by GMAP version 2020-06-30.
Best, Hiamei Chen
Hello Wenbin,
Thanks a lot! I have fix the isssue when I used the gff3 file generated by GMAP version 2016-06-30 otherwise GMAP version 2020-06-30. The attachment file is my gff3 file generated by GMAP version 2020-06-30.
Best, Hiamei Chen
Hi Hiamei Chen,
That's great to know the problem was caused by different GMAP version. Based on your GMAP file generated by 2020-06-30, I have modified related function, please see code below
#example redundant alignment
###
AMTR343 index gene 186 1065 . + . ID=AMTR1222.path2;Name=AMTR1222
AMTR343 index mRNA 186 1065 . + . ID=AMTR1222.mrna2;Name=AMTR1222;Parent=AMTR1222.path2;coverage=98.0;identity=99.9;matches=880;mismatches=0;indels=1;unknowns=0
AMTR343 index exon 186 1065 99 + . ID=AMTR1222.mrna2.exon1;Name=AMTR1222;Parent=AMTR1222.mrna2;Target=AMTR1222 1 881 +
AMTR343 index CDS 188 240 98 + 0 ID=AMTR1222.mrna2.cds1;Name=AMTR1222;Parent=AMTR1222.mrna2;Target=AMTR1222 3 56 +
###
AMTR1222 index gene 1 881 . + . ID=AMTR343.path2;Name=AMTR343
AMTR1222 index mRNA 1 881 . + . ID=AMTR343.mrna2;Name=AMTR343;Parent=AMTR343.path2;coverage=82.6;identity=99.9;matches=880;mismatches=0;indels=1;unknowns=0
AMTR1222 index exon 1 881 99 + . ID=AMTR343.mrna2.exon1;Name=AMTR343;Parent=AMTR343.mrna2;Target=AMTR343 186 1065 +
AMTR1222 index CDS 166 747 100 + 0 ID=AMTR343.mrna2.cds1;Name=AMTR343;Parent=AMTR343.mrna2;Target=AMTR343 350 931 +
###
Modified function trimmGMAP
gmap.file <- "./4CL_isoform_gmap_output_v2020.gff3"
trim.gmap <- trimmGMAP(gmap.file)
trimmGMAP <- function(gmap.file) {
options(stringsAsFactors = F)
gmap <- import.gff(gmap.file, format = "gff3",
feature.type = c("mRNA", "exon"))
gmap <- as.data.frame(gmap)
gmap$seqnames <- as.character(gmap$seqnames)
gmap$type <- as.character(gmap$type)
gmap$coverage <- as.numeric(gmap$coverage)
gmap$identity <- as.numeric(gmap$identity)
gmap$strand <- as.character(gmap$strand)
gmap <- gmap[which(gmap$strand == "+"), ]
gmap <- gmap[which(gmap$seqnames != gmap$Name), ]
mRNA.id <- which(gmap$type == "mRNA")
align <- gmap[mRNA.id, ]
align$num <- 0
align$Subject <- ""
align$Query <- ""
for (i in 1:length(mRNA.id)) {
if (i == length(mRNA.id)) {
end.idx <- nrow(gmap)
} else {
end.idx <- mRNA.id[i + 1] - 1
}
exon.num <- end.idx - mRNA.id[i]
align$num[i] <- exon.num
for (j in (mRNA.id[i] + 1):end.idx) {
start <- str_extract(gmap$Target[j], "(?<=\\s)\\d+(?=\\s\\d+)")
end <- str_extract(gmap$Target[j], "(?<=\\s)\\d+(?=\\s(\\+|\\.|\\-))")
Query.align <- paste(start, end, sep = "-")
align$Query[i] <- paste(align$Query[i], Query.align, sep = ":")
Subject.align <- paste(gmap$start[j], gmap$end[j], sep = "-")
align$Subject[i] <- paste(align$Subject[i], Subject.align, sep = ":")
}
}
gmap.align <- data.frame(Sid = as.character(align$seqnames), Qid = align$Name,
Coverage = align$coverage, identity = align$identity,
Alinum = align$num, Salign = align$Subject,
Qalign = align$Query)
gmap.align$NAME <- ""
index1 <- gmap.align$Sid > gmap.align$Qid
gmap.align$NAME[index1] <- paste(gmap.align$Sid[index1],
gmap.align$Qid[index1], sep = "")
index2 <- gmap.align$Sid < gmap.align$Qid
gmap.align$NAME[index2] <- paste(gmap.align$Qid[index2],
gmap.align$Sid[index2], sep = "")
gmap.rep <- as.data.frame(table(gmap.align$NAME))
colnames(gmap.rep) <- c("id", "fre")
gmap.rep$id <- as.character(gmap.rep$id)
gmap.rep$fre <- as.integer(gmap.rep$fre)
one.name <- as.character(gmap.rep$id[which(gmap.rep$fre == 1)])
index.one <- which((gmap.align$NAME) %in% one.name)
dup.name <- as.character(gmap.rep$id[which(gmap.rep$fre > 1)])
index.need <- c(index.one)
if(length(dup.name)) {
for (i in 1:length(dup.name)) {
index.tem <- which(gmap.align$NAME == dup.name[i])
k <- index.tem[1]
for (j in 2:length(index.tem)) {
m <- index.tem[j]
if (gmap.align$Alinum[m] > gmap.align$Alinum[k]) {
k <- m
} else if (gmap.align$Alinum[m] == gmap.align$Alinum[k]) {
if (gmap.align$Coverage[m] > gmap.align$Coverage[k]) {
k <- m
}
}
}
index.need <- c(index.need, k)
}
gmap.result <- gmap.align[index.need, ]
} else{
gmap.result <- gmap.align
}
return(gmap.result)
}
Best, Wenbin
Hello,
I have some problems when I used function readGMAP, could you help me?
Best, Haimei Chen