BMILAB / AStrap

AStrap: Identification of alternative splicing from transcript sequences without a reference genome
GNU General Public License v2.0
4 stars 4 forks source link

Issue with Input GMAP file #5

Open nikou123456 opened 4 years ago

nikou123456 commented 4 years ago

Hello,

I have some problems when I used function readGMAP, could you help me?

cluster.align <- readGMAP(gmap.path, raw.cluster, recluster = TRUE, recluster.identity = 0.7, recluster.coverage = 0.7)
Error in if (gmap.align$Alinum[m] > gmap.align$Alinum[k]) { :
  missing value where TRUE/FALSE needed

Best, Haimei Chen

BMILAB commented 4 years ago

Hello Haimei,

Thank you for your contacting. It seems that there has an issue with your input GMAP file with gff3 format. AStrap uses function "trimmGMA​P" to treat redundant sequence alignment and remove the invalid sequence alignments. You can run the code from "trimmGMA​P" (AStrap/R/AStrap.R) to figure out what's wrong with your GMAP document. You can also share your GMAP file to me if that 's work for you.

A typical GMAP file with gff3 looks like this:

##gff-version 3
# Generated by GMAP version 2016-06-09 using call: gmap.avx2 -D ./index -d index -t 10 --min-intronlength 9 -z sense_force --min-trimmed-coverage 0.7 --min-identity 0.7 -f 2 ./example_TRsequence.fasta
AMTR4 index gene 1 1028 . + . ID=AMTR4.path1;Name=AMTR4
AMTR4 index mRNA 1 1028 . + . ID=AMTR4.mrna1;Name=AMTR4;Parent=AMTR4.path1;coverage=100.0;identity=100.0;matches=1028;mismatches=0;indels=0;unknowns=0
AMTR4 index exon 1 1028 100 + . ID=AMTR4.mrna1.exon1;Name=AMTR4;Parent=AMTR4.mrna1;Target=AMTR4 1 1028 +
AMTR4 index CDS 166 690 100 + 0 ID=AMTR4.mrna1.cds1;Name=AMTR4;Parent=AMTR4.mrna1;Target=AMTR4 166 690 +

Should you have any further questions please do not hesitate to ask. Best, Wenbin Ye

Hello,

I have some problems when I used function readGMAP, could you help me?

cluster.align <- readGMAP(gmap.path, raw.cluster, recluster = TRUE, recluster.identity = 0.7, recluster.coverage = 0.7)
Error in if (gmap.align$Alinum[m] > gmap.align$Alinum[k]) { :
  missing value where TRUE/FALSE needed

Best, Haimei Chen

nikou123456 commented 4 years ago

Hello Wenbin,

Thanks a lot! I have fix the isssue when I used the gff3 file generated by GMAP version 2016-06-30 otherwise GMAP version 2020-06-30. The attachment file is my gff3 file generated by GMAP version 2020-06-30.

Best, Hiamei Chen

BMILAB commented 4 years ago

Hello Wenbin,

Thanks a lot! I have fix the isssue when I used the gff3 file generated by GMAP version 2016-06-30 otherwise GMAP version 2020-06-30. The attachment file is my gff3 file generated by GMAP version 2020-06-30.

Best, Hiamei Chen

Hi Hiamei Chen,

That's great to know the problem was caused by different GMAP version. Based on your GMAP file generated by 2020-06-30, I have modified related function, please see code below

#example redundant alignment
###
AMTR343 index   gene    186 1065    .   +   .   ID=AMTR1222.path2;Name=AMTR1222
AMTR343 index   mRNA    186 1065    .   +   .   ID=AMTR1222.mrna2;Name=AMTR1222;Parent=AMTR1222.path2;coverage=98.0;identity=99.9;matches=880;mismatches=0;indels=1;unknowns=0
AMTR343 index   exon    186 1065    99  +   .   ID=AMTR1222.mrna2.exon1;Name=AMTR1222;Parent=AMTR1222.mrna2;Target=AMTR1222 1 881 +
AMTR343 index   CDS 188 240 98  +   0   ID=AMTR1222.mrna2.cds1;Name=AMTR1222;Parent=AMTR1222.mrna2;Target=AMTR1222 3 56 +

###
AMTR1222    index   gene    1   881 .   +   .   ID=AMTR343.path2;Name=AMTR343
AMTR1222    index   mRNA    1   881 .   +   .   ID=AMTR343.mrna2;Name=AMTR343;Parent=AMTR343.path2;coverage=82.6;identity=99.9;matches=880;mismatches=0;indels=1;unknowns=0
AMTR1222    index   exon    1   881 99  +   .   ID=AMTR343.mrna2.exon1;Name=AMTR343;Parent=AMTR343.mrna2;Target=AMTR343 186 1065 +
AMTR1222    index   CDS 166 747 100 +   0   ID=AMTR343.mrna2.cds1;Name=AMTR343;Parent=AMTR343.mrna2;Target=AMTR343 350 931 +
###

Modified function trimmGMAP

gmap.file <- "./4CL_isoform_gmap_output_v2020.gff3"
trim.gmap <- trimmGMAP(gmap.file)
trimmGMAP <- function(gmap.file) {
  options(stringsAsFactors = F)
  gmap <- import.gff(gmap.file, format = "gff3",
                     feature.type = c("mRNA", "exon"))
  gmap <- as.data.frame(gmap)
  gmap$seqnames <- as.character(gmap$seqnames)
  gmap$type <- as.character(gmap$type)
  gmap$coverage <- as.numeric(gmap$coverage)
  gmap$identity <- as.numeric(gmap$identity)
  gmap$strand <- as.character(gmap$strand)
  gmap <- gmap[which(gmap$strand == "+"), ]
  gmap <- gmap[which(gmap$seqnames != gmap$Name), ]
  mRNA.id <- which(gmap$type == "mRNA")
  align <- gmap[mRNA.id, ]
  align$num <- 0
  align$Subject <- ""
  align$Query <- ""
  for (i in 1:length(mRNA.id)) {
    if (i == length(mRNA.id)) {
      end.idx <- nrow(gmap)
    } else {
      end.idx <- mRNA.id[i + 1] - 1
    }
    exon.num <- end.idx - mRNA.id[i]
    align$num[i] <- exon.num
    for (j in (mRNA.id[i] + 1):end.idx) {
      start <- str_extract(gmap$Target[j], "(?<=\\s)\\d+(?=\\s\\d+)")
      end <- str_extract(gmap$Target[j], "(?<=\\s)\\d+(?=\\s(\\+|\\.|\\-))")
      Query.align <- paste(start, end, sep = "-")
      align$Query[i] <- paste(align$Query[i], Query.align, sep = ":")

      Subject.align <- paste(gmap$start[j], gmap$end[j], sep = "-")
      align$Subject[i] <- paste(align$Subject[i], Subject.align, sep = ":")
    }
  }
  gmap.align <- data.frame(Sid = as.character(align$seqnames), Qid = align$Name,
                           Coverage = align$coverage, identity = align$identity,
                           Alinum = align$num, Salign = align$Subject,
                           Qalign = align$Query)
  gmap.align$NAME <- ""
  index1 <- gmap.align$Sid > gmap.align$Qid
  gmap.align$NAME[index1] <- paste(gmap.align$Sid[index1],
                                   gmap.align$Qid[index1], sep = "")
  index2 <- gmap.align$Sid < gmap.align$Qid
  gmap.align$NAME[index2] <- paste(gmap.align$Qid[index2],
                                   gmap.align$Sid[index2], sep = "")

  gmap.rep <- as.data.frame(table(gmap.align$NAME))
  colnames(gmap.rep) <- c("id", "fre")
  gmap.rep$id <- as.character(gmap.rep$id)
  gmap.rep$fre <- as.integer(gmap.rep$fre)
  one.name <- as.character(gmap.rep$id[which(gmap.rep$fre == 1)])
  index.one <- which((gmap.align$NAME) %in% one.name)
  dup.name <- as.character(gmap.rep$id[which(gmap.rep$fre > 1)])
  index.need <- c(index.one)

if(length(dup.name)) {
    for (i in 1:length(dup.name)) {
      index.tem <- which(gmap.align$NAME == dup.name[i])
      k <- index.tem[1]
      for (j in 2:length(index.tem)) {
        m <- index.tem[j]
        if (gmap.align$Alinum[m] > gmap.align$Alinum[k]) {
          k <- m
        } else if (gmap.align$Alinum[m] == gmap.align$Alinum[k]) {
          if (gmap.align$Coverage[m] > gmap.align$Coverage[k]) {
            k <- m
          }
        }
      }
      index.need <- c(index.need, k)
    }
    gmap.result <- gmap.align[index.need, ]
  } else{
    gmap.result <- gmap.align
  }

  return(gmap.result)
}

Best, Wenbin