# install development version of genetable package
# devtools::install_github("UW-GAC/genetable", ref = "develop")
library(genetable)
library(tidyverse)
# Input ----
# Location of the Gencode v19 annotation (gzipped GTF) to import.
path <- "/projects/topmed/downloaded_data/Gencode/v19/gencode.v19.annotation.gtf.gz"

# Fail fast with a clear message if the annotation file is not reachable,
# instead of discovering it inside the slow import step below.
if (!file.exists(path)) {
  stop("GTF file not found: ", path, call. = FALSE)
}

# Import ----
# Import the gtf file to a tidy data frame (a tibble).
# This is slow.
gtf <- import_gencode(path)

# Look at the imported tibble.
glimpse(gtf)

# Summarize the number of features by tag.
summarize_tag(gtf, tag = "basic")

# Filter ----
# Keep transcript features tagged "basic".
# NOTE(review): `basic_transcripts` is only used by the commented-out
# alternative below; it is kept so that alternative still works when enabled.
basic_transcripts <- filter_gencode(gtf, featurearg = "transcript", tagarg = "basic")

# Or keep features == "gene".
genes <- filter_gencode(gtf, featurearg = "gene")

# Boundaries ----
# Define the boundaries of the feature of interest, keyed on "gene_id".
# This is slow.
# Alternative: derive the bounds from the basic transcripts instead of genes.
# gene_bounds <- define_boundaries(basic_transcripts, "gene_id")
gene_bounds <- define_boundaries(genes, "gene_id")

# Check the resulting tibble for sanity.
glimpse(gene_bounds)

# Save ----
# Free-text note passed to save_to_file() via `notes`.
note <- 'This file includes starting and ending ranges for feature = "gene" in the gtf file.'

# Per the original author's note, save_to_file() automatically makes a file
# called feature_bounds_DATE.tsv.
save_to_file(gene_bounds, notes = note)