dad70505 / sapelo-training

0 stars 0 forks source link

R Computational Statistical Analysis + COMMANDS #6

Open dad70505 opened 6 years ago

dad70505 commented 6 years ago

R Computational Statistical Analysis + COMMANDS

getwd() ## Gets the present working directory that you are in setwd("C:/Users/Daniel/Documents/edgeR") ## sets the working directory to be output in a certain area of your files counts<-read.table("C:/Users/Daniel/Documents/RSEM outputs/ep.counts.matrix") WRONG counts<-("C:/Users/Daniel/Desktop/RSEM outputs/ep.counts.matrix") WRONG head(counts) LABELED counts<-read.table("C:/Users/Daniel/Desktop/RSEM outputs/ep.counts.matrix") ## assigns the varible counts to read the matrix found from ep.counts.matrix in a table head(counts) ## gives the first 6 data counts of your matrix tail(counts) ## gives the last 6 data counts of your matrix dim(counts) ## gives you the number of rows by columns (for our matrix: rows= specific genes, columns:specific transcriptome) design<-read.csv("C:/Users/Daniel/Downloads/design/csv") WRONG design<-read.csv("C:/Users/Daniel/Downloads/design.csv") ## Imports the design.csv (comma-separated file) that was made in excel to classify the samples(i.e x10A, X10A, etc.) vs groups (i.e active.lo, active.eye,etc.) and it called design. design ## calls the variable design to show contents of the variable install.packages("edgeR") ## downloads the edgeR package? source("https://bioconductor.org/biocLite.R") ##?? biocLite("edgeR") ##?? groups<-design$Groups ## renames the column in the design.csv to just the name groups groups ## calls the variable groups to show the content of it library(edgeR) ##?? dge<-DGEList(counts, group=groups) ## sets variable dge to list the counts and their lib.size, and norm.factors help(DGEList) ## is a help page for using DGEList str(dge) ## States the parameters of the variable dge dge<-calcno WRONG
dge<-calcNormFactoes(dge) WRONG dge<-calcNormFactors(dge) ##?? str(dge) ls() ##lists contents of the present working directory head(counts) ##done class(dge) ## classifies the class (i.e matrix, data.frame) class(counts) ## DONE norm<-cpm(dge, normalize.lib.sizes=TRUE) ## normalizes the sequencing data to compare data and use in plots PLOTmds(norm) WRONG plotMDS(norm) ## makes a simple component graph des<-model.matrix(~groups) ##makes a matrix that assigns a 1 or 0 to the group that are similar to the intercept dge<-estimateDisp(dge, des) ## has a class called "DGEList" and the estimation of dispersion compared to the dge and des matrices et<-exactTest(dge, pair=c("active.eye", "inact.eye")) ## simulates a t-test in statistics and is used for comparative analysis tags<-topTags(et) ##? tags ## calls the variable tags to display the contents tags<-topTags(et, n=1000) ## this makes the number of genes that are shown to become 1000 head(tags) ## shows the first 6 genes in the tags variable write.table(file="tags_test.eyes.txt", tags) ## makes a table to output to a txt files that will be used to export into a excel file and analyze the genes and their significant differences between two libraries. tags<-topTags(et, n=1000) ## modifies the variable tags to have 1000 genes from the et test ......................................................................................................................................................+--++................................... etALO.IALO<-exactTest(dge, pair=c("active.lo", "inact.lo") ## etALO.IALO is the variable name that will compare the genes from the libraries of active.lo and inact.lo using the exact test tags<-topTags(et, n=1000) tags<-topTags(et, n=1000) tags<-topTags(etALO.IALO, n=1000) ls() etaloilo<-exactTest(dge, pair=c("active.lo", "inact.lo") etaloilo<-exactTest(dge, pair=c("active.lo", "inact.lo")) etaloilo<-exactTest(dge, pair=c("active.lo", "inact.lo")) tags<-topTags(etaloilo, n=1000) write.table(file="tags_test.lo.txt", tags) etaloae<-exactTest(dge,pair=c("active.lo", "inact.lo")) tags<-topTags(etaloae, n=1000) write.table(file="tags_test.elo.txt", tags)

x <- read.delim("TableOfCounts.txt",row.names="Symbol") group <- factor(c(1,1,2,2)) y <- DGEList(counts=x,group=group) y <- calcNormFactors(y) design <- model.matrix(~group) y <- estimateDisp(y,design)

The process to analyze the genes yielded:

  1. look into edgeR file on the desktop
  2. make sure the tags_test.*.txt is available
  3. open excel and open and browse under ALL FILES to find the specific file
  4. on A1 insert a cell that shifts to the right and have correct group head names
  5. Highlight/bold the columns that have an FDR value less than 0.05
  6. open new excel and place all these data points into there and manually analyze the specific genes that have shown a significant difference.

Run_blast.sh run_blast.zip

CLEANEST version of R comparison of Active vs. Inactive

getwd() setwd("C:/Users/Daniel/Desktop/Routputs") counts<-read.table("C:/Users/Daniel/Desktop/RSEM outputs/ep.counts.matrix") design<-read.csv("C:/Users/Daniel/Desktop/ModelsForGlm.csv") library("egdeR") ## this is install.packages("edgeR") source("https://bioconductor.org/biocLite.R") biocLite("edgeR") keeps<-c("X10A", "X10B", "X19A", "X19B", "X20A", "X20B", "X21A", "X21B", "X8A", "X8B", "X9A", "X9B") noshield<-counts[,colnames(counts) %in% keeps] keeps<-c("X10A", "X10B", "X19A", "X19B", "X20A", "X20B", "X21A", "X21B", "X8A", "X8B", "X9A", "X9B") noshield<-counts[,colnames(counts) %in% keeps] groups<-design$group groups dge<-DGEList(noshield, group=groups) str(dge) dge<-calcNormFactors(dge) str(dge) head(noshield) ##optional norm<-cpm(dge, normalize.lib.sizes=TRUE) plotMDS(norm) des<-model.matrix(~groups) dge<-estimateDisp(dge, des)

et<-exactTest(dge, pair=c("EA", "EI")) ## groups are EA, EI, LA, LI tags<-topTags(et) tags tags<-topTags(et, n=1000) write.table(file="testEA-EI.txt", tags) tags<-topTags(et, n=1000)

et2<-exactTest(dge, pair=c("LA", "LI")) ## groups are EA, EI, LA, LI tags<-topTags(et2) tags tags<-topTags(et2, n=1000) write.table(file="testLA-LI.txt", tags) tags<-topTags(et2, n=1000)

CONVERTING TO SVG

svg("Carnation1.svg")

FINDING MAX/MIN:

setwd("C:/Users/Daniel/Desktop/RSEM outputs/") data<-read.table("ep.TMM.EXPR.matrix")

design<-read.csv("groups.csv") gene1<-data["PPYR_10486", ] master<-as.data.frame(t(gene1)) master$tissue<-design$tissue master$active<-design$active colnames(master)<-c("expression", "tissue", "active") library(plyr) Warning message: package ‘plyr’ was built under R version 3.4.4 library(ggplot2) max(summary$mean) Error in summary$mean : object of type 'closure' is not subsettable summary<-ddply(master, c("tissue", "active"), summarise, mean=mean(expression), sd=sd(expression), sem=sd(expression)/sqrt(length(expression))) max(summary$mean) [1] 14.004

Do this for gene 2 and then for each one go back and add the + scale_y_continuous(limits=(0,24))

The testLO + Eyes.txt files that are not including the shield tissue and is a matrix

getwd() setwd("C:/Users/Daniel/Desktop/Routputs") counts<-read.table("C:/Users/Daniel/Desktop/RSEM outputs/ep.counts.matrix") design<-read.csv("C:/Users/Daniel/Desktop/ModelsForGlm.csv") library("egdeR") ## this is install.packages("edgeR") source("https://bioconductor.org/biocLite.R") biocLite("edgeR") keeps<-c("X10A", "X10B", "X19A", "X19B", "X20A", "X20B", "X21A", "X21B", "X8A", "X8B", "X9A", "X9B") noshield<-counts[,colnames(counts) %in% keeps] keeps<-c("X10A", "X10B", "X19A", "X19B", "X20A", "X20B", "X21A", "X21B", "X8A", "X8B", "X9A", "X9B") noshield<-counts[,colnames(counts) %in% keeps] groups<-design$group groups dge<-DGEList(noshield, group=groups) str(dge) dge<-calcNormFactors(dge) str(dge) head(noshield) ##optional norm<-cpm(dge, normalize.lib.sizes=TRUE) plotMDS(norm) des<-model.matrix(~groups) dge<-estimateDisp(dge, des)

et<-exactTest(dge, pair=c("EA", "EI")) ## groups are EA, EI, LA, LI tags<-topTags(et) tags tags<-topTags(et, n=1000) write.table(file="testEA-EI.txt", tags) tags<-topTags(et, n=1000)

et2<-exactTest(dge, pair=c("LA", "LI")) ## groups are EA, EI, LA, LI tags<-topTags(et2) tags tags<-topTags(et2, n=1000) write.table(file="testLA-LI.txt", tags) tags<-topTags(et2, n=1000)

save.image("C:\Users\Daniel\Desktop\Rwithnoshield")

CHANGING THE Y SCALE:

gene1<-data["PPYR_02799", ] master<-as.data.frame(t(gene1)) master$tissue<-design$tissue master$active<-design$active colnames(master)<-c("expression", "tissue", "active") library(plyr) library(ggplot2) summary<-ddply(master, c("tissue", "active"), summarise, mean=mean(expression), sd=sd(expression), sem=sd(expression)/sqrt(length(expression))) ggplot(summary, aes(x=tissue, y=mean, fill=as.factor(active)))+geom_bar(stat="identity", position=position_dodge())+geom_errorbar(aes(ymin=mean-sem, ymax=mean+sem), position=position_dodge(0.9), width=0.4)+ labs(title="PPYR_02799(Carnation2)") + theme(plot.title = element_text(hjust = 0.5)) +scale_y_continuous(limits=c(0,24)) Warning message: Removed 1 rows containing missing values (geom_errorbar). svg("Carnation2.svg") ggplot(summary, aes(x=tissue, y=mean, fill=as.factor(active)))+geom_bar(stat="identity", position=position_dodge())+geom_errorbar(aes(ymin=mean-sem, ymax=mean+sem), position=position_dodge(0.9), width=0.4)+ labs(title="PPYR_02799(Carnation2)") + theme(plot.title = element_text(hjust = 0.5)) +scale_y_continuous(limits=c(0,24)) Warning message: Removed 1 rows containing missing values (geom_errorbar). dev.off()