## Run R script using $R CMD BATCH Mya_differential_gene_expression_analysis.R,
## best to run on high-spec workstation!
## Data file "vs_ex_11_matrix.counts_outliers_removed" available in
## Supplementary File 2_countmatrix
##
## Data description:
# Sample ID      Treatment        Library   Time point
# Vs ex11 1.1C   1 DAY CONTROL    LIB20596  1
# Vs ex11 1.3C   1 DAY CONTROL    LIB20598  1
# Vs ex11 1.1D   1 DAY DAMAGED    LIB20599  1
# Vs ex11 1.2D   1 DAY DAMAGED    LIB20600  1
# Vs ex11 1.4D   1 DAY DAMAGED    LIB20601  1
# Vs ex11 2.1C   3 DAYS CONTROL   LIB20602  2
# Vs ex11 2.2C   3 DAYS CONTROL   LIB20603  2
# Vs ex11 2.3C   3 DAYS CONTROL   LIB20604  2
# Vs ex11 2.1D   3 DAYS DAMAGED   LIB20605  2
# Vs ex11 2.2D   3 DAYS DAMAGED   LIB20606  2
# Vs ex11 2.4D   3 DAYS DAMAGED   LIB20607  2
# Vs ex11 3.1C   5 DAYS CONTROL   LIB20608  3
# Vs ex11 3.4C   5 DAYS CONTROL   LIB20610  3
# Vs ex11 3.1D   5 DAYS DAMAGED   LIB20611  3
# Vs ex11 3.4D   5 DAYS DAMAGED   LIB20613  3
# Vs ex11 4.1C   1 WEEK CONTROL   LIB20614  4
# Vs ex11 4.2C   1 WEEK CONTROL   LIB20615  4
# Vs ex11 4.3C   1 WEEK CONTROL   LIB20616  4
# Vs ex11 4.1D   1 WEEK DAMAGED   LIB20617  4
# Vs ex11 4.2D   1 WEEK DAMAGED   LIB20618  4
# Vs ex11 4.3D   1 WEEK DAMAGED   LIB20619  4
# Vs ex11 5.2C   2 WEEKS CONTROL  LIB20620  5
# Vs ex11 5.3C   2 WEEKS CONTROL  LIB20621  5
# Vs ex11 5.4C   2 WEEKS CONTROL  LIB20622  5
# Vs ex11 5.2D   2 WEEKS DAMAGED  LIB20623  5
# Vs ex11 5.3D   2 WEEKS DAMAGED  LIB20624  5
# Vs ex11 5.4D   2 WEEKS DAMAGED  LIB20625  5

library(edgeR)
library(locfit)

# READ IN THE DATA, WHICH IS A FILE IN THE CURRENT WORKING DIRECTORY. IT IS RAW
# GENE COUNTS CALCULATED BY RSEM WITH 3 OUTLYING LIBRARIES REMOVED
# (LIB20597 [1 d control], LIB20608 [5 d control] + LIB20612 [5 d damaged]).
# NOTE(review): the sample table above still lists LIB20608 and omits LIB20609,
# so the removed 5 d control library is presumably LIB20609 - confirm.
mya_adult_rawdata <- read.delim(
  "vs_ex_11_matrix.counts_outliers_removed",
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# MAKE DGE LIST: column 1 is passed as the gene annotation, columns 2-28 are
# passed as the 27 count columns (one per library).
y <- DGEList(
  counts = mya_adult_rawdata[, 2:28],
  genes = mya_adult_rawdata[, 1]
)

# NORMALISE THE DATA
y$samples$lib.size <- colSums(y$counts)
y <- calcNormFactors(y)
y$samples

# FILTER THE DATA TO REMOVE VERY LOW COUNTS:
# keep genes with CPM > 2 in at least 3 libraries.
keep <- rowSums(cpm(y) > 2) >= 3
y <- y[keep, , keep.lib.sizes = FALSE]

# RE-NORMALISE FOR NEW FILTERED LIBRARY (PROBABLY NEGLIGIBLE BUT GOOD PRACTICE)
y$samples$lib.size <- colSums(y$counts)
y <- calcNormFactors(y)

# SET THE BASIC DESIGN. NB absolute outliers as per outlier analysis have been
# removed, but note time step 3 (= 5 days) is still included. Although the
# replication of n=2 is no longer enough to carry out statistical analysis on
# this time point, we have kept it in the dataframe to inform the estimate of
# dispersion only.
# The order of both vectors must match the column order of the count matrix.
Treatment <- factor(c(
  "Control", "Control", "Damaged", "Damaged", "Damaged",
  "Control", "Control", "Control", "Damaged", "Damaged", "Damaged",
  "Control", "Control", "Damaged", "Damaged",
  "Control", "Control", "Control", "Damaged", "Damaged", "Damaged",
  "Control", "Control", "Control", "Damaged", "Damaged", "Damaged"
))
Time <- factor(c(
  1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2,
  3, 3, 3, 3,
  4, 4, 4, 4, 4, 4,
  5, 5, 5, 5, 5, 5
))

# Fail early if the hand-typed design does not line up with the libraries,
# instead of relying on data.frame() recycling rules.
stopifnot(
  length(Treatment) == ncol(y),
  length(Time) == ncol(y)
)

# SET THE DATAFRAME (printed to the log)
data.frame(Sample = colnames(y), Treatment, Time)

# MAKE THE DATAFRAME A SET VARIABLE JUST SO CAN PRINT IT AT THE END
DATAFRAME1 <- data.frame(Sample = colnames(y), Treatment, Time)

# Build Groups column: one level per Treatment x Time combination,
# e.g. "Control.1", "Damaged.5".
Group <- factor(paste(DATAFRAME1$Treatment, DATAFRAME1$Time, sep = "."))
cbind(DATAFRAME1, Group = Group)

# DEFINE THE MODEL DESIGN (no intercept: one coefficient per group)
Design <- model.matrix(~ 0 + Group, data = DATAFRAME1)
colnames(Design) <- levels(Group)

# ESTIMATE "ROBUST" DISPERSION
RobustDesigny <- estimateGLMRobustDisp(y, Design)

# PLOT DISPERSION
pdf("Mya_filtered_Robust_BCV.pdf", width = 5, height = 5)
plotBCV(RobustDesigny)
dev.off()

# FIT THE MODEL USING makeContrasts
RobustFit <- glmFit(RobustDesigny, Design)

# CvD is the SUM of the four per-time-point Damaged - Control differences;
# time point 3 is deliberately left out of every contrast.
# NOTE(review): because CvD is a sum rather than a mean, its reported logFC is
# presumably four times the average effect - confirm this is intended.
my.contrasts <- makeContrasts(
  CvD = (((Damaged.1) + (Damaged.2) + (Damaged.4) + (Damaged.5)) -
    ((Control.1) + (Control.2) + (Control.4) + (Control.5))),
  Time1.CvsD = Damaged.1 - Control.1,
  Time2.CvsD = Damaged.2 - Control.2,
  Time4.CvsD = Damaged.4 - Control.4,
  Time5.CvsD = Damaged.5 - Control.5,
  levels = Design
)

# Response to damage, time 1
CvD_time1_lrt <- glmLRT(RobustFit, contrast = my.contrasts[, "Time1.CvsD"])

# Response to damage, time 2
CvD_time2_lrt <- glmLRT(RobustFit, contrast = my.contrasts[, "Time2.CvsD"])

# Response to damage, time 4
CvD_time4_lrt <- glmLRT(RobustFit, contrast = my.contrasts[, "Time4.CvsD"])

# Response to damage, time 5
CvD_time5_lrt <- glmLRT(RobustFit, contrast = my.contrasts[, "Time5.CvsD"])

# Time independent genes (summed contrast across time points 1, 2, 4 and 5)
CvD_time_independent_lrt <- glmLRT(RobustFit, contrast = my.contrasts[, "CvD"])

# Time dependent genes: the four per-time-point contrasts are passed together
# as a matrix.
Time_dependant_lrt <- glmLRT(
  RobustFit,
  contrast = my.contrasts[
    , c("Time1.CvsD", "Time2.CvsD", "Time4.CvsD", "Time5.CvsD")
  ]
)

# Make top tags (n = NULL is passed so the tables are not cut to a top-n)
toptags_CvD_time1_lrt <- topTags(CvD_time1_lrt, n = NULL)
toptags_CvD_time2_lrt <- topTags(CvD_time2_lrt, n = NULL)
toptags_CvD_time4_lrt <- topTags(CvD_time4_lrt, n = NULL)
toptags_CvD_time5_lrt <- topTags(CvD_time5_lrt, n = NULL)
toptags_CvD_time_independent_lrt <- topTags(CvD_time_independent_lrt, n = NULL)
toptags_Time_dependant_lrt <- topTags(Time_dependant_lrt, n = NULL)

# Write the toptags files (file names kept exactly as downstream steps may
# depend on them, including the original spelling).
write.table(toptags_CvD_time1_lrt,
  file = "mya_edgeR_CvD_time1", row.names = FALSE, col.names = TRUE
)
write.table(toptags_CvD_time2_lrt,
  file = "mya_edgeR_CvD_time2", row.names = FALSE, col.names = TRUE
)
write.table(toptags_CvD_time4_lrt,
  file = "mya_edgeR_CvD_time4", row.names = FALSE, col.names = TRUE
)
write.table(toptags_CvD_time5_lrt,
  file = "mya_edgeR_CvD_time5", row.names = FALSE, col.names = TRUE
)
write.table(toptags_CvD_time_independent_lrt,
  file = "mya_edgeR_CvD_time_independant", row.names = FALSE, col.names = TRUE
)
write.table(toptags_Time_dependant_lrt,
  file = "mya_edgeR_CvD_time_dependant", row.names = FALSE, col.names = TRUE
)

# DGE objects for plotting and summary: classify every gene for each
# comparison at BH-adjusted p < 0.05.
CvD_time1_0.05 <- decideTestsDGE(CvD_time1_lrt, adjust.method = "BH", p.value = 0.05)
CvD_time2_0.05 <- decideTestsDGE(CvD_time2_lrt, adjust.method = "BH", p.value = 0.05)
CvD_time4_0.05 <- decideTestsDGE(CvD_time4_lrt, adjust.method = "BH", p.value = 0.05)
CvD_time5_0.05 <- decideTestsDGE(CvD_time5_lrt, adjust.method = "BH", p.value = 0.05)
CvD_0.05 <- decideTestsDGE(CvD_time_independent_lrt, adjust.method = "BH", p.value = 0.05)

# Print the summary for each DE object
summary(CvD_time1_0.05)
summary(CvD_time2_0.05)
summary(CvD_time4_0.05)
summary(CvD_time5_0.05)
summary(CvD_0.05)

# Row names of the significant genes for each time point, used to highlight
# them in the smear plots. These objects were referenced by plotSmear() but
# never created, which stopped the script with "object not found".
detags_CvD_time1_0.05 <- rownames(y)[as.logical(CvD_time1_0.05)]
detags_CvD_time2_0.05 <- rownames(y)[as.logical(CvD_time2_0.05)]
detags_CvD_time4_0.05 <- rownames(y)[as.logical(CvD_time4_0.05)]
detags_CvD_time5_0.05 <- rownames(y)[as.logical(CvD_time5_0.05)]

# Make smear plots of control vs damaged for each time point, with dashed
# horizontal guide lines drawn at -2 and 2 on the y axis.
pdf("1DAY_smear.pdf", width = 5, height = 5)
plotSmear(CvD_time1_lrt,
  de.tags = detags_CvD_time1_0.05, pch = 19, cex = 0.15, panel.first = NULL
)
abline(h = c(-2, 2), col = "lightskyblue4", lty = 5, lwd = 1)
dev.off()

pdf("3DAY_smear.pdf", width = 5, height = 5)
plotSmear(CvD_time2_lrt,
  de.tags = detags_CvD_time2_0.05, pch = 19, cex = 0.15, panel.first = NULL
)
abline(h = c(-2, 2), col = "lightskyblue4", lty = 5, lwd = 1)
dev.off()

pdf("7DAY_smear.pdf", width = 5, height = 5)
plotSmear(CvD_time4_lrt,
  de.tags = detags_CvD_time4_0.05, pch = 19, cex = 0.15, panel.first = NULL
)
abline(h = c(-2, 2), col = "lightskyblue4", lty = 5, lwd = 1)
dev.off()

pdf("DAY14_smear.pdf", width = 5, height = 5)
plotSmear(CvD_time5_lrt,
  de.tags = detags_CvD_time5_0.05, pch = 19, cex = 0.15, panel.first = NULL
)
abline(h = c(-2, 2), col = "lightskyblue4", lty = 5, lwd = 1)
dev.off()

# CHECK IT FINISHED BY MAKING THE DUMMY FILE
write.table(my.contrasts, file = "WORKED_yipee", row.names = FALSE, col.names = FALSE)

# Save the workspace, then print the filtered DGEList to the log.
save.image()
y