setwd('/Users/guywachsman/Dropbox (Duke Bio_Ea)/RNA_seq/edgeR/oscillation/temporal') library("edgeR") library("reshape2") library("grid") library("stringr") library("ggplot2") library(ggbiplot) #first install and load devtools package and then install_github("vqv/ggbiplot") library(ggfortify) z=read.delim("counts_temporal.txt") colnames(z)[3:length(z)]=c(paste(rep.int("BP", 5), rep(1:5, 1), sep=""), paste(rep.int("P", 5), rep(1:5, 1), sep=""), paste(rep.int("AP", 5), rep(1:5, 1), sep="")) z[z[,1]=="AT1G07850",] # #keeping genes w/ reasonable expression # keep=rowSums(cpm(z[3:17])>2) >= 3 # z=z[keep,] # names(z) # z1=z grp <- factor(substr(colnames(z)[3:length(z)], 1, 1)) grp=factor(grp, levels=unique(grp)) w <- DGEList(counts=z[,3:length(z)], genes=z[,1:2], group=grp) dgn <- model.matrix(~0+grp, data=w$samples) colnames(dgn) <- levels(w$samples$group) w <- estimateGLMCommonDisp(w, dgn, verbose=TRUE) w <- estimateGLMTrendedDisp(w, dgn)##put?? w <- estimateGLMTagwiseDisp(w, dgn) w=calcNormFactors(w, method="TMM") keep=rowSums(cpm(w)>2)>=3 w=w[keep, ] ft <- glmFit(w, dgn) lrt1 <- glmLRT(ft) w.cpm=data.frame(w$genes, cpm(w)) #DE analysis #glm for all combination of root sections lrt.B_P <- glmLRT(ft, contrast=c(-1,1,0)) lrt.P_A <- glmLRT(ft, contrast=c(0,1,-1)) lrt.B_A <- glmLRT(ft, contrast=c(1,0,-1)) #toptags tt.B_P <-topTags(lrt.B_P, n=33603) tt.P_A <-topTags(lrt.P_A, n=33603) tt.B_A <-topTags(lrt.B_A, n=33603) tt.B_P$table[tt.B_P$table[,1]=="AT3G47670",] tt.P_A $table[tt.P_A$table[,1]=="DR5",] #selecting w/ logFC threshold logFC=0.5 #positive logFC b_P=tt.B_P[which(tt.B_P$table$logFC > logFC),] P_a=tt.P_A[which(tt.P_A$table$logFC > logFC),] dim(P_a) B_a=tt.B_A[which(tt.B_A$table$logFC > logFC),] bP=b_P$table[,1] dim(b_P) #negative logFC B_p=tt.B_P[which(tt.B_P$table$logFC < -logFC),] dim(B_p) p_A=tt.P_A[which(tt.P_A$table$logFC < -logFC),] dim(p_A) b_A=tt.B_A[which(tt.B_A$table$logFC < -logFC),] Bp=B_p$table[,1] #rPs_bP=Reduce(intersect, list(b_P$table[,1], rPs)) #intersect the different regions bPa=Reduce(intersect, list(b_P$table[,1], P_a$table[,1]))#P high vs b & a length(bPa) BpA=Reduce(intersect, list(B_p$table[,1], p_A$table[,1]))#p low vs B & A length(BpA) ##########writing tables tair10=read.delim("/Users/guywachsman/Dropbox (Duke Bio_Ea)/RNA_seq/edgeR/oscillation/databases/TAIR10_functional_descriptions.txt", header=T, sep="\t", quote="") tair10.ND=tair10[!duplicated(substring(tair10[,1], 1, 9)),] t10.ND=data.frame(substring(tair10.ND[,1], 1, 9), tair10.ND[,2:5]) colnames(t10.ND)=colnames(tair10) t10ND.bP=t10.ND[t10.ND[,1] %in% bP,] write.table(t10ND.bP, "text/At_bP_logFC=0.5.txt", sep="\t", row.names=F) length(bP) ################# #__________________________________________________ #ploting the different groups #__________________________________________________ w.cpm.mean=data.frame(w.cpm[,1:2], bp=apply(w.cpm[,3:7], 1, mean), p=apply(w.cpm[,8:12], 1, mean), ap=apply(w.cpm[,13:17], 1, mean)) w.cpm.mean.bP=w.cpm.mean[w.cpm.mean[,1] %in% bP,] w.cpm.mean.bP.m=melt(w.cpm.mean.bP, id.vars=c("At_num", "gene_name")) dim(w.cpm.mean.bP) w.cpm.mean.bP[w.cpm.mean.bP$ap>1000,] # gns=c("AT1G13980", "AT5G13300", "AT1G02100") # y.cpm.mean.bP.m.g=p[p[,1] %in% gns,] ggplot(w.cpm.mean.bP.m, aes(x=variable, y=value))+geom_line(data= w.cpm.mean.bP.m, color="cornflowerblue", aes(x=variable, y=value, group=At_num))+stat_summary(fun.y="mean", geom="line", colour="red", aes(group=1))+labs(x="time point", y="CPM", colour = "")+theme(axis.text.x = element_text(size=28), axis.text.y = element_text(size=20), plot.title=element_text(size=20, vjust=3), axis.title = element_text(size=32), strip.text.x=element_text(size=24), plot.background=element_rect(fill=NA, colour=NA))+scale_x_discrete(expand=c(0.02,0.02)) #+geom_line(data= y.cpm.mean.Pall.h.m.g, colour=c(rep("green", 5), rep("blue", 5), rep("black", 5)), aes(x=variable, y=value, group=At_num)) setwd('/Users/guywachsman/Dropbox (Duke Bio_Ea)/RNA_seq/edgeR/oscillation/temporal/plots/') ggsave(file="bP_all_genes_dynamic.pdf") _____________________________________________________ ## selecting w/ logFC & FDR thresholds # logFC=2 # PValue=0.2 # #positive logFC # b_P=tt.B_P[which(tt.B_P$table$logFC > logFC & tt.B_P$table$PValue < PValue),] # head(b_P) # P_a=tt.P_A[which(tt.P_A$table$logFC > logFC & tt.P_A$table$PValue < PValue),] # dim(P_a) # B_a=tt.B_A[which(tt.B_A$table$logFC > logFC & tt.B_A$table$PValue < PValue),] # #negative logFC # B_p=tt.B_P[which(tt.B_P$table$logFC < -logFC & tt.B_P$table$PValue < PValue),] # dim(B_p) # p_A=tt.P_A[which(tt.P_A$table$logFC < -logFC & tt.P_A$table$PValue < PValue),] # dim(p_A) # b_A=tt.B_A[which(tt.B_A$table$logFC < -logFC & tt.B_A$table$PValue < PValue),] #rPs_bP=Reduce(intersect, list(b_P$table[,1], rPs)) #intersect the different regions bPa=Reduce(intersect, list(b_P$table[,1], P_a$table[,1]))#P high vs b & a length(bPa) BpA=Reduce(intersect, list(B_p$table[,1], p_A$table[,1]))#p low vs B & A length(BpA) #DEGs in different regions in reads space (x) z1.bPa =z1[z1[,1] %in% bPa,] head(z1.bPa) z1.BpA =z1[z1[,1] %in% BpA,] dim(z1.BpA) w.cpm=cpm(w) w.cpm1=cbind(z[,1:2], w.cpm) w.cpm2=w.cpm1[which(rowMeans(w.cpm1[,8:12])>3),] head(w.cpm2) b_P.cpm=w.cpm1[w.cpm1[,1] %in% b_P$table[,1],] b_P.cpm.sub=b_P.cpm[which(rowMeans(b_P.cpm[,8:12])>3),]#mean of P is >3 head(b_P.cpm.sub) tair10.ND=tair10[!duplicated(substring(tair10[,1], 1, 9)),] dim(tair10.ND) b_P.cpm.sub.tair10.ND=tair10.ND[substring(tair10.ND[,1], 1, 9) %in% b_P.cpm.sub[,1],] dim(b_P.cpm.sub.tair10.ND) #b_P.cpm.sub.tair10.ND=data.frame(b_P.cpm.sub[,1], b_P.cpm.sub.tair10.ND[,2:5]) colnames(b_P.cpm.sub.tair10.ND)=colnames(tair10.ND) write.table(b_P.cpm.sub.tair10.ND, "P_high_b_low_short_TAIR10_logFC>2; pvalue<0.2;meanP>3.txt", sep="\t", row.names=F) tair10_z1.bPa=tair10[substring(tair10[,1], 1, 9) %in% bPa,] write.table(tair10_z1.bPa, "bPa_with_Phase_logFC>1.5; pvalue<0.1.txt", sep="\t", row.names=F) ######################################### #plotting multiple/single genes###good!!! ######################################### setwd('/Users/guywachsman/Dropbox (Duke Bio_Ea)/RNA_seq/edgeR/oscillation/temporal/plots/') w.cpm=cpm(w) w.cpm=data.frame(w$genes, w.cpm) #rownames(w.cpm1)= w.cpm1[,1] #w.cpm1=subset(w.cpm1, select=-At_num) head(w.cpm) w.cpm[w.cpm[,1]=="AT4G13195",] #looking at the CPM of specific gene my_genes=c("DR5") #genes=m.genes[,1] GOIs=as.vector(my_genes) #y.cor.cpm[y.cor.cpm[,1]=='AT5G10570',] n=length(my_genes) z.axis=factor(as.character(c(rep.int("bp", n*5), rep.int("p", n*5), rep.int("ap", n*5)))) z.axis=factor(z.axis, levels=unique(z.axis)) w.cpm.melt=data.frame(melt(w.cpm[match(GOIs, w.cpm[,1]),]), z.axis) w.cpm.melt=w.cpm.melt[complete.cases(w.cpm.melt),] w.cpm.melt$gene_name=factor(w.cpm.melt$gene_name, levels=unique(w.cpm.melt$gene_name)) w.cpm.melt$At_num=factor(w.cpm.melt$At_num, levels=unique(w.cpm.melt$At_num)) #sd=as_labeller(c(AT5G04140='GLU1/Fd-GOGAT', AT2G41220='GLU2/Fd-GOGAT', AT5G53460='NADH_dependent/GLT1')) w.cpm.melt=w.cpm.melt[complete.cases(w.cpm.melt),] facets1=as.factor(ifelse(as.character(w.cpm.melt$gene_name)!=as.character(w.cpm.melt$At_num), paste(w.cpm.melt$gene_name, w.cpm.melt$At_num, sep="-"), paste("", w.cpm.melt$At_num, sep=""))) facets1=factor(facets1, levels=unique(facets1)) levels(w.cpm.melt$At_num) <- levels(facets1) x.size=32 y.size=32 st.size=18 title="" #x11() ggplot(w.cpm.melt, aes(x=z.axis, y=value))+geom_boxplot(linetype="dashed", outlier.shape = NA)+stat_boxplot(aes(ymin = ..lower.., ymax = ..upper..), outlier.shape = NA)+stat_boxplot(geom = "errorbar", width=0.2, aes(ymin = ..ymax..))+stat_boxplot(geom = "errorbar", width=0.2, aes(ymax = ..ymin..))+facet_wrap(~At_num, scales='free')+labs(title=title, x="time point", y="CPM", colour = "")+theme(axis.text.x = element_text(size=x.size), axis.text.y = element_text(size=y.size), plot.title=element_text(size=20, hjust = 0.5), axis.title = element_text(size=18), strip.text.x=element_text(size=st.size-2), plot.background=element_rect(fill=NA, colour=NA)) setwd('/Users/guywachsman/Dropbox (Duke Bio_Ea)/RNA_seq/edgeR/oscillation/temporal/plots/') ggsave(file="DR5-dash_temporal.pdf") dev.off() head(w.cpm1) quartz.save(file=title, type="pdf", device=dev.cur()) #PCA analysis #DESeq2: heatmapheatmap+PCA library("DESeq2") library("gplots") library("ggrepel") setwd("/Users/guywachsman/Dropbox (Duke Bio_Ea)/RNA_seq/edgeR/oscillation/temporal/") x1=read.delim("counts_temporal.txt", header=T, row.names=1) x2=subset(x1, select=-gene_name) samples <- data.frame(row.names=c(colnames(x2)), condition=as.factor(c(rep("BP",5), rep("P",5), rep("AP",5)))) samples$condition=factor(samples$condition, levels=unique(samples$condition)) se=DESeqDataSetFromMatrix(countData=x2, colData=samples, design=~condition) dds <- DESeq(se) res <- results(dds) resOrdered <- res[order(res$padj),] rld <- rlog(dds) #head(assay(rld)) str(rld) pca=plotPCA(rld, returnData=TRUE) colnames(pca)=c("PC1", "PC2", "group", "section", "name") percentVar=round(100 * attr(pca, "percentVar")) pca$name=colnames(z[,3:17])#= rownames(pca)=colnames(x[,3:17]) pca$points=c(paste0(rep("bp", 5), seq(1, 5, 1)), paste0(rep("p", 5), seq(1, 5, 1)), paste0(rep("ap", 5), seq(1, 5, 1))) ggplot(pca, aes(PC1, PC2, color=section)) + geom_point(size=3, alpha = 0.5) +xlab(paste0("PC1: ",percentVar[1],"% variance"))+ylab(paste0("PC2: ",percentVar[2],"% variance"))+geom_text_repel(data=pca, aes(x=PC1, y=PC2, label=points), size=10)+theme(axis.text= element_text(size=24), axis.title=element_text(size=24), plot.background=element_rect(fill=NA, colour=NA), legend.position="") setwd("/Users/guywachsman/Dropbox (Duke Bio_Ea)/RNA_seq/edgeR/oscillation/temporal/plots") #ggsave("pca_clean.pdf") getwd() vsd <- varianceStabilizingTransformation(dds) distsRL1 <- dist(t(assay(rld))) mat <- as.matrix(distsRL1) rownames(mat) <- colnames(mat) <- c(paste(rep.int("BP", 5), rep(1:5, 1), sep=""), paste(rep.int("P", 5), rep(1:5, 1), sep=""), paste(rep.int("AP", 5), rep(1:5, 1), sep="")) my_palette <- colorRampPalette(c("green", "black", "red"))(n = 1000) heatmap.2(mat, cexRow=0.7,cexCol=0.7, trace="none",srtCol=45, density.info='none', main="Distance heatmap", col=my_palette, key.title=NA, key.xlab="Distance", margins=c(6,6), key.xtickfun=function() { cex <- par("cex")*par("cex.axis") side <- 1 line <- 0 col <- par("col.axis") font <- par("font.axis") mtext("low", side=side, at=0, adj=0, line=line, cex=cex, col=col, font=font) mtext("high", side=side, at=1, adj=1, line=line, cex=cex, col=col, font=font) return(list(labels=FALSE, tick=FALSE)) }, key.par=list(mgp=c(0.8, 0.5, 0), pin=c(1.5,0.3))) quartz.save(file="heatmap.pdf", type="pdf", device=dev.cur()) plotPCA(vsd, intgroup="condition", col=cols, ntop=20000) quartz.save(file="PCA_DESeq2.jpeg", type="jpeg", device=dev.cur()) plotDispEsts(dds) quartz.save(file="disperssion_DESeq2.jpeg", type="jpeg", device=dev.cur()) ###MDS analysis-option2 sampleDists=dist( t( assay(rld) ) ) sampleDistMatrix=as.matrix( sampleDists ) mdsData <- data.frame(cmdscale(sampleDistMatrix)) mds <- data.frame(mdsData, as.data.frame(colData(rld)), pca$points) colnames(mds)=c("X1", "X2", "condition", "sizeFactor", "points") ggplot(mds, aes(X1,X2,color=condition)) + geom_point(size=5, alpha=0.5)+geom_text_repel(data=mds, aes(x=X1, y=X2, label=points), size=10)+xlab("dimension-1")+ylab("dimension-2")+theme(legend.position="", axis.text=element_text(size=x.size), axis.title = element_text(size=24)) #ggsave("./plots/mds_ggplot_temporal.pdf") #data tables write.table(x.RpS, "p_low_R&S_high_logFC>1.5;pvalue<0.1.txt", sep="\t", row.names=F) tair10=read.delim("../TAIR10_functional_descriptions.txt", header=T, sep="\t", quote="") tair10_in_b_P = tair10[substring(tair10[,1], 1, 9) %in% b_P$table[,1],] head(tair10_in_b_P.s.s) write.table(tair10_in_x.RpS, "p_low_R&S_high_TAIR10_logFC>1.5; pvalue<0.1.txt", sep="\t", row.names=F) #TAIR w/o variants tair10_in_b_P.s=cbind(substring(tair10_in_b_P[,1], 1, 9), tair10_in_b_P[,2:5]) colnames(tair10_in_b_P.s)=colnames(tair10_in_b_P) tair10_in_b_P.s.s= tair10_in_b_P.s[!duplicated(tair10_in_b_P.s[,1]),] write.table(tair10_in_b_P.s.s, "P_high_b_low_short_TAIR10_logFC>1.5; pvalue<0.1.txt", sep="\t", row.names=F) #biplot library(ggbiplot) library(ggfortify) w.cpm.biplot=data.frame(w.cpm) head(w.cor.cpm) w.cor.cpm.mean=data.frame(BP=apply(w.cor.cpm[,3:7], 1, mean), P=apply(w.cor.cpm[,8:12], 1, mean), AP=apply(w.cor.cpm[,13:17], 1, mean), row.names=w.cor.cpm[,1]) head(w.cor.cpm.mean) w.cor.cpm.mean.pca=prcomp(w.cor.cpm.mean, scale=F) #good dimentions par(lwd=3, cex=1.3,mar=c(5, 1, 3, 0)) biplot(w.cor.cpm.mean.pca, scale=0, col=c("black", "royalblue"), xlabs=rep("ยท", nrow(w.cor.cpm.mean)), expand=1, xlim=c(-130, 200), ylim=c(-190, 150)) setwd("/Users/guywachsman/Dropbox (Duke Bio_Ea)/RNA_seq/edgeR/oscillation/temporal/plots") quartz.save(file="biplot_temporal_dots.pdf", type="psd", device=dev.cur()) dev.off()