#################################### Normalizing the RNA-seq data and analyzing differentially expressed genes (DEGs) #################################### foldChange=2 padj=0.01 library("edgeR") rt1 <- read.table("sampleExp.txt", header=T, sep="\t", row.names=1, check.names=F) head(rt1) rt2 <- read.table("sample.txt", header=T, sep="\t") head(rt2) newdf1 <- as.data.frame(t(rt1)) newdf1[, "id"] <- rownames(newdf1) newdf2 <- rt2 newdf3 <- merge(newdf2, newdf1, by="id", all.x=TRUE) rt <- newdf3 rt[, 1:4] rownames(rt)=rt[,1] exp=t(rt[,4:ncol(rt)]) dimnames=list(rownames(exp),colnames(exp)) data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames) data=avereps(data) data=data[rowMeans(data)>1,] nrow(data) ncol(data) group=rt[, 2] block=rt[, 3] design <- model.matrix(~block+group) y <- DGEList(counts=data) y <- calcNormFactors(y) y <- estimateCommonDisp(y) y <- estimateTagwiseDisp(y) y <- estimateDisp(y,design) fit <- glmQLFit(y, design) qlf <- glmQLFTest(fit) et <- topTags(qlf, n=nrow(data)) nrow(et) ordered_tags <- et allDiff=ordered_tags$table allDiff=allDiff[is.na(allDiff$FDR)==FALSE,] diff=allDiff newData=y$pseudo.counts write.table(diff,file="edgerOut.xls",sep="\t",quote=F) diffSig = diff[(diff$FDR < padj & (diff$logFC>foldChange | diff$logFC<(-foldChange))),] write.table(diffSig, file="diffSig.xls",sep="\t",quote=F) diffUp = diff[(diff$FDR < padj & (diff$logFC>foldChange)),] write.table(diffUp, file="up.xls",sep="\t",quote=F) diffDown = diff[(diff$FDR < padj & (diff$logFC<(-foldChange))),] write.table(diffDown, file="down.xls",sep="\t",quote=F) normalizeExp=rbind(id=colnames(newData),newData) write.table(normalizeExp,file="normalizeExp.txt",sep="\t",quote=F,col.names=F) newData1 <- as.data.frame(t(newData)) newData1[, "id"] <- rownames(newData1) newData2 <- merge(rt[newData1[, "id"], 1:3], newData1, by="id", all.y=TRUE) write.table(newData2, "TCGA.txt", row.names=FALSE, sep="\t", quote=FALSE) diffExp=rbind(id=colnames(newData),newData[rownames(diffSig),]) write.table(diffExp,file="diffmRNAExp.txt",sep="\t",quote=F,col.names=F) heatmapData <- newData[rownames(diffSig),] #################################### Plotting heatmap #################################### data <- read.table("diffmRNAExp.txt", header=T, sep="\t", row.names=1, check.names=F) data <- as.matrix(log10(data+0.001)) data <- data[1:nrow(data),] library(pheatmap) annotation <- read.table("sample.txt",header=T, sep="\t", row.names=1) p <- pheatmap( data, annotation=annotation, color=colorRampPalette(c("navy", "white", "firebrick3"))(50), scale="row", clustering_distance_row="correlation", fontsize=9, fontsize_row=6, show_rownames=F, show_colnames=F) pdf("heatmap.pdf") p dev.off() #################################### Removing batch effects and normalizing the microarray data #################################### library(sva) library(limma) rt=read.table("merge.txt",sep="\t",header=T,check.names=F) rt=as.matrix(rt) rownames(rt)=rt[,1] exp=rt[,2:ncol(rt)] dimnames=list(rownames(exp),colnames(exp)) data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames) batchType=c(rep(1,155),rep(2,167)) modType=c(rep("normal",77),rep("tumor",78),rep("normal",52),rep("tumor",115)) mod = model.matrix(~as.factor(modType)) outTab=ComBat(data, batchType, mod, par.prior=TRUE) outTab=rbind(geneNames=colnames(outTab),outTab) write.table(outTab,file="normalize.txt",sep="\t",quote=F,col.names=F) #################################### Visualizing the ceRNA network #################################### df <- read.table("network.txt",sep = "\t",row.names = 1,header = T) head(df) library(ggalluvial) mycol <- rep(c("#223D6C","#D20A13","#FFD121","#088247","#11AA4D","#58CDD9","#7A142C","#5D90BA","#029149","#431A3D","#91612D","#6E568C","#E0367A","#D8D155","#64495D","#7CC767", "#223D6C","#D20A13","#FFD121","#088247","#11AA4D","#58CDD9","#7A142C","#5D90BA","#029149","#431A3D","#91612D","#6E568C","#E0367A","#D8D155","#64495D","#7CC767", "#223D6C","#D20A13","#FFD121","#088247","#11AA4D","#58CDD9","#7A142C","#5D90BA","#029149","#431A3D","#91612D","#223D6C","#D20A13","#FFD121","#088247","#11AA4D","#58CDD9","#7A142C","#5D90BA","#029149","#431A3D","#91612D","#6E568C","#E0367A","#D8D155","#64495D","#7CC767" ,"#223D6C","#D20A13","#FFD121","#088247","#11AA4D","#58CDD9","#7A142C","#5D90BA","#029149","#431A3D","#91612D","#6E568C","#E0367A","#D8D155","#64495D","#7CC767"),3) UCB_lodes <- to_lodes_form(df[,1:ncol(df)], axes = 1:ncol(df), id = "Cohort") dim(UCB_lodes) head(UCB_lodes) tail(UCB_lodes) ggplot(UCB_lodes, aes(x = x, stratum = stratum, alluvium = Cohort, fill = stratum, label = stratum)) + scale_x_discrete(expand = c(0, 0)) + geom_flow(width = 1/5) + geom_stratum(alpha = .9,width = 1/7) + geom_text(stat = "stratum", size = 1.5,color="black") + scale_fill_manual(values = mycol) + xlab("") + ylab("") + theme_bw() + theme(panel.grid =element_blank()) + theme(panel.border = element_blank()) + theme(axis.line = element_blank(),axis.ticks = element_blank(),axis.text = element_blank()) + ggtitle("")+ guides(fill = FALSE) ggsave("sankey3.pdf") #################################### Survival analysis #################################### options(stringsAsFactors=FALSE) library(survival) library(survminer) outTab=data.frame() picDir="picture" dir.create(picDir) library(survival) library(qvalue) rt=read.table("tumor.time.txt",header=T,sep="\t",row.names=1,check.names=F) rt[,"futime"]=rt[,"futime"]/365 rt[1:4,1:4] rt1=log2(rt[,3:ncol(rt)]+1) rt=cbind(rt[,1:2],rt1) rt[1:4,1:4] sur.cut <- surv_cutpoint(rt, time="futime", event="fustat", variables=colnames(rt[,3:ncol(rt)])) summary(sur.cut) labels <- paste(c(">", "<="), rep(summary(sur.cut)[,"cutpoint"], each=2), sep="") for(i in colnames(rt[,3:ncol(rt)])){ outfile <- paste("picture/cox_cutoff_", i, ".jpg", sep="") jpeg(outfile, width=20, height=20, units="cm", res=350) print(plot(sur.cut, i, palette="npg")) dev.off() outPdf=paste("picture/cox_cutoff", i,".pdf", sep="") pdf(file=outPdf, onefile=FALSE) print(plot(sur.cut, i, palette="npg")) dev.off() } oldnames <- colnames(rt) colnames(rt) <- gsub("-", ".", colnames(rt)) sur.cut <- surv_cutpoint(rt, time="futime", event="fustat", variables=colnames(rt[,3:ncol(rt)])) summary(sur.cut) rt <- surv_categorize(sur.cut, variables=colnames(rt[,3:ncol(rt)])) head(rt) colnames(rt) <- oldnames outlst <- list() for(i in colnames(rt[,3:ncol(rt)])){ cox <- coxph(Surv(futime, fustat) ~ rt[, i], data=rt) coxSummary = summary(cox) aoe <- cbind(as.data.frame(summary(cox)[["coefficients"]]),as.data.frame(summary(cox)[["conf.int"]])) abc <- t(aoe)[,1] coxP=coxSummary$coefficients[,"Pr(>|z|)"] rt1=rt[rt[, i] == "high",] rt2=rt[rt[, i] == "low",] n1=nrow(rt1) n2=nrow(rt2) surTab1=summary(survfit(Surv(futime, fustat) ~ 1, data = rt1)) surTab2=summary(survfit(Surv(futime, fustat) ~ 1, data = rt2)) medianTab1=surTab1$table medianTab2=surTab2$table model <- survdiff(Surv(futime, fustat) ~ rt[, i], data = rt) chisq <- model[["chisq"]] df <- length(model[["n"]]) - 1 pvalue <- pchisq(chisq, df, lower.tail=FALSE) label <- labels[rep(colnames(rt[,3:ncol(rt)]), each=2) == i] outlst[[i]] <- cbind( data.frame(var=i, level=c("high", "low"), label=label), rbind(medianTab1, medianTab2), pvalue_logrank=pvalue, hr=1/abc[6], lcl=1/abc[9], ucl=1/abc[8]) diff=survdiff(Surv(futime, fustat) ~ rt[,i],data = rt) fit <- survfit(Surv(futime, fustat) ~ rt[,i], data = rt) pValue=1-pchisq(diff$chisq, df=1) outTab=rbind(outTab,cbind(gene=i,coxSummary$coefficients,coxSummary$conf.int,KM=pValue, H_med=medianTab1["median"],H_0.95LCL=medianTab1["0.95LCL"],H_0.95UCL=medianTab1["0.95UCL"], L_med=medianTab2["median"],L_0.95LCL=medianTab2["0.95LCL"],L_0.95UCL=medianTab2["0.95UCL"])) pval=0 if(pValue<0.05){ pval=signif(pValue,4) pval=format(pval, scientific = TRUE) }else{ pval=round(pValue,3) } if(pValue<0.05){ HR <- paste("Hazard Ratio = ", round(1/abc[6],2), sep = "") CI <- paste("95% CI: ", paste(round(1/abc[9],2), round(1/abc[8],2), sep = " - "), sep = "") fig <- ggsurvplot(fit, data=rt, ncensor.plot = TRUE, censor = T, pval = paste(pval = ifelse(pvalue < 0.001, "p < 0.001", paste("p = ",round(pvalue,3), sep = "")), HR, CI, sep = "\n"), conf.int=TRUE, conf.int.style = "ribbon", legend.title=i, legend.labs=label, palette = c("#AC88FF", "#FC717F"), xlab = "Years after surgery (year) ",ylab = "Overall survival rate", ggtheme=theme( legend.key=element_rect(fill="white", colour=NA), panel.border=element_rect(fill="transparent", colour="black"), panel.background=element_rect(fill="white") )) geneName=unlist(strsplit(i,"\\|",))[1] tiffFile=paste(geneName,".survival.tiff",sep="") outTiff=paste(picDir,tiffFile,sep="\\") tiff(file=outTiff,width = 15,height = 15,units ="cm",compression="lzw",bg="white",res=600) print(fig) dev.off() outPdf=paste("picture/", geneName,".survival.pdf",sep="") pdf(file=outPdf, onefile=FALSE) print(fig) dev.off() } } outdf <- do.call(rbind, outlst) write.csv(outdf, "surv_median_logrank.csv", row.names=FALSE, na="")