# Dependencies ----
# Each package is attached once, in the order of its first appearance in the
# original script (repeated library() calls were no-ops).
library(tidyverse)
library(stringr)
library(edgeR)
library(limma)
library(org.Hs.eg.db)
library(clusterProfiler)
library(pheatmap)
library(survival)
library(survminer)
library(DESeq2)
library(glmnet)
library(regplot)
library(rms)
library(survivalROC)
library(MASS)
library(cowplot)
library(ranger)
# The WGCNA section below calls goodSamplesGenes(), pickSoftThreshold(),
# blockwiseModules(), labeledHeatmap(), ... which were used without the
# package ever being attached.
library(WGCNA)

## data clean ----

# Reusable ggplot2 axis-text themes (large and small font variants).
big <- theme(
  axis.title.x = element_text(size = 20),
  axis.title.y = element_text(size = 20),
  axis.text.x = element_text(size = 20, color = "black"),
  axis.text.y = element_text(size = 20, color = "black")
)
small <- theme(
  axis.title.x = element_text(size = 15),
  axis.title.y = element_text(size = 15),
  axis.text.x = element_text(size = 10, color = "black"),
  axis.text.y = element_text(size = 15, color = "black")
)

# Clinical table and immune-cell score table, both keyed by sample ID.
clinic <- read.csv("luad_early_clinic.csv", row.names = 1)
immunity <- read.csv(
  "xCell_LUAD_24 immune cells.csv",
  row.names = 1,
  header = TRUE
)

# Transpose so samples are rows, then restore "-" in the sample IDs
# (read.csv turned them into "." when they were column names).
immunity <- data.frame(t(immunity))
rownames(immunity) <- str_replace_all(rownames(immunity), "[.]", "-")

# Keep only samples present in both tables and put them in the same order.
immunity <- immunity[rownames(immunity) %in% rownames(clinic), ]
clinic <- clinic[rownames(clinic) %in% rownames(immunity), ]
immunity <- immunity[match(rownames(clinic), rownames(immunity)), ]
stopifnot(all(rownames(immunity) == rownames(clinic)))

# NOTE(review): clinic columns 8 and 9 are presumably `Time` and `Status`
# (those names are passed to surv_cutpoint() below) -- confirm.
immunity_cox <- merge(clinic[, c(8, 9)], immunity, by.x = 0, by.y = 0)
names(immunity_cox)[1] <- "sampleID"

# NOTE(review): the hard-coded 4:31 assumes immunity_cox has at least 31
# columns -- confirm against the input file.
res.cut <- surv_cutpoint(
  immunity_cox,
  time = "Time",
  event = "Status",
  variables = names(immunity_cox)[4:31],
  minprop = 0.2
)
res.cat <- surv_categorize(res.cut)

# Univariate Cox screen: one model per dichotomised immune variable,
# keeping only variables with p < 0.05.
immune_vars <- colnames(res.cat)[3:ncol(res.cat)]
res.cat[immune_vars] <- lapply(
  res.cat[immune_vars],
  factor,
  levels = c("low", "high")
)
my.surv <- Surv(res.cat$Time, res.cat$Status)

cell_hits <- lapply(immune_vars, function(cell) {
  survival_dat <- data.frame(group = res.cat[[cell]])
  cox_summary <- summary(coxph(my.surv ~ group, data = survival_dat))
  # Read statistics by column name instead of by position in a bound table.
  pvalue <- unname(cox_summary$coefficients[1, "Pr(>|z|)"])
  # A model with an NA p-value is skipped instead of aborting the screen.
  if (is.na(pvalue) || pvalue >= 0.05) {
    return(NULL)
  }
  ci <- cox_summary$conf.int
  data.frame(
    symbol = cell,
    pvalue = pvalue,
    HR = paste0("Hazard Ratio = ", round(ci[1, "exp(coef)"], 2)),
    CI = paste0(
      "95% CI: ",
      round(ci[1, "lower .95"], 2), " - ", round(ci[1, "upper .95"], 2)
    ),
    stringsAsFactors = FALSE
  )
})
cell_hits <- Filter(Negate(is.null), cell_hits)
res <- if (length(cell_hits) > 0) {
  do.call(rbind, cell_hits)
} else {
  # Typed empty result so the ordering below works with zero hits.
  data.frame(
    symbol = character(0),
    pvalue = numeric(0),
    HR = character(0),
    CI = character(0),
    stringsAsFactors = FALSE
  )
}
res <- res[order(res$pvalue), ]
res_population <- res

# Append one immune-score column to the clinical table.
# NOTE(review): column 26 of immunity_cox is presumably `Mast.cells`
# (it is read under that name from traitData below) -- confirm.
tem_clinic <- clinic
clinic <- merge(tem_clinic, immunity_cox[, c(1, 26)], by.x = 0, by.y = 1)
names(clinic)[1] <- "sampleID"

## WGCNA ----

# Expression matrix (genes x samples), restricted to the clinical samples
# and to the genes listed in the InnateDB table.
rsem <- read.csv("LUAD_RNAseq.csv", row.names = 1)
names(rsem) <- str_replace_all(names(rsem), "[.]", "-")
rsem <- rsem[, names(rsem) %in% clinic$sampleID]
immune_gene <- read.csv("InnateDB_genes.csv")
rsem_immunity <- rsem[rownames(rsem) %in% immune_gene$name, ]
rsem_immunity <- rsem_immunity[, match(clinic$sampleID, names(rsem_immunity))]
stopifnot(all(names(rsem_immunity) == clinic$sampleID))

wgcna_seq <- rsem_immunity
traitData <- clinic[, c(1, 11)]

# Drop genes whose total across samples is below the number of samples.
low_count_mask <- rowSums(wgcna_seq) < ncol(wgcna_seq)
raw_counts_filter <- wgcna_seq[which(!low_count_mask), ]
datExpr0 <- as.data.frame(t(raw_counts_filter))
gsg <- goodSamplesGenes(datExpr0, verbose = 3)
datExpr0 <- datExpr0[gsg$goodSamples, gsg$goodGenes]

# Sample dendrogram for outlier detection. The guide line is drawn at the
# height that is actually used for the cut (the original drew it at 200 but
# cut at 250, so the plot did not show the applied threshold).
cut_height <- 250
sampleTree <- hclust(dist(datExpr0), method = "average")
sizeGrWindow(12, 9)
par(cex = 0.6)
par(mar = c(0, 4, 2, 0))
plot(
  sampleTree,
  main = "Sample clustering to detect outliers",
  sub = "",
  xlab = "",
  cex.lab = 1.5,
  cex.axis = 1.5,
  cex.main = 2
)
abline(h = cut_height, col = "red")
clust <- cutreeStatic(sampleTree, cutHeight = cut_height, minSize = 10)
table(clust)

# Keep the samples assigned to cluster 1.
keepSamples <- (clust == 1)
datExpr <- datExpr0[keepSamples, ]
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)
microSample <- rownames(datExpr)
length(microSample)
nrow(traitData)
datTraits <- traitData[traitData$sampleID %in% microSample, ]
datExpr <- datExpr[row.names(datExpr) %in% datTraits$sampleID, ]
nrow(datExpr)
head(datTraits)
datExpr[1:3, 1:10]

# Soft-threshold diagnostics.
powers <- c(c(1:10), seq(from = 12, to = 20, by = 2))
sft <- pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
cex1 <- 0.9
# par(mfrow) must be set after the PDF device is opened; previously it was
# applied to the on-screen window, so the PDF held two single-panel pages.
pdf("wgcna_sft_selection.pdf", width = 9, height = 5)
par(mfrow = c(1, 2))
plot(
  sft$fitIndices[, 1],
  -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
  xlab = "Soft Threshold (power)",
  ylab = "Scale Free Topology Model Fit,signed R^2",
  type = "n",
  main = "Scale independence"
)
text(
  sft$fitIndices[, 1],
  -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
  labels = powers,
  cex = cex1,
  col = "red"
)
abline(h = 0.80, col = "red")
plot(
  sft$fitIndices[, 1],
  sft$fitIndices[, 5],
  xlab = "Soft Threshold (power)",
  ylab = "Mean Connectivity",
  type = "n",
  main = "Mean connectivity"
)
text(
  sft$fitIndices[, 1],
  sft$fitIndices[, 5],
  labels = powers,
  cex = cex1,
  col = "red"
)
dev.off()

# Module detection.
# NOTE(review): power = 3 is hard-coded rather than taken from `sft` --
# confirm it matches the diagnostics above.
net <- blockwiseModules(
  datExpr,
  power = 3,
  TOMType = "signed",
  minModuleSize = 50,
  reassignThreshold = 0,
  mergeCutHeight = 0.25,
  numericLabels = TRUE,
  pamRespectsDendro = FALSE,
  verbose = 3,
  maxBlockSize = 20000
)
mergedColors <- labels2colors(net$colors)
pdf("wgcna_module.pdf")
plotDendroAndColors(
  net$dendrograms[[1]],
  mergedColors[net$blockGenes[[1]]],
  "Module colors",
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05
)
dev.off()

moduleLabels <- net$colors
moduleColors <- labels2colors(net$colors)
geneTree <- net$dendrograms[[1]]
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)
table(net$colors)

# Module eigengenes recomputed with colour labels.
MEs0 <- moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs <- orderMEs(MEs0)

# Trait table aligned to the expression rows explicitly by sample ID, so the
# correlations below never depend on the two tables sharing a row order.
datTraits <- traitData[match(rownames(datExpr), traitData$sampleID), ]
index <- datTraits$sampleID
datTraits <- data.frame(Mast.cells = datTraits$Mast.cells)
rownames(datTraits) <- index

# Module-trait correlation heatmap.
moduleTraitCor <- cor(MEs, datTraits, use = "p")
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples)
textMatrix <- paste0(
  signif(moduleTraitCor, 2), "\n(", signif(moduleTraitPvalue, 1), ")"
)
dim(textMatrix) <- dim(moduleTraitCor)
pdf("wgcna_heatmap.pdf", width = 4, height = 7)
par(mar = c(6, 10, 2, 2))
labeledHeatmap(
  Matrix = moduleTraitCor,
  xLabels = names(datTraits),
  yLabels = names(MEs),
  ySymbols = names(MEs),
  colorLabels = FALSE,
  colors = greenWhiteRed(50),
  textMatrix = textMatrix,
  setStdMargins = FALSE,
  cex.text = 1,
  zlim = c(-1, 1),
  main = "Module-trait relationships"
)
dev.off()

# Gene-level module membership (MM) and gene significance (GS) for the trait.
modNames <- substring(names(MEs), 3)
geneModuleMembership <- as.data.frame(cor(datExpr, MEs, use = "p"))
MMPvalue <- as.data.frame(
  corPvalueStudent(as.matrix(geneModuleMembership), nSamples)
)
names(geneModuleMembership) <- paste0("MM", modNames)
names(MMPvalue) <- paste0("p.MM", modNames)

Mast.cells <- as.data.frame(datTraits$Mast.cells)
names(Mast.cells) <- "Mast.cells"
row.names(Mast.cells) <- row.names(datTraits)
geneTraitSignificance <- as.data.frame(cor(datExpr, Mast.cells, use = "p"))
GSPvalue <- as.data.frame(
  corPvalueStudent(as.matrix(geneTraitSignificance), nSamples)
)
names(geneTraitSignificance) <- paste0("GS.", names(Mast.cells))
names(GSPvalue) <- paste0("p.GS.", names(Mast.cells))
MET <- orderMEs(cbind(MEs, Mast.cells))

# Extract the yellow module: MM and GS per gene, sorted by decreasing GS.
module <- c("yellow")
column <- match(module, modNames)
if (is.na(column)) {
  # Module colours depend on the data; fail loudly instead of indexing by NA.
  stop("Module '", module, "' was not found among the detected modules.",
       call. = FALSE)
}
moduleGenes <- moduleColors == module
yellow_column <- geneModuleMembership[, column]
names(yellow_column) <- row.names(geneModuleMembership)
geneModuleMembership[rownames(geneModuleMembership) == "IL1B", ]
yellow_MN <- yellow_column[moduleGenes]
GS.mir <- geneTraitSignificance[, 1]
names(GS.mir) <- row.names(geneTraitSignificance)
GS.mir_yellow <- GS.mir[moduleGenes]
yellow_module <- cbind(yellow_MN, GS.mir_yellow)
yellow_module <- yellow_module[order(yellow_module[, 2], decreasing = TRUE), ]
yellow_module <- data.frame(yellow_module)

# Expression of the yellow-module genes joined (by position) to survival
# data; the alignment is asserted because cbind() does not match by ID.
yellmatrix <- rsem_immunity[
  row.names(rsem_immunity) %in% row.names(yellow_module),
]
stopifnot(all(names(yellmatrix) == clinic$sampleID))
surv_data <- cbind(clinic[, c(1, 9, 10)], t(yellmatrix))
res.cut <- surv_cutpoint(
  surv_data,
  time = "Time",
  event = "Status",
  variables = names(surv_data)[4:ncol(surv_data)],
  minprop = 0.25
)
# Dichotomise every yellow-module gene at its cutpoint and encode the groups
# as factors with "low" as the reference level.
res.cat <- surv_categorize(res.cut)
gene_vars <- names(res.cat)[3:ncol(res.cat)]
res.cat[gene_vars] <- lapply(
  res.cat[gene_vars],
  factor,
  levels = c("low", "high")
)
my.surv <- Surv(res.cat$Time, res.cat$Status)

#' Univariate Cox model for one dichotomised column of `res.cat`.
#'
#' Reads the global objects `res.cat` (grouping column) and `my.surv`
#' (survival response).
#'
#' @param x Name or index of a column of `res.cat`.
#' @return A one-row data frame with `symbol`, `pvalue`, `HR` (rounded to two
#'   decimals), `CI` ("lower - upper", rounded) and `coef`.
uni <- function(x) {
  survival_dat <- data.frame(group = res.cat[[x]])
  cox_summary <- summary(coxph(my.surv ~ group, data = survival_dat))
  # Statistics are read by column name rather than by position.
  coefs <- cox_summary$coefficients
  ci <- cox_summary$conf.int
  data.frame(
    symbol = names(res.cat[x]),
    pvalue = unname(coefs[1, "Pr(>|z|)"]),
    HR = unname(round(ci[1, "exp(coef)"], 2)),
    CI = paste(
      round(ci[1, "lower .95"], 2), round(ci[1, "upper .95"], 2),
      sep = " - "
    ),
    coef = unname(coefs[1, "coef"]),
    stringsAsFactors = FALSE
  )
}

# Screen every gene column. The original range `3:(ncol(res.cat) - 1)`
# skipped the last gene, unlike the factor conversion above.
tem <- colnames(res.cat)[3:ncol(res.cat)]
tem_res <- map(tem, uni)
res <- do.call(rbind, tem_res)
res <- res[order(res$pvalue), ]
res_sig <- subset(res, res$pvalue < 0.05)

# Gene significance vs. module membership for the yellow module.
module <- "yellow"
column <- match(module, modNames)
moduleGenes <- moduleColors == module
# Graphics parameters are set after the PDF device is opened so that they
# apply to the file rather than to a separate on-screen window.
pdf("wgcna_yellowGSMM.pdf", width = 7, height = 7)
par(mfrow = c(1, 1))
par(mar = c(5, 7, 5, 3))
# NOTE(review): the y-axis label says "risk score", but column 1 of
# geneTraitSignificance is plotted as-is -- confirm the label matches the
# trait that was used to compute it.
verboseScatterplot(
  abs(geneModuleMembership[moduleGenes, column]),
  abs(geneTraitSignificance[moduleGenes, 1]),
  xlab = paste("Module Membership in", module, "module"),
  ylab = "Gene significance for risk score",
  main = paste("Module membership vs. gene significance\n"),
  cex.main = 1.5,
  cex.lab = 2,
  cex.axis = 2,
  col = module
)
dev.off()

## mutation and immune risk score ----

# NOTE(review): `rsem_yellowSig` and the `Risk_score` column (clinic column
# 12) are not created anywhere in the visible part of this script; they must
# exist before this section runs -- confirm where they are defined.
all(clinic$sampleID == names(rsem_yellowSig))

# Mutation matrix: one row per unique `sample` value, remaining columns
# renamed back to "-"-separated IDs.
mutation <- read.csv("mutation_broad_gene.csv")
rf_df <- mutation
index <- duplicated(rf_df$sample)
sum(index)
rf_df <- rf_df[!index, ]
row.names(rf_df) <- rf_df$sample
rf_df <- rf_df[, -1]
names(rf_df) <- str_replace_all(names(rf_df), "[.]", "-")

# Drop rows whose total across all columns is below 25.
tot <- rowSums(rf_df)
index <- tot < 25
rf_df <- rf_df[!index, ]

# Keep the columns shared with the expression data, in expression order.
# intersect() avoids the NA column index that match() produced (and the
# resulting "undefined columns" error) when an expression sample has no
# mutation profile; the merge below joins by ID either way.
shared_samples <- intersect(names(rsem_yellowSig), names(rf_df))
rf_df <- rf_df[, shared_samples, drop = FALSE]

# Transpose, then attach the response by ID.
rf_info <- clinic[, c(1, 12)]
rf_df <- t(rf_df) %>% data.frame
rf_df <- merge(rf_df, rf_info, by.x = 0, by.y = 1)
row.names(rf_df) <- rf_df[, 1]
rf_df <- rf_df[, -1]

# Random forest of Risk_score on the remaining columns; mtry is one third of
# the feature count, floored, but never below 1.
features <- setdiff(names(rf_df), "Risk_score")
optimal_ranger <- ranger(
  formula = Risk_score ~ .,
  data = rf_df,
  num.trees = 500,
  mtry = max(1, floor(length(features) / 3)),
  importance = "impurity"
)

# Bar chart of the 20 most important variables (ties at rank 20 are kept).
mut_res <- optimal_ranger$variable.importance %>%
  data.frame(symbol = names(.), value = .)
mut_res %>%
  dplyr::arrange(desc(value)) %>%
  dplyr::top_n(20, value) %>%
  ggplot(aes(reorder(symbol, value), value)) +
  geom_col() +
  coord_flip() +
  ggtitle("Top 20 important variables") +
  xlab("Gene Symbol") +
  ylab("Importance") +
  scale_y_reverse()