library(tidyverse)
library(stringr)
library(edgeR)
library(limma)
library(org.Hs.eg.db)
library(clusterProfiler)
library(pheatmap)
library(survival)
library(survminer)
library(DESeq2)
library(glmnet)
library(survival)
library(regplot)
library(rms)
library(survivalROC)
library(DESeq2)
library(org.Hs.eg.db)
library(limma)
library(edgeR)
library(MASS)
library(stringr)
library(cowplot)
library(ranger)

## data clean
# Reusable ggplot2 axis themes.
# `big`: 20 pt axis titles and tick labels.
# `small`: 15 pt axis titles and y tick labels, 10 pt x tick labels.
# Tick labels are black in both.
big <- theme(
  axis.title.x = element_text(size = 20),
  axis.title.y = element_text(size = 20),
  axis.text.x = element_text(size = 20, color = "black"),
  axis.text.y = element_text(size = 20, color = "black")
)
small <- theme(
  axis.title.x = element_text(size = 15),
  axis.title.y = element_text(size = 15),
  axis.text.x = element_text(size = 10, color = "black"),
  axis.text.y = element_text(size = 15, color = "black")
)
# Clinical table, one row per sample (row names come from the first CSV column).
clinic <- read.csv("luad_early_clinic.csv", row.names = 1)

# Immune-cell score table, transposed so that samples become rows.
# data.frame() (not as.data.frame()) is kept on purpose: it sanitises the
# resulting column names.
immunity <- data.frame(
  t(read.csv("xCell_LUAD_24 immune cells.csv", header = TRUE, row.names = 1))
)
# Turn every "." in the sample IDs into "-" (presumably undoing the name
# mangling done by read.csv so the IDs match `clinic` — TODO confirm).
rownames(immunity) <- str_replace_all(rownames(immunity), "[.]", "-")

# Keep only samples present in both tables, with `immunity` rows reordered to
# follow the row order of `clinic`.
in_both <- rownames(clinic) %in% rownames(immunity)
clinic <- clinic[in_both, ]
immunity <- immunity[match(rownames(clinic), rownames(immunity)), ]
# Printed sanity check: should be TRUE.
all(rownames(immunity) == rownames(clinic))


# Join clinic columns 8 and 9 (used below as "Time" and "Status") to the immune
# scores by row name; merge() puts the row names in the first column.
immunity_cox <- merge(clinic[, c(8, 9)], immunity, by = "row.names")
colnames(immunity_cox)[1] <- "sampleID"

# Survival-based cutpoint for each immune score in columns 4 to 31, then
# categorise every score at its cutpoint.
immune_score_columns <- names(immunity_cox)[seq(4, 31)]
res.cut <- surv_cutpoint(
  immunity_cox,
  time = "Time",
  event = "Status",
  variables = immune_score_columns,
  minprop = 0.2
)
res.cat <- surv_categorize(res.cut)
# Univariate Cox screen of every categorised immune score (columns 3 onward of
# `res.cat`). Scores with p < 0.05 are collected in `res` / `res_population`
# with their hazard ratio and 95% confidence interval (high vs. low).
my.surv <- Surv(res.cat$Time, res.cat$Status)
immune_vars <- colnames(res.cat)[-(1:2)]

# Fix the level order so "low" is the reference group in every model.
for (v in immune_vars) {
  res.cat[[v]] <- factor(res.cat[[v]], levels = c("low", "high"))
}

# One Cox model per score; returns a one-row data frame for significant scores
# and NULL otherwise. Results are bound once at the end instead of growing a
# data frame inside the loop.
sig_rows <- lapply(immune_vars, function(v) {
  cox_fit <- coxph(my.surv ~ group, data = data.frame(group = res.cat[[v]]))
  cox_sum <- summary(cox_fit)
  p_value <- cox_sum$coefficients[1, "Pr(>|z|)"]
  # An NA p-value (model could not be estimated) is treated as not significant
  # rather than aborting the whole screen.
  if (is.na(p_value) || p_value >= 0.05) {
    return(NULL)
  }
  ci <- round(cox_sum$conf.int[1, c("lower .95", "upper .95")], 2)
  data.frame(
    symbol = v,
    pvalue = p_value,
    HR = paste0("Hazard Ratio = ", round(cox_sum$conf.int[1, "exp(coef)"], 2)),
    CI = paste0("95% CI: ", ci[1], " - ", ci[2]),
    stringsAsFactors = FALSE
  )
})
sig_rows <- Filter(Negate(is.null), sig_rows)

# Keep a well-formed (possibly empty) table even when nothing is significant.
if (length(sig_rows) > 0) {
  res <- do.call(rbind, sig_rows)
} else {
  res <- data.frame(
    symbol = character(0),
    pvalue = numeric(0),
    HR = character(0),
    CI = character(0),
    stringsAsFactors = FALSE
  )
}
res <- res[order(res$pvalue), ]
res_population <- res

# Append column 26 of `immunity_cox` to the clinical table, matching clinic row
# names against sampleID. merge() moves the row names into the first column.
# NOTE(review): column 26 is presumably the Mast.cells score, since later code
# reads clinic column 11 as `Mast.cells` — confirm against the input file.
tem_clinic <- clinic
clinic <- merge(tem_clinic, immunity_cox[, c(1, 26)], by.x = 0, by.y = 1)
names(clinic)[1] <- "sampleID"

## WGCNA
# goodSamplesGenes() and the other network helpers used from here on come from
# the WGCNA package, which the library() calls at the top of the file do not
# attach; load it here so the section can run.
library(WGCNA)

# Expression matrix (genes x samples). "." in the sample names is replaced by
# "-" so they can be compared with clinic$sampleID.
rsem <- read.csv("LUAD_RNAseq.csv", row.names = 1)
names(rsem) <- str_replace_all(names(rsem), "[.]", "-")
rsem <- rsem[, names(rsem) %in% clinic$sampleID]

# Restrict to genes listed in the `name` column of the InnateDB table.
immune_gene <- read.csv("InnateDB_genes.csv")
rsem_immunity <- rsem[rownames(rsem) %in% immune_gene$name, ]

# Every clinical sample must have an expression column; otherwise the reorder
# below would fail with an opaque "undefined columns selected" error.
missing_expr <- setdiff(clinic$sampleID, names(rsem_immunity))
if (length(missing_expr) > 0) {
  stop(
    "Samples in `clinic` without expression data: ",
    paste(missing_expr, collapse = ", "),
    call. = FALSE
  )
}
# Put the expression columns in the same order as the clinical rows.
rsem_immunity <- rsem_immunity[, match(clinic$sampleID, names(rsem_immunity))]
# Printed sanity check: should be TRUE.
all(names(rsem_immunity) == clinic$sampleID)

wgcna_seq <- rsem_immunity
# Sample ID plus clinic column 11 (read later as `Mast.cells`).
traitData <- clinic[, c(1, 11)]

# Drop genes whose total count over all samples is below the number of samples.
low_count_mask <- rowSums(wgcna_seq) < ncol(wgcna_seq)
raw_counts_filter <- wgcna_seq[which(!low_count_mask), ]

# WGCNA expects samples in rows and genes in columns.
datExpr0 <- as.data.frame(t(raw_counts_filter))
gsg <- goodSamplesGenes(datExpr0, verbose = 3)
datExpr0 <- datExpr0[gsg$goodSamples, gsg$goodGenes]
# Average-linkage clustering of samples to spot outliers.
sampleTree <- hclust(dist(datExpr0), method = "average")
sizeGrWindow(12, 9)
par(cex = 0.6)
par(mar = c(0, 4, 2, 0))
plot(sampleTree,
  main = "Sample clustering to detect outliers", sub = "", xlab = "",
  cex.lab = 1.5, cex.axis = 1.5, cex.main = 2
)

# A single cut height drives both the red guide line and the actual cut.
# Previously the line was drawn at 200 while the tree was cut at 250, so the
# plot did not show the threshold that was applied. The cut value is unchanged.
outlier_cut_height <- 250
abline(h = outlier_cut_height, col = "red")
clust <- cutreeStatic(sampleTree, cutHeight = outlier_cut_height, minSize = 10)
table(clust)

# Keep only the samples assigned to cluster 1.
keepSamples <- (clust == 1)
datExpr <- datExpr0[keepSamples, ]
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)
microSample <- rownames(datExpr)
length(microSample)
nrow(traitData)

# Restrict traits and expression to the samples present in both.
datTraits <- traitData[traitData$sampleID %in% microSample, ]
datExpr <- datExpr[row.names(datExpr) %in% datTraits$sampleID, ]

# Printed previews for interactive inspection.
nrow(datExpr)
head(datTraits)
datExpr[1:3, 1:10]
# Candidate soft-thresholding powers and their scale-free fit statistics.
powers <- c(1:10, seq(from = 12, to = 20, by = 2))
sft <- pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
sizeGrWindow(9, 5)
cex1 <- 0.9

pdf("wgcna_sft_selection.pdf")
# par() acts on the currently active device, so the two-panel layout has to be
# requested after pdf() opens the file. Set beforehand, it applied to the
# on-screen window and the two plots ended up on separate PDF pages.
par(mfrow = c(1, 2))

# Panel 1: signed scale-free topology fit (column 2, signed by the slope in
# column 3) against the power in column 1; labels are the powers themselves.
plot(sft$fitIndices[, 1], -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
  xlab = "Soft Threshold (power)",
  ylab = "Scale Free Topology Model Fit,signed R^2", type = "n",
  main = paste("Scale independence")
)
text(sft$fitIndices[, 1], -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
  labels = powers, cex = cex1, col = "red"
)
# Reference line at a fit of 0.80.
abline(h = 0.80, col = "red")

# Panel 2: mean connectivity (column 5) against the power.
plot(sft$fitIndices[, 1], sft$fitIndices[, 5],
  xlab = "Soft Threshold (power)", ylab = "Mean Connectivity", type = "n",
  main = paste("Mean connectivity")
)
text(sft$fitIndices[, 1], sft$fitIndices[, 5],
  labels = powers, cex = cex1, col = "red"
)
dev.off()
# Build the co-expression network and detect modules with a fixed
# soft-threshold power of 3.
net <- blockwiseModules(
  datExpr,
  power = 3,
  TOMType = "signed",
  minModuleSize = 50,
  reassignThreshold = 0,
  mergeCutHeight = 0.25,
  numericLabels = TRUE,
  pamRespectsDendro = FALSE,
  verbose = 3,
  maxBlockSize = 20000
)

# Numeric module labels converted to colour names for plotting.
mergedColors <- labels2colors(net$colors)

# Dendrogram of the first block with the module colours underneath.
pdf("wgcna_module.pdf")
plotDendroAndColors(
  dendro = net$dendrograms[[1]],
  colors = mergedColors[net$blockGenes[[1]]],
  groupLabels = "Module colors",
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05
)
dev.off()

# Objects reused by the downstream module/trait analysis.
moduleLabels <- net$colors
moduleColors <- mergedColors
MEs <- net$MEs
geneTree <- net$dendrograms[[1]]
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)
# Printed module sizes.
table(net$colors)
sizeGrWindow(12, 9)

# Module eigengenes recomputed from the colour labels, then reordered.
MEs0 <- moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs <- orderMEs(MEs0)

# Single-trait table (Mast.cells) with sample IDs as row names.
# NOTE(review): the correlation below relies on these rows being in the same
# order as the rows of datExpr — confirm.
datTraits <- traitData[traitData$sampleID %in% microSample, ]
index <- datTraits$sampleID
datTraits <- data.frame(Mast.cells = datTraits$Mast.cells)
rownames(datTraits) <- index

# Correlation of each module eigengene with the trait, and its p-value.
moduleTraitCor <- cor(MEs, datTraits, use = "p")
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples)

sizeGrWindow(10, 6)
# Cell text: correlation on the first line, p-value in parentheses below it.
textMatrix <- paste0(
  signif(moduleTraitCor, 2), "\n(",
  signif(moduleTraitPvalue, 1), ")"
)
dim(textMatrix) <- dim(moduleTraitCor)

pdf("wgcna_heatmap.pdf", width = 4, height = 7)
par(mar = c(6, 10, 2, 2))
labeledHeatmap(
  Matrix = moduleTraitCor,
  xLabels = names(datTraits),
  yLabels = names(MEs),
  ySymbols = names(MEs),
  colorLabels = FALSE,
  colors = greenWhiteRed(50),
  textMatrix = textMatrix,
  setStdMargins = FALSE,
  cex.text = 1,
  zlim = c(-1, 1),
  main = "Module-trait relationships"
)
dev.off()
# Module names are the eigengene names without their two-character prefix.
modNames <- substring(names(MEs), 3)

# Module membership: correlation of every gene with every module eigengene,
# plus the matching p-values.
geneModuleMembership <- as.data.frame(cor(datExpr, MEs, use = "p"))
MMPvalue <- as.data.frame(
  corPvalueStudent(as.matrix(geneModuleMembership), nSamples)
)
names(geneModuleMembership) <- paste0("MM", modNames)
names(MMPvalue) <- paste0("p.MM", modNames)

# Trait as a one-column data frame keyed by sample ID.
Mast.cells <- data.frame(
  Mast.cells = datTraits$Mast.cells,
  row.names = row.names(datTraits)
)

# Gene significance: correlation of every gene with the trait, plus p-values.
geneTraitSignificance <- as.data.frame(cor(datExpr, Mast.cells, use = "p"))
GSPvalue <- as.data.frame(
  corPvalueStudent(as.matrix(geneTraitSignificance), nSamples)
)
names(geneTraitSignificance) <- paste0("GS.", names(Mast.cells))
names(GSPvalue) <- paste0("p.GS.", names(Mast.cells))

# Eigengenes and trait side by side, reordered.
MET <- orderMEs(cbind(MEs, Mast.cells))

# Pull out the yellow module.
module <- "yellow"
column <- match(module, modNames)
moduleGenes <- moduleColors == module

# Membership in the yellow module for every gene, named by gene.
yellow_column <- setNames(
  geneModuleMembership[, column],
  row.names(geneModuleMembership)
)
# Printed: module membership row for IL1B.
geneModuleMembership[rownames(geneModuleMembership) == 'IL1B', ]
yellow_MN <- yellow_column[moduleGenes]

# Gene significance for every gene, named by gene, then yellow genes only.
GS.mir <- setNames(
  geneTraitSignificance[, 1],
  row.names(geneTraitSignificance)
)
GS.mir_yellow <- GS.mir[moduleGenes]

# Yellow-module table sorted by gene significance, highest first. The column
# names come from the variable names passed to cbind().
yellow_module <- cbind(yellow_MN, GS.mir_yellow)
yellow_module <- data.frame(
  yellow_module[order(yellow_module[, 2], decreasing = TRUE), ]
)
# Expression of the yellow-module genes (genes x samples).
yellow_gene_rows <- row.names(rsem_immunity) %in% row.names(yellow_module)
yellmatrix <- rsem_immunity[yellow_gene_rows, ]
# Printed sanity check: expression columns line up with the clinical rows.
all(names(yellmatrix) == clinic$sampleID)

# Clinic columns 1, 9 and 10 (sample ID plus the "Time"/"Status" columns used
# below) followed by one column per yellow-module gene.
surv_data <- cbind(clinic[, c(1, 9, 10)], t(yellmatrix))

# Survival-based cutpoint per gene, then categorise each gene at its cutpoint.
res.cut <- surv_cutpoint(
  surv_data,
  time = "Time",
  event = "Status",
  variables = names(surv_data)[4:ncol(surv_data)],
  minprop = 0.25
)
res.cat <- surv_categorize(res.cut)

# Fix the level order so "low" is the reference group.
for (i in names(res.cat)[-(1:2)]) {
  res.cat[[i]] <- factor(res.cat[[i]], levels = c("low", "high"))
}

res <- data.frame()
my.surv <- with(res.cat, Surv(Time, Status))
# Fit a univariate Cox model for one categorised column of `res.cat`.
#
# x: name (or index) of a column in the global `res.cat`. The survival outcome
#    is taken from the global `my.surv`.
# Returns a one-row data frame with columns symbol, pvalue, HR (rounded to two
# decimals), CI ("lower - upper", rounded) and coef, all read from the first
# coefficient row of the model summary.
uni <- function(x) {
  cox_model <- coxph(my.surv ~ group, data = data.frame(group = res.cat[[x]]))
  cox_summary <- summary(cox_model)

  # First row of the coefficient table (5 values) followed by the first row of
  # the confidence-interval table (4 values); positions 5, 6, 8 and 9 are the
  # p-value, hazard ratio and interval bounds.
  stats <- c(cox_summary$coefficients[1, ], cox_summary$conf.int[1, ])

  data.frame(
    symbol = names(res.cat[x]),
    pvalue = stats[5],
    HR = round(stats[6], 2),
    CI = paste(round(stats[8], 2), round(stats[9], 2), sep = " - "),
    coef = cox_summary$coefficients[[1]]
  )
}
# Run the univariate Cox model on every gene column of `res.cat`. Columns 1-2
# hold the survival time and status; all remaining columns are genes. The
# previous upper bound of ncol(res.cat) - 1 silently skipped the last gene.
tem <- colnames(res.cat)[3:ncol(res.cat)]
tem_res <- map(tem, uni)

# Stack the one-row results in a single call instead of growing `res` with
# rbind() inside a loop.
res <- do.call(rbind, tem_res)
res <- res[order(res$pvalue), ]

# Genes significant at p < 0.05.
res_sig <- subset(res, res$pvalue < 0.05)
# Scatter plot of module membership against gene significance for the genes of
# the yellow module (absolute values on both axes).
module <- "yellow"
column <- match(module, modNames)
moduleGenes <- moduleColors == module
sizeGrWindow(7, 7)
par(mfrow = c(1, 1))
pdf("wgcna_yellowGSMM.pdf", width = 7, height = 7)
par(mar = c(5, 7, 5, 3))
verboseScatterplot(
  abs(geneModuleMembership[moduleGenes, column]),
  abs(geneTraitSignificance[moduleGenes, 1]),
  xlab = paste("Module Membership in", module, "module"),
  # geneTraitSignificance holds the gene correlations with the Mast.cells
  # trait, so the axis names that trait (it previously said "risk score").
  ylab = "Gene significance for Mast.cells",
  main = paste("Module membership vs. gene significance\n"),
  cex.main = 1.5, cex.lab = 2, cex.axis = 2, col = module
)
dev.off()



## mutation and immune risk score
# NOTE(review): `rsem_yellowSig` and clinic column 12 (used below under the
# name `Risk_score`) are not created anywhere in the visible part of this
# script — confirm they exist before this section runs.
# Printed sanity check: clinical rows line up with the expression columns.
all(clinic$sampleID == names(rsem_yellowSig))

mutation <- read.csv("mutation_broad_gene.csv")
rf_df <- mutation

# Keep the first occurrence of each value in the `sample` column, then use that
# column as row names and drop it (it is assumed to be the first column).
index <- duplicated(rf_df$sample)
sum(index)
rf_df <- rf_df[!index, ]
row.names(rf_df) <- rf_df$sample
rf_df <- rf_df[, -1]
names(rf_df) <- str_replace_all(names(rf_df), "[.]", "-")

# Drop rows whose total over all columns is below 25.
tot <- rowSums(rf_df)
index <- tot < 25
rf_df <- rf_df[!index, ]

# Keep the columns shared with `rsem_yellowSig`, in its column order. Using the
# intersection avoids the "undefined columns selected" error that match()
# caused whenever a sample had no mutation data; the merge below only keeps
# samples present on both sides anyway.
shared_samples <- intersect(names(rsem_yellowSig), names(rf_df))
rf_df <- rf_df[, shared_samples, drop = FALSE]

# Sample ID plus clinic column 12.
rf_info <- clinic[, c(1, 12)]

# Transpose so samples are rows, then attach the clinic column by sample ID.
# merge() puts the row names in the first column; restore them and drop it.
rf_df <- data.frame(t(rf_df))
rf_df <- merge(rf_df, rf_info, by.x = 0, by.y = 1)
row.names(rf_df) <- rf_df[, 1]
rf_df <- rf_df[, -1]
# Random forest predicting Risk_score from all other columns of rf_df, used to
# rank predictors by impurity importance.
features <- setdiff(names(rf_df), "Risk_score")
optimal_ranger <- ranger(
  formula = Risk_score ~ .,
  data = rf_df,
  num.trees = 500,
  # A third of the predictors per split, but never fewer than one.
  mtry = max(1, floor(length(features) / 3)),
  importance = "impurity",
  # Fixed seed so the importance ranking is the same on every run; the forest
  # was previously unseeded.
  seed = 2020
)

# Importance values as a data frame with one row per predictor.
mut_res <- data.frame(
  symbol = names(optimal_ranger$variable.importance),
  value = optimal_ranger$variable.importance
)

# Horizontal bar chart of the 20 most important predictors. The ranking column
# is passed to top_n() explicitly instead of relying on it being the last one.
mut_res %>%
  dplyr::arrange(desc(value)) %>%
  dplyr::top_n(20, value) %>%
  ggplot(aes(reorder(symbol, value), value)) +
  geom_col() +
  coord_flip() +
  ggtitle("Top 20 important variables") +
  xlab("Gene Symbol") +
  ylab("Importance") +
  scale_y_reverse()