# tcgaLUADSurvival.R
#
# Stage I Lung Adenocarcinoma Patient Survival Prediction with Quantitative Image Features
#

rm(list=ls())

# load required R packages
library(survival)
library(glmnet)
library(GGally)
library(ggplot2)
library(rms)

# read in feature data
load("../../tcgaLUADData.RData")

## Stage I only
stageI<-(X[,9880]>0 & X[,9880]<20)
X<-X[stageI,1:9879]
cellProfilerIDReps<-cellProfilerIDReps[stageI]
cellProfilerFileName<-cellProfilerFileName[stageI]
survivedDaysReps<-survivedDaysReps[stageI]+1
eventReps<-eventReps[stageI]
Ymatrix<-Ymatrix[stageI]
Ymatrix<-Surv(Ymatrix[,1],Ymatrix[,2])


# initialize survivedDays, event, and arrays to store the results
survivedDays<-Ymatrix[,1]
event<-Ymatrix[,2]
alpha<-0.1
predTest<-rep(0,dim(X)[1])
predTestGroup<-rep(0,dim(X)[1])
results<-array(0,dim=c(1,6))


# run leave-one-out cross-validation
for (fold in 1:dim(X)[1]){
  print(fold)
  testSet<-fold
  trainingSet<-which(!(1:dim(X)[1] %in% testSet))
  
  # train a Cox model
  cv.tr<-cv.glmnet(as.matrix(X[trainingSet,]),Ymatrix[trainingSet],family='cox',alpha=alpha,nfolds=10)
  if (cv.tr$lambda.min==cv.tr$lambda[1]){
    predictSIndex=2
  } else {
    predictSIndex=which(cv.tr$lambda==cv.tr$lambda.min)
  }
  
  # make prediction on the test case
  predAll<-predict(cv.tr,as.matrix(X),s=cv.tr$lambda[predictSIndex],type='response')
  predTrain<-predict(cv.tr,as.matrix(X[trainingSet,]),s=cv.tr$lambda[predictSIndex],type='response')
  predTest[fold]<-predict(cv.tr,as.matrix(t(as.data.frame(X[testSet,]))),s=cv.tr$lambda[predictSIndex],type='response')
  
  nPGroups<-2
  seqBreaks<-c(0,0.5,1)
  
  # use the median of survival indices in the training set as the threshold
  thres<-median(predTrain)
  
  plotTrain<-data.frame(predTrain,survivedDays[trainingSet],event[trainingSet])
  colnames(plotTrain)<-c("predTrain","V2","V3")

  # classify patients in the training set
  for (i in 1:dim(plotTrain)[1]){
    if (predTrain[i]<thres){
      plotTrain[i,1]<-"Group 1"
    } else {
      plotTrain[i,1]<-"Group 2"
    }  
  }
  plotTrain.surv <- survfit(Surv(V2, V3) ~ predTrain, data = plotTrain) 

  # classify the patient in the test set
  if (as.numeric(predTest[fold])<thres){
      predTestGroup[fold]<-"Group 1"
  } else if (as.numeric(predTest[fold])>=thres){
      predTestGroup[fold]<-"Group 2"
  }
}


# build a data frame that stores the actual and predicted survival for the set
plotTest<-data.frame(predTestGroup,survivedDays,event)
colnames(plotTest)<-c("predTest","V2","V3")
  
  
# calculate performance statistics
sdfTrain<-survdiff(Surv(survivedDays[trainingSet], event[trainingSet]) ~ predTrain, plotTrain)
sdfTest<-survdiff(Surv(survivedDays, event) ~ plotTest[,1], plotTest)
pTrain<-1-pchisq(sdfTrain$chisq, length(sdfTrain$n)-1)
pTest<-1-pchisq(sdfTest$chisq, length(sdfTest$n)-1)
trainConcordance<-survConcordance(Surv(survivedDays[trainingSet], event[trainingSet]) ~ predTrain, as.data.frame(X[trainingSet,]))
testConcordance<-survConcordance(Surv(survivedDays, event) ~ predTest, as.data.frame(X))
results[1]<-cv.tr$lambda[predictSIndex]
results[2]<-cv.tr$glmnet.fit$df[predictSIndex]
results[3]<-trainConcordance$concordance
results[4]<-testConcordance$concordance
results[5]<-pTrain
results[6]<-pTest

# build survival curves for the training set
plotTrainGGsurv<-plotTrain
plotTrainGGsurv[,2]<-plotTrainGGsurv[,2]/30
plotTrain.surv <- survfit(Surv(V2, V3) ~ predTrain, data = plotTrainGGsurv)

# build survival curves for the test set
plotTestGGsurv<-plotTest
plotTestGGsurv[,2]<-plotTestGGsurv[,2]/30
plotTest.surv <- survfit(Surv(V2, V3) ~ predTest, data = plotTestGGsurv)

# output survival curves for the training set
pdf(paste("LOOCV1LambdaAuto/0.5/plots/plotTrainLambda",results[1],"Alpha",alpha,"Groups",nPGroups,".pdf", sep=""), width=11, height=7)
print(
ggsurv(plotTrain.surv) +
  guides(linetype = F) + 
  xlab("Months") + 
  ylab("Probability of Survival") +
  ylim(0,1) +
  theme(plot.title = element_text(size=24,face="bold"),
        legend.text=element_text(size=24), 
        legend.title=element_text(size=24), 
        axis.text=element_text(size=18), 
        axis.title=element_text(size=24,face="bold"),
        legend.justification=c(1,1),
        legend.position=c(1,1)) +
  scale_colour_discrete(name = 'Prognostic Groups')
)
dev.off()

# output survival curves for the test set
pdf(paste("plotTestLambda",results[1],"Alpha",alpha,"Groups",nPGroups,".pdf", sep=""), width=11, height=7)
print(
ggsurv(plotTest.surv) + 
  guides(linetype = F) + 
  xlab("Months") +
  ylab("Probability of Survival") +
  ylim(0,1) +
  theme(plot.title = element_text(size=24,face="bold"),
        legend.text=element_text(size=24), 
        legend.title=element_text(size=24), 
        axis.text=element_text(size=18), 
        axis.title=element_text(size=24,face="bold"),
        legend.justification=c(1,1),
        legend.position=c(1,1)) +
  scale_colour_discrete(name = 'Prognostic Groups')
)
dev.off()

# output performance statistics
write.table(results, paste("LUADStageIAlpha",alpha,"Results.csv",sep=""), quote=F, row.names=F, col.names=F, sep=",")