#######################################
# 
# Load packages for analysis
#
#########################################	


 	# Load the packages used below: ggplot2 (box plots), reshape2 (melt),
	# randomForest (classification) and varSelRF (variable selection).
	#
	# The workspace is deliberately NOT cleared here. Every object the visible
	# script reads is assigned before it is used, and wiping the global
	# environment (e.g. when this file is source()d from an interactive
	# session) would destroy unrelated work of the caller.
	library(ggplot2)

	library(reshape2)

	library(randomForest)

	library(varSelRF)


############################################
#
# Read the data 
# Define which response and predictor set 
# Scale predictor matrix 
#
############################################

      
	# Read the tab-separated input table. The first column of the file supplies
	# the row names; of the remaining columns the first is the response and the
	# rest are the predictors.
	data <- read.table("data.txt", header = TRUE, sep = "\t", row.names = 1)

	# Predictor set: every column except the response. drop = FALSE keeps a
	# data frame even when there is only a single predictor column.
	X <- data[, -1, drop = FALSE]

	# Centre and scale the predictors for variable comparison.
	x <- scale(X)

	# Response as a factor, so that randomForest runs in classification mode.
	y <- as.factor(data[, 1])

	# Report the size of the predictor matrix and of the response.
	dim(x)

	length(y)

##################################################

# Apply random forest 
# Store all the output as .csv files
##
##################################################
	
        # Fit a random forest on the scaled predictors, keeping the
        # permutation-based importance measures (importance = TRUE).
        rfRes <- randomForest(x = x, y = y, importance = TRUE)

        rfRes

        varImpPlot(rfRes)

        # Variable importance matrix of the fitted forest.
        imp <- rfRes$importance

        # Error rates per number of trees grown, expressed as percentages.
        OOB <- rfRes$err.rate * 100

        confusionMatrix <- rfRes$confusion

        write.csv(imp, "Imp_Data.csv")

        write.csv(confusionMatrix, "confusionMatrix_Data.csv")

        #write.csv(OOB,"OOB_error.csv")

        # Store only the final error: last row (all trees grown), first column.
        # nrow(OOB) is used instead of a literal 500 so the index stays valid
        # if the number of trees is ever changed from the default.
        write.csv(OOB[nrow(OOB), 1], "OOB_error.csv")


##################################################
#
# Apply variable selection using varSelRF 
# Store all the output as .csv files
#
##################################################

	# Variable selection with varSelRF on the scaled predictors. y is already
	# a factor, so it is passed as-is.
	rf.vs1 <- varSelRF(x, y, ntree = 500, ntreeIterat = 300,
	                   vars.drop.frac = 0.2)

	rf.vs1

	# Names of the variables retained by the selection.
	selected <- rf.vs1$selected.vars

	Class <- data[, 1]

	# Class label plus the (unscaled) selected variables. drop = FALSE keeps
	# the selection a data frame with its column names even if only one
	# variable is selected.
	newDat <- cbind(Class, data[, selected, drop = FALSE])

	plot(newDat)


##################################################
#
# It will be interesting to check how the selected variables go up/down in different conditions
# A box plot will be drawn automatically and stored in the working directory
#
##################################################

	# Reshape to long format: one row per sample and selected variable, with
	# the class label kept as the identifier column.
	df.m <- melt(newDat, id.vars = "Class")

	# One box per class for every selected variable. Class is wrapped in
	# factor() so that the boxes are split by class even if the class labels
	# were read as numbers; labs() keeps the legend title as "Class".
	p <- ggplot(data = df.m, aes(x = variable, y = value)) +
	  geom_boxplot(aes(fill = factor(Class))) +
	  labs(fill = "Class")

	# One panel per variable, each with its own axis scales.
	p1 <- p + facet_wrap(~ variable, scales = "free")

	# Arguments are named in full rather than relying on partial matching of
	# `file` to `filename`.
	ggsave(filename = "boxPlot with selected.pdf", plot = p1)

	write.csv(selected, "selected.csv")

	# drop = FALSE keeps the column name in the output when only one variable
	# was selected.
	write.csv(data[, selected, drop = FALSE], "selected_with_data_1_2.csv")

	write.csv(newDat, "selected_with_y.csv")


########################################################################################

# Iterate the model with the selected variables and check how much variation is explained

#######################################################################################


        # Refit the random forest using only the selected variables.
        # drop = FALSE keeps the predictors a data frame even if a single
        # variable was selected.
        rfRes1 <- randomForest(x = data[, selected, drop = FALSE], y = y,
                               importance = TRUE)

        rfRes1

        varImpPlot(rfRes1)

        imp1 <- rfRes1$importance

        # Error rates per number of trees grown, expressed as percentages.
        OOB1 <- rfRes1$err.rate * 100

        # Confusion matrix of the refitted model. The original code wrote the
        # confusion matrix of the first (all-variable) model to the
        # "after selection" file.
        confusionMatrix1 <- rfRes1$confusion

        write.csv(imp1, "Imp_Data_after_Selection.csv")

        write.csv(confusionMatrix1, "confusionMatrix_Data_after_Selection.csv")

        # Final error: last row (all trees grown), first column.
        write.csv(OOB1[nrow(OOB1), 1], "OOB_error_after_Selection.csv")


#########################################################################################
# 
#  Permutation test for metabolomics data
#  To check if the class error is significantly higher/lower than randomised data
#  All permuted values are stored in the working directory for further inspection
#
#########################################################################################

	# Number of label permutations.
	totiter <- 1000

	# Per-permutation results:
	#   err      final error rate (fraction) of each permuted fit
	#   y2       the permuted labels of each iteration, one column per
	#            permutation, stored as the integer codes of the factor
	#   RF_Imp1  first column of the importance matrix, one column per permutation
	#   RF_Imp2  second column of the importance matrix, one column per permutation
	err <- rep(NA_real_, totiter)
	y2 <- matrix(data = NA, nrow = nrow(X), ncol = totiter)
	RF_Imp1 <- matrix(data = NA, nrow = ncol(X), ncol = totiter)
	RF_Imp2 <- matrix(data = NA, nrow = ncol(X), ncol = totiter)

	for (l in seq_len(totiter)) {

		# Progress indicator.
		print(l)

		# Shuffle the class labels to break any link with the predictors.
		y1 <- sample(y)

		y2[, l] <- as.integer(y1)

		# Fit on the unscaled predictors with the permuted labels. The `data`
		# and `na.action` arguments of the original call were dropped: they
		# belong to the formula interface and have no effect on an x/y call.
		rfres <- randomForest(x = X, y = y1, importance = TRUE)

		# Final error: last row (all trees grown), first column.
		err[l] <- rfres$err.rate[nrow(rfres$err.rate), 1]

		# A loop-specific name is used so the importance matrix of the real
		# model (`imp`) is not overwritten.
		perm_imp <- rfres$importance
		RF_Imp1[, l] <- perm_imp[, 1]
		RF_Imp2[, l] <- perm_imp[, 2]
	}

	# Error of every permuted fit as a percentage; computed once, after the loop.
	classError <- err * 100

	plot(classError, pch = 19)

	# abline(h = OOB[nrow(OOB), 1])

	write.csv(classError, "ClassError_perm.csv")

	write.csv(RF_Imp1, "RF_imp_1_perm.csv")

	# The original code wrote RF_Imp1 to this file as well.
	write.csv(RF_Imp2, "RF_imp_2_perm.csv")





##################################################
#
# End of the Script
#
#
##################################################