################################################################################
#
# Random forest classification, variable selection and permutation test
#
# Inputs : "data.txt" (tab separated, header row, first column = row names,
#          second column = class label, remaining columns = predictors)
# Outputs: several .csv files and one .pdf written to the working directory
#
################################################################################

#########################################
#
# Load packages for analysis
#
#########################################

# NOTE(review): clearing the global environment from inside a script is
# generally discouraged (it silently destroys the caller's workspace). It is
# kept here only to preserve the script's existing behaviour.
rm(list = ls(all = TRUE))

library(ggplot2)
library(reshape2)
library(randomForest)
library(varSelRF)

############################################
#
# Read the data
# Define which response and predictor set
# Scale predictor matrix
#
############################################

data <- read.table("data.txt", header = TRUE, sep = "\t", row.names = 1)

X <- data[, 2:ncol(data)]   # predictors: every column after the class label
x <- scale(X)               # scale data for variable comparison
y <- as.factor(data[, 1])   # response: first column holds the class label

dim(x)
length(y)

##################################################
# Apply random forest
# Store all the output as .csv files
##################################################

rfRes <- randomForest(x = x, y = y, importance = TRUE)
rfRes
varImpPlot(rfRes)

imp <- rfRes$importance
OOB <- rfRes$err.rate * 100   # error rates as percentages, one row per tree
confusionMatrix <- rfRes$confusion

write.csv(imp, "Imp_Data.csv")
write.csv(confusionMatrix, "confusionMatrix_Data.csv")
# write.csv(OOB, "OOB_error.csv")
# Last row / first column = OOB error of the complete forest. Indexing by
# nrow() instead of a literal 500 keeps this correct if ntree is changed.
write.csv(OOB[nrow(OOB), 1], "OOB_error.csv")

##################################################
#
# Apply variable selection using varSelRF
# Store all the output as .csv files
#
##################################################

rf.vs1 <- varSelRF(x, y, ntree = 500, ntreeIterat = 300,
                   vars.drop.frac = 0.2)
rf.vs1

selected <- rf.vs1$selected.vars

# drop = FALSE keeps a data frame even when a single variable is selected;
# without it the column name is lost and the later steps fail.
selectedData <- data[, selected, drop = FALSE]
Class <- data[, 1]
newDat <- cbind(Class, selectedData)
plot(newDat)

##################################################
#
# It will be interesting to check how selected variables are up/down in
# different conditions.
# A box plot is drawn automatically and stored in the working directory.
#
##################################################

df.m <- melt(newDat, id.vars = "Class")
p <- ggplot(data = df.m, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = Class))
p1 <- p + facet_wrap(~ variable, scales = "free")
ggsave(filename = "boxPlot with selected.pdf", plot = p1)

write.csv(selected, "selected.csv")
write.csv(selectedData, "selected_with_data_1_2.csv")
write.csv(newDat, "selected_with_y.csv")

################################################################################
# Iterate the model with the selected variables and check how much variation
# is explained
################################################################################

# Note: this refit uses the unscaled selected columns of `data`, whereas the
# first model above was fitted on the scaled matrix `x`.
rfRes1 <- randomForest(x = selectedData, y = y, importance = TRUE)
rfRes1
varImpPlot(rfRes1)

imp1 <- rfRes1$importance
OOB1 <- rfRes1$err.rate * 100
# Confusion matrix of the refitted model (previously the confusion matrix of
# the first model was written to the "after selection" file by mistake).
confusionMatrix1 <- rfRes1$confusion

write.csv(imp1, "Imp_Data_after_Selection.csv")
write.csv(confusionMatrix1, "confusionMatrix_Data_after_Selection.csv")
write.csv(OOB1[nrow(OOB1), 1], "OOB_error_after_Selection.csv")

################################################################################
#
# Permutation test for metabolomics data
# To check if the class error is significantly higher/lower than for
# randomised data.
# Permuted OOB errors and importances are written to the working directory
# for further inspection; the permuted labels themselves are only kept in
# memory (y2).
#
################################################################################

totiter <- 1000
nSamples <- nrow(X)
nVars <- ncol(X)

err <- rep(NA_real_, totiter)                         # OOB error per permutation
y2 <- matrix(data = NA, nrow = nSamples, ncol = totiter)  # permuted label codes
RF_Imp1 <- matrix(data = NA, nrow = nVars, ncol = totiter)
RF_Imp2 <- matrix(data = NA, nrow = nVars, ncol = totiter)

for (l in seq_len(totiter)) {
  print(l)

  # Shuffle the class labels to break any real predictor/response link.
  y1 <- sample(y)
  y2[, l] <- as.integer(y1)   # stored as integer factor codes

  rfres <- randomForest(x = X, y = y1, importance = TRUE)

  # OOB error of the complete forest (last row, first column of err.rate).
  err[l] <- rfres$err.rate[nrow(rfres$err.rate), 1]

  # First two columns of the importance matrix. Per the randomForest docs
  # these are the class-specific importances of the first two classes when
  # importance = TRUE -- TODO confirm this is the intended measure.
  permImp <- rfres$importance
  RF_Imp1[, l] <- permImp[, 1]
  RF_Imp2[, l] <- permImp[, 2]
}

classError <- err * 100   # percentages, comparable with OOB / OOB1 above
plot(classError, pch = 19)
# abline(h = OOB[nrow(OOB), 1])

write.csv(classError, "ClassError_perm.csv")
write.csv(RF_Imp1, "RF_imp_1_perm.csv")
# Previously RF_Imp1 was written twice; the second file must hold RF_Imp2.
write.csv(RF_Imp2, "RF_imp_2_perm.csv")

##################################################
#
# End of the Script
#
##################################################