#THIS SCRIPT CARRIES OUT VALIDATION FOR PREDICTIONS UNDER THE PHYLOGENETIC REGRESSION + LAMBDA MODEL

rm(list=ls())

library(phytools)
library(geiger)
library(phylolm)
source("phyloPred.jknife.R")
source("starPhyloPred.jknife.R")


#GET DATA AND TREE
fullTree<-read.newick("Rabosky_etal2014.timetree.nex")
	fullTree$tip.label[grep("Acanthopagrus_schlegelii",fullTree$tip.label)]<-"Acanthopagrus_schlegelii"
	fullTree$tip.label[grep("Auxis_thazard",fullTree$tip.label)]<-"Auxis_thazard"
	fullTree$tip.label[grep("Auxis_rochei",fullTree$tip.label)]<-"Auxis_rochei"
	fullTree$tip.label[grep("Carassius_auratus",fullTree$tip.label)]<-"Carassius_auratus"
	fullTree$tip.label[grep("Diagramma_picta",fullTree$tip.label)]<-"Diagramma_pictum"
	fullTree$tip.label[grep("Helicolenus_dactylopterus",fullTree$tip.label)]<-"Helicolenus_dactylopterus"
	fullTree$tip.label[grep("Oncorhynchus_masou",fullTree$tip.label)]<-"Oncorhynchus_masou"
	fullTree$tip.label[grep("Osmerus_mordax_mordax",fullTree$tip.label)]<-"Osmerus_mordax"
	fullTree$tip.label[grep("Salvelinus_alpinus",fullTree$tip.label)]<-"Salvelinus_alpinus"
	fullTree$tip.label[grep("Salvelinus_leucomaenis_leucomaenis",fullTree$tip.label)]<-"Salvelinus_leucomaenis"	
	fullTree$tip.label[grep("Sebastes_pachycephalus_pachycephalus",fullTree$tip.label)]<-"Sebastes_pachycephalus"
	fullTree$tip.label[grep("Thymallus_arcticus",fullTree$tip.label)]<-"Thymallus_arcticus"

fullDat<-read.csv("bionut_working.csv")
	rownames(fullDat)<-fullDat[,1]
	fullDat<-fullDat[,-1]


datSp<-rownames(fullDat)

species<-datSp[which(match(datSp, fullTree$tip.label, nomatch=0)!=0)]
exclude<-fullTree$tip.label[which(match(fullTree$tip.label, datSp, nomatch=0)==0)]

tree<-drop.tip(fullTree, exclude)


nutDat<-fullDat[species,c("protein", "fat", "fac3_pol", "fac6_pol", "iron", "zinc", "ret_eq", "vitb12", "vitd")]
nutDat<-data.frame(nutDat[,"protein"], sapply(c("fat", "fac3_pol", "fac6_pol", "iron", "zinc", "ret_eq", "vitb12", "vitd"), function(x) log(nutDat[,x]+1)))
	colnames(nutDat)[1]<-"protein"

lifeHistDat<-fullDat[species, c("maxlen", "troph", "depmax")]
	
nutList<-lapply(1:ncol(nutDat), function(x) data.frame(lifeHistDat[which(is.na(nutDat[,x])==FALSE),], nutDat[which(is.na(nutDat[,x])==FALSE),x]))
	names(nutList)<-colnames(nutDat)
	for (i in 1:length(nutList)) {colnames(nutList[[i]])[ncol(nutList[[i]])]<-names(nutList)[i]}
for (i in 1:length(nutList)) {
	for (j in 1:ncol(nutList[[i]])){
		nutList[[i]]<-nutList[[i]][which(is.na(nutList[[i]][,j])==FALSE),]
	}
}

	
treeList<-lapply(1:length(nutList), function(x) drop.tip(tree, tree$tip.label[which(match(tree$tip.label, rownames(nutList[[x]]), nomatch=0)==0)]))


#pgls.table<-matrix(, length(nutList), 18, dimnames=list(names(nutList), c("n", "lambda", "b0", "b1_maxlen", "b1_troph", "b1_depmax", "se_b0", "se_b1_maxlen", "se_b1_troph", "se_b1_depmax", "tstat_b0", "tstat_maxlen", "tstat_troph", "tstat_depmax", "P_b0", "P_maxlen", "P_troph", "P_depmax")))


errorMat<-matrix(,length(nutList),3, dimnames=list(names(nutList), c("median_diff_percMean", "median_diff_nSD", "errorRate")))

for (i in 1:length(nutList)){

	pred<-matrix(, nrow(nutList[[i]]), 6, dimnames=list(rownames(nutList[[i]]), c("measuredVal", "predVal", "lcl_predInt", "ucl_predInt", "diff_percMean", "diff_nSampleSD")))
	withinCIMat<-matrix(, nrow(nutList[[i]]), 1, dimnames=list(rownames(nutList[[i]]), "withinCI"))

	maxlen<-nutList[[i]][,"maxlen"]
		names(maxlen)<-rownames(nutList[[i]])
	troph<-nutList[[i]][,"troph"]
		names(troph)<-rownames(nutList[[i]])
	depmax<-nutList[[i]][,"depmax"]
		names(depmax)<-rownames(nutList[[i]])
		
	nut<-nutList[[i]][,4]
		names(nut)<-rownames(nutList[[i]])


	for (j in 1:nrow(nutList[[i]])){
					
		maxlen.rm<-maxlen[-j]
		troph.rm<-troph[-j]
		depmax.rm<-depmax[-j]
		nut.rm<-nut[-j]
	
		tree.rm<-drop.tip(treeList[[i]], rownames(nutList[[i]])[j])
	
		result<-phylolm(nut.rm ~ maxlen.rm + troph.rm + depmax.rm, phy=tree.rm, model="lambda")
		
		coeff<-c(result[[3]], result$coeff)
			names(coeff)<-c("lambda", "b0", "b1_maxlen", "b1_troph", "b1_depmax")
			
		Yhat<-(coeff["b1_maxlen"] * maxlen) + (coeff["b1_troph"] * troph) + (coeff["b1_depmax"] * depmax) + coeff["b0"]
		
		res<- nut - Yhat
		
		pred.res<-phyloPred.jknife(treeList[[i]], res, rownames(nutList[[i]])[j])
		
		predVal<-Yhat[j] + as.numeric(pred.res$PredVal)
		measVal<-as.numeric(nut[j])
		diff<-abs(measVal - predVal)
		predSE<-as.numeric(pred.res$PredSE)
		diff_nPredSE<-diff / predSE
		if (diff_nPredSE < 2) withinCIMat[j]<-TRUE else withinCIMat[j]<-FALSE
		
		lclPredInt<-predVal - (1.96 * predSE)
		uclPredInt<-predVal + (1.96 * predSE)
		
		if (i > 1) predVal<-exp(predVal) - 1
		if (i > 1) measVal<-exp(nut[j]) - 1
		if (i > 1) lclPredInt<-exp(lclPredInt) - 1
		if (i > 1) uclPredInt<-exp(uclPredInt) - 1
		if (i > 1) nutSD<-sd(exp(nut) -1) else nutSD<-sd(nut)

		if (measVal == 0) diff1<-NA else diff1<-as.numeric((abs(measVal - predVal) / measVal) * 100)
		if (measVal == 0) diff2<-NA else diff2<-as.numeric((abs(measVal - predVal) / nutSD))
						
		pred[j,]<-round(c(measVal, predVal, lclPredInt, uclPredInt, diff1, diff2), 3)
		
	}

	if (i == 1) write.csv(pred, file="phyloPredValid.PGLS.protein.csv")
	if (i == 2) write.csv(pred, file="phyloPredValid.PGLS.lipid.csv")
	if (i == 3) write.csv(pred, file="phyloPredValid.PGLS.omega3.csv")
	if (i == 4) write.csv(pred, file="phyloPredValid.PGLS.omega6.csv")
	if (i == 5) write.csv(pred, file="phyloPredValid.PGLS.iron.csv")
	if (i == 6) write.csv(pred, file="phyloPredValid.PGLS.zinc.csv")
	if (i == 7) write.csv(pred, file="phyloPredValid.PGLS.vitA.csv")
	if (i == 8) write.csv(pred, file="phyloPredValid.PGLS.vitB12.csv")
	if (i == 9) write.csv(pred, file="phyloPredValid.PGLS.vitD.csv")

	errorMat[i,1]<-median(pred[,"diff_percMean"], na.rm=TRUE)
	errorMat[i,2]<-median(pred[,"diff_nSampleSD"], na.rm=TRUE)
	errorMat[i,3]<-length(which(withinCIMat == FALSE))/nrow(pred)

		
	write.csv(errorMat, file="phyloPredValidPGLS.error.csv")
	
}