# Estimation of SAOM models using RSiena, calculation of post-hoc goodness of fit tests
# For 2018 paper "Social interactions shape individual and collective personality in social spiders"
######################################################################################

rm(list=ls())

cpus=7 #The number of CPUs used for parallel processing on a cluster.
       #Because we had 8 CPUs, use 7 and leave 1 spare.

setwd("/xx")

#Below are two functions used for the post-hoc GOF tests

###############
#For Geodesic distance we use this function here
GeodesicDistribution <- function (i, data, sims, period, groupName,
		varName, levls=c(1:5,Inf), cumulative=TRUE, ...) {
	x <- networkExtraction(i, data, sims, period, groupName, varName)
	require(sna)
	a <- sna::geodist(x)$gdist
	if (cumulative)
	{
		gdi <- sapply(levls, function(i){ sum(a<=i) })
	}
	else
	{
		gdi <- sapply(levls, function(i){ sum(a==i) })
	}
	names(gdi) <- as.character(levls)
	gdi
}


# Holland and Leinhardt Triad Census; see ?sna::triad.census.
TriadCensus <- function(i, data, sims, wave, groupName, varName, levls=1:16){
	unloadNamespace("igraph") # to avoid package clashes
	require(network)
	require(sna)
	x <- networkExtraction(i, data, sims, wave, groupName, varName)
	if (network.edgecount(x) <= 0){x <- symmetrize(x)}
	# because else triad.census(x) will lead to an error
	tc <- sna::triad.census(x)[1,levls]
	# names are transferred automatically
	tc
}
#############################################################

require(abind)  #use abind package to create sequential matrices for the 19 observations
require(RSiena) #the below used RSiena 1.2-3 (September 8, 2017)

cuddling=read.csv('spider_networks.csv') #by 'cuddling', we mean resting interaction networks...
boldness= read.csv('spider_boldness.csv', header=TRUE)

#Allocating source colony number labels
cuddling$Source=as.character(cuddling$Colony)
cuddling$Source[substring(cuddling$Source,1,4)=="BR10"]=1
cuddling$Source[substring(cuddling$Source,1,4)=="BR13"]=2
cuddling$Source[substring(cuddling$Source,1,3)=="BR3"]=3
cuddling$Source=as.factor(cuddling$Source)

#Allocating treatment type labels
cuddling$Treat.Type=as.character(cuddling$Colony)
cuddling$Treat.Type[cuddling$Treat.Type=="BR10-1"]="allbold"
cuddling$Treat.Type[cuddling$Treat.Type=="BR10-10"]="allbold"
cuddling$Treat.Type[cuddling$Treat.Type=="BR10-2"]="allbold"
cuddling$Treat.Type[cuddling$Treat.Type=="BR13-5"]="allbold"
cuddling$Treat.Type[cuddling$Treat.Type=="BR13-9"]="allbold"

cuddling$Treat.Type[cuddling$Treat.Type=="BR10-3"]="keystone"
cuddling$Treat.Type[cuddling$Treat.Type=="BR10-4"]="keystone"
cuddling$Treat.Type[cuddling$Treat.Type=="BR10-5"]="keystone"
cuddling$Treat.Type[cuddling$Treat.Type=="BR10-9"]="keystone"
cuddling$Treat.Type[cuddling$Treat.Type=="BR13-1"]="keystone"
cuddling$Treat.Type[cuddling$Treat.Type=="BR13-2"]="keystone"
cuddling$Treat.Type[cuddling$Treat.Type=="BR13-6"]="keystone"
cuddling$Treat.Type[cuddling$Treat.Type=="BR3-2"]="keystone"
cuddling$Treat.Type[cuddling$Treat.Type=="BR3-3"]="keystone"
cuddling$Treat.Type[cuddling$Treat.Type=="BR3-4"]="keystone"

cuddling$Treat.Type[cuddling$Treat.Type=="BR10-6"]="allshy"
cuddling$Treat.Type[cuddling$Treat.Type=="BR10-7"]="allshy"
cuddling$Treat.Type[cuddling$Treat.Type=="BR10-8"]="allshy"
cuddling$Treat.Type[cuddling$Treat.Type=="BR13-3"]="allshy"
cuddling$Treat.Type[cuddling$Treat.Type=="BR13-4"]="allshy"
cuddling$Treat.Type[cuddling$Treat.Type=="BR13-7"]="allshy"
cuddling$Treat.Type[cuddling$Treat.Type=="BR13-8"]="allshy"
cuddling$Treat.Type[cuddling$Treat.Type=="BR3-1"]="allshy"
cuddling$Treat.Type[cuddling$Treat.Type=="BR3-5"]="allshy"

cuddling$Treat.Type=as.factor(cuddling$Treat.Type)

boldness[2:8]=600-boldness[2:8]  #So that boldness increases with quicker response times
names(boldness)[2:8]=1:7

unq_treat=unique(cuddling$Treat.Type)
unq_ass=unique(cuddling$Assay.Type)

#One can set this up as a batch to estimate all 8 models at once; in practice this doesn't work very well
#because some manual checking/tweaking is necessary along the way to ensure convergence is being achieved
#Therefore work through the models one by one specifying s=1, t=1, etc
#t=1 (allbold), t=2 (keystone), t=3 (all shy)

#n=1 #8 cases (3 source colonies x 3 treatment types, -1 non-existent case)
#for (s in 1:3){
#	for (t in 1:3){
#		if (!((s==3) & (t==1))){  #need to skip case for s=3, t=1 (allbold), no data
s=1
t=1
			#name="s1t1_minimal6.RData"
			#namep="RS_minimal_s1t1"

name_save=paste("s",s,"t",t,"_final.RData",sep="") #name of the final save final
namep=paste("RS_19_s",s,"t",t,sep="")              #name of the RSiena output project file


cuddling_pool=cuddling[cuddling$Source==s&cuddling$Treat.Type==unq_treat[t],] #grouping networks by source colony s and treatment t
unq_colony=unique(cuddling_pool$Colony) #list of colonies present inside newly created subgroup

source_treat_net=list()
source_treat_bold=list()

for (i in 1:length(unq_colony)){
	
	ix=unq_colony[i]
	ind=boldness$Individual[boldness$colony==ix]
	net_series=matrix(data=NA,length(ind),length(ind)) #to be series of networks t=1,...,19
	net_bold=matrix(data=NA,1,length(ind))             #to be series of boldness measurements
	
	for (jx in 1:7){ 
		for (k in 1:3){
			
			kx=unq_ass[k]
			bld=boldness[boldness$colony==ix,jx+1]
			atts=data.frame(ind,bld)
			sp1=cuddling_pool$Spider.1[cuddling_pool$Colony==ix&cuddling_pool$Week==jx&cuddling_pool$Assay.Type==kx]
			sp2=cuddling_pool$Spider.2[cuddling_pool$Colony==ix&cuddling_pool$Week==jx&cuddling_pool$Assay.Type==kx]
			
			#require(igraph) #Can be necessary to reload each iteration because of sna package's masking of functions, if GOF is calculated
			net=graph.data.frame(data.frame(sp1,sp2),vertices=atts,directed=FALSE)
			net_bold=abind(net_bold,t(as.matrix(V(net)$bld)),along=3)
			
			net_adj=as_adj(net,sparse=FALSE)
			dead=ind[is.na(bld)] #identify which spiders have died (boldness will be NA)
			
			#method of structural zeros for dead spiders, p.30 / 4.3.1
			for (d in 1:length(dead)){
				net_adj[dimnames(net_adj)[[1]]==dead[d],]=10  #10 indicates a structural zero in adjacency matrix (cannot form connections)
				net_adj[,dimnames(net_adj)[[1]]==dead[d]]=10
			}
			
			net_series=abind(net_series, net_adj, along = 3)	#bind series of networks/boldness measurements along 3rd dimension
		}
	}
	
	source_treat_net[[i]]=net_series #obtain 2-4 sequences of observations across multiple colonies, which then need to be combined into one sequence
	source_treat_bold[[i]]=net_bold
}

nodes=0 #while most colonies start with 10 spiders, a very few start with 11, so need to get suitable dimension (e.g. may be 41x41 adjacency matrix)
for (i in 1:length(unq_colony)){
	nodes=nodes+dim(source_treat_net[[i]])[1] 
}

pooled_series=array(10,dim=c(nodes,nodes,22)) #pooled array full of structural zeros 
pooled_bold=array(0,dim=c(nodes,22))

r=1
for (i in 1:length(unq_colony)){ #now place colony adjacency matrices into one large shared matrix, with structural zeros to prevent between-colony interaction
	
	toadd=dim(source_treat_net[[i]])[1]
	
	pooled_series[r:((r-1)+toadd),r:((r-1)+toadd),]=source_treat_net[[i]]		
	pooled_bold[r:((r-1)+toadd),]=source_treat_bold[[i]]
	r=r+toadd
	
}


pooled_series=pooled_series[,,2:20] #truncate empty first index
pooled_bold=pooled_bold[,2:20]

pooled_bold[is.na(pooled_bold)]=0 #where boldness is NA, zero is less confusing to RSiena (and will not be used either way because node will be structural zero)

		for (a in 1:dim(pooled_bold)[1]){ # Interpolation of boldness by experiment day. a is individual spider
			
		x_out=c(1,3,5,8,10,12,15,17,19,22,24,26,29,31,33,36,38,40,43) #list of days we want output for
		x=c(1,8,15,22,29,36,43) #list of days we actually have
		y=pooled_bold[a,c(1,4,7,10,13,16,19)] #list of boldness measurements that we have
			
		pooled_bold[a,]=approx(x,y,x_out)$y
		}

for (a in 1:dim(pooled_bold)[1]){  #put 0-600 boldness on 1-3 scale
	for (b in 1:dim(pooled_bold)[2]){

		if (pooled_bold[a,b] <= 200){pooled_bold[a,b]=1}
		if (pooled_bold[a,b] > 200 & pooled_bold[a,b] <= 400){pooled_bold[a,b]=2}
		if (pooled_bold[a,b] > 400){pooled_bold[a,b]=3}
		
	}
}

spider_cuddles=sienaDependent(pooled_series,allowOnly=FALSE) 
spider_boldness=sienaDependent(pooled_bold,type="behavior",allowOnly=FALSE)

data_siena=sienaDataCreate(spider_cuddles,spider_boldness) #create RSiena data object combining the network and behaviour data
siena_algm=sienaAlgorithmCreate(projname=namep) #default RSiena algorithm settings, specified output project file name

#		reportname=paste("data_s",s,"_t",t,sep="")      #the print01Report function provides a useful summary of the data, particular interest in the Jaccard index of tie turnover > 0.2
#		print01Report(data_siena,modelname=reportname)
#		

network_effects=getEffects(data_siena) #getEffects gets the default set of effects from the data

#effectsDocumentation(network_effects) #effectsDocumentation can be run on Windows/Mac (not the cluster) to open an HTML file listing all possible effects for the data provided

network_effects$include[26]=FALSE #turn off the default transitive triads effect
#network_effects=includeEffects(network_effects,transTriads,name="spider_cuddles",type="eval",include=FALSE) #this is the proper way of doing the above

#add in the effects of interest/necessary for good convergence. Probably do not want to do all at once before estimating model for the first time.
network_effects=includeEffects(network_effects,Jout,name="spider_cuddles",type="eval") #Jaccard similarity
network_effects=includeEffects(network_effects,egoPlusAltX,name="spider_cuddles",type="eval",interaction1="spider_boldness") #boldness on network, egoX + altX for undirected
network_effects=includeEffects(network_effects,avAlt,name="spider_boldness",type="eval",interaction1="spider_cuddles") #effect network on bold

network_effects=includeEffects(network_effects,degPlus,name="spider_cuddles",type="eval") #inPop + outAct for undirected
network_effects=includeEffects(network_effects,from.w.ind,name="spider_cuddles",type="eval",interaction1="spider_cuddles",interaction2="spider_cuddles")

#Run the RSiena model estimation model. 
# batch=TRUE turns off the GUI for use on the cluster (or otherwise - not especially useful to see)
# verbose=TRUE gives running update of what the algorithm is doing
# returnDeps=TRUE saves the simulated networks for use in the post-hoc GOF tests. This makes the file larger (e.g. around 20MB)
# useCluster=TRUE is for running on multiple CPUs
# nbrNodes specifies the number of CPUs to use. For a laptop, commonly 2 available
RS_model_t0 <- siena07(siena_algm, data=data_siena, effects=network_effects, batch=TRUE,verbose=TRUE,returnDeps=TRUE,useCluster=TRUE,nbrNodes=cpus)

#Once a model has been estimated, the network_effects object, which contains the values for each of the effects specified, can be updated to
# have new values based on the model output (provided it's going in the right direction - may need to try again)
#The below is not set up for running in a loop, but rather manual iteration:

network_effects=updateTheta(network_effects,RS_model_t0)

#Sometimes the period-specific rate constants (network or boldness, more commonly the former) can diverge from e.g. 6,7 to 100+
#This can be fixed by setting (e.g.) network_effects$fix[1]=TRUE for period 1, and set (e.g.) network_effects$initialValue[1]=20
#i.e. a cap of 20 for such network constants, which allows the rest of the estimation to proceed smoothly. Once a good overall convergence level
#has been achieved, the fix may be removed, e.g. network_effects$fix[1]=FALSE to see whether siena07 is now capable of finding a more
#meaningful value for that constant. 
#the sienaAlgorithm can be made to run for longer. n3=6000, nsub=6 is recommded in the RSiena manual for publishable results. 
#but not necessary for the first few runs - do when near a final result
siena_algm <- sienaAlgorithmCreate(projname=namep,cond=FALSE,n3=6000,nsub=6)

#One may wish to create a new RSiena model object rather than over-writing the previous one. In this way, one can make incremental changes between 
# a good model ('RS_model_t0') and an iterated model ('RS_model_t1') which may or may not turn out to be an improvement
RS_model_t1 <- siena07(siena_algm, data=data_siena, effects=network_effects, batch=TRUE,verbose=TRUE,returnDeps=TRUE,useCluster=TRUE,nbrNodes=cpus)

#save.image(name_save) #if saving the estimated model before moving onto others

#		}}} #close brackets if being done for the whole dataset (need to fix final model estimation step first)

#Once a good model has been obtained, run the GOF tests. These are hard to parallelise and run slowly (proportional to n3= x)

s.gof1a<-sienaGOF(RS_model_t0,verbose=TRUE,OutdegreeDistribution,varName="spider_cuddles")
s.gof1b<-sienaGOF(RS_model_t0,verbose=TRUE,GeodesicDistribution,varName="spider_cuddles")
s.gof1c<-sienaGOF(RS_model_t0,verbose=TRUE,TriadCensus,varName="spider_cuddles")
s.gof1d<-sienaGOF(RS_model_t0,verbose=TRUE,BehaviorDistribution,varName="spider_boldness")

#Sufficient goodness of fit is indicated by p>0.01
#The test results can be plotted, with scale=TRUE,center=TRUE sometimes useful for readability
#plot(s.gof1c,scale=TRUE,center=TRUE)

#Once models are obtained, can perform meta-analysis
#If for example, the models are saved in separate files of format s1t1_final.RData

final_models=list()

n=1
for (s in 1:3){                  #source colony
    for (t in 1:3){              #treatment: t1=allbold, t2=keystone, t3=allshy
        if (!((s==3) & (t==1))){ #necessary because s3t1 does not exist
            
            final_models[[n]]=list()
            namep=paste("s",s,"t",t,"_final.RData",sep="")
            load(namep)
            final_models[[n]]=RS_model_t0 #this is the RSiena model object
            n=n+1
        }}}

#siena08 is RSiena's function for performing meta-analysis
meta=siena08(final_models[[1]],final_models[[2]],final_models[[3]],final_models[[4]],
final_models[[5]],final_models[[6]],final_models[[7]],final_models[[8]],
maxit=100)
meta #call meta object to display results