########################################################################################################
#######################    Behavioural inference from signal processing     #######################
###################             animal-borne multi-sensor loggers             ###################
#############                       Jeantet et al. 2020                       #############

#Loading of the required R-packages
library(h5)           #package to open the dataset
library(caTools)      #package to apply the runmean function
library(changepoint)  #package to apply the PELT algorithm
library(signal)       #package to apply a smoothing filter
library(MASS)         #package of LDA
library(rpart)        #package of CART
library(rpart.plot)   #package to plot the decision tree of the CART algorithm
library(xgboost)      #package of Extreme Gradient Boosting
library(randomForest) #package of Random Forest
library(e1071)        #package of SVM

#### Data downloading
#The data have to be downloaded from the DOI: https://doi.org/10.5061/dryad.hhmgqnkd9
#It is recommended to put all the behaviour files in the same folder named "Behaviour"
#and the raw data files in h5 format in the same folder named "Acceleration_h5".

#personal working directory containing the downloaded data
setwd("~/Martinique/Analyse")
#names of the folders containing the Behaviour and h5 files
fileBehaviours<-dir("Behaviour", full.names = T)
fileh5s<-dir("Acceleration_h5", full.names = T)

########################### Visualisation #################################
##installation of the rblt package
install.packages("devtools")
devtools::install_github("sebastien45/rblt")
library(rblt)

#To visualise the raw acceleration and gyroscope signals WITHOUT correction for one individual
i=1 #choose the individual
fileBehaviour<-fileBehaviours[i]
fileh5<-fileh5s[i]
#Visualisation
ll=LoggerList$new()
ll$add(LoggerData$new(fileh5, filebehavior=fileBehaviour,besep=";",besaturation=0.7))
ui=LoggerUI$new(ll)
ui$gui()

########################### Part I ############################
########### Calculation of descriptive statistics from the raw data #################

Model_dive<-list()  #creates a list to contain the descriptive statistics of the diving behaviours
Model_surf<-list()  #creates a list to contain the data of surface behaviours
Dict_behav<-list()  #creates a list to contain the number associated with each behaviour
freq<-20            #sampling frequency of the data (Hz)
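## NOTE: in addition to the packages loaded above, this script calls helper functions that are not
## exported by any of them (h5read as used below, correction, Data_computation, cut_behav_first,
## cut_behav_fin, Selection, Predictors_computation, Freq, Accuracy, the f* classifier wrappers,
## Budget_fun, Prediction_surface). They must be defined or sourced before running the script.
## As an illustration only, the sketch below shows one common way of deriving a Dynamic Body
## Acceleration (DBA) and a Rotational Activity (RA) proxy from tri-axial accelerometer and
## gyroscope channels with a running mean (caTools::runmean); the column names Ax:Az and Gx:Gz are
## assumptions, and the actual Data_computation() used in Part I may differ.
sketch_DBA_RA <- function(data, window_s = 2, freq = 20) {
  acc <- as.matrix(data[, c("Ax", "Ay", "Az")])     #tri-axial acceleration (assumed column names)
  gyr <- as.matrix(data[, c("Gx", "Gy", "Gz")])     #tri-axial angular speed (assumed column names)
  k   <- window_s * freq                            #smoothing window in samples
  static <- apply(acc, 2, caTools::runmean, k = k)  #static (gravitational) component per axis
  DBA <- rowSums(abs(acc - static))                 #ODBA-style dynamic body acceleration
  RA  <- sqrt(rowSums(gyr^2))                       #norm of the angular speed vector
  cbind(DBA = DBA, RA = RA)
}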
#For each individual, we separate the surface behaviours from the diving ones and calculate the descriptive statistics of the diving behaviours
for(i in 1:16){
  fileh5<-fileh5s[i]
  fileBehaviour<-fileBehaviours[i]
  print(c(fileh5,i))
  Tortue<-h5read(fileh5,fileBehaviour,freq)             #opens the file containing the raw data and synchronises the behaviours
  Tortue<-as.data.frame(cbind(Tortue[[1]],Tortue[[2]])) #combines the raw data and the synchronised behaviours

  ###Correction of the acceleration and angular speed tilt: the correction is based on raw data recorded when the animal is inactive, lying flat
  if (i==1){Tortue<-correction(Tortue,1)}
  if (i==2){Tortue<-correction(Tortue,1)}
  if (i==3){Tortue<-correction(Tortue,1)}
  if (i==4){Tortue<-correction(Tortue,1)}
  if (i==5){Tortue<-correction(Tortue,20950)}
  if (i==6){Tortue<-correction(Tortue,34580)}
  if (i==7){Tortue<-correction(Tortue,4440)}
  if (i==11){Tortue<-correction(Tortue,19060)}
  if (i==12){Tortue<-correction(Tortue,34440)}
  if (i==13){Tortue<-correction(Tortue,1)}
  if (i==14){Tortue<-correction(Tortue,16050)}
  if (i==15){Tortue<-correction(Tortue,13400)}
  if (i==16){Tortue<-correction(Tortue,48550)}

  ###Computation of the Dynamic Body Acceleration (DBA) and Rotational Activity (RA)
  X<-Data_computation(Tortue[,1:6],2,freq)
  Tortue<-cbind(Tortue[,1:6],X,Tortue[,7:9])
  rm(X)

  #Keeping only the labeled data
  Tortue<-cut_behav_first(Tortue,freq,10) #Removes the unlabeled raw data 10 seconds before the first behavioural label
  Tortue<-cut_behav_fin(Tortue,freq,30)   #Removes the unlabeled raw data 30 seconds after the last behavioural label

  #Separation of the surface behaviours from the diving behaviours and numbering of the dives
  Tortue$Depth=-Tortue$Depth
  Tortue$num<- ceiling(cumsum(abs(c(0, diff(Tortue$Depth>0.3))))/2)
  Tortue$num[Tortue$Depth<0.3]<-0
  n.dive<-max(Tortue$num)
  print(n.dive)

  #Clustering the behaviours by category
  Tortue<-Selection(Tortue,11)

  #Replacing the behaviours with numbers to facilitate the processing
  #and recording the number associated with each behaviour
  Dict_behav[[i]]<-levels(as.factor(Tortue$Behaviour))
  Tortue$Behaviour<-as.numeric(as.factor(Tortue$Behaviour))
  n.behav<-length(Dict_behav[[i]])

  #Segmentation and calculation of the descriptive statistics
  Tortue_p<-Predictors_computation(Tortue,n.dive,freq,Dict_behav[[i]],n.behav)

  #Storage of the predictors and indication of the corresponding individual's number (as several files can correspond to one individual)
  if(i<3){j=i}
  if(i==3 | i==4){j=3}
  if(i>4 & i<8){j=i-1}
  if(i==8 | i==9 | i==10){j=7}
  if(i>10){j=i-3}
  Model_dive[[i]]<-cbind(j,Tortue_p[[1]])                        #stores the predictors of the diving behaviours with the associated individual's number
  Model_surf[[i]]<-cbind(j,Tortue_p[[2]][Tortue_p[[2]]$num==0,]) #stores the surface periods with the associated individual's number (depth<0.3m or dive_duration<5s)
}
rm(Tortue,Tortue_p,nbrow,n.dive,i,j,freq,fileh5,fileh5s,fileBehaviour,fileBehaviours,datestart)

#Replace the number by the associated behaviour
for (i in 1:length(Model_dive)){
  Predictors<-as.data.frame(Model_dive[[i]])
  for (j in 1:length(Dict_behav[[i]])){
    Predictors[Predictors$Behaviour==j,4]=Dict_behav[[i]][j]
  }
  Predictors[Predictors$Behaviour==(length(Dict_behav[[i]])+1),4]<-"Transition"
  Model_dive[[i]]<-Predictors
  rm(Predictors)
}

#Combining Model_dive by individual
Model_dive[[3]]<-rbind(Model_dive[[3]],Model_dive[[4]])
Model_dive[[4]]<-Model_dive[[5]]
Model_dive[[5]]<-Model_dive[[6]]
Model_dive[[6]]<-Model_dive[[7]]
Model_dive[[7]]<-rbind(Model_dive[[8]],Model_dive[[9]],Model_dive[[10]])
for (j in 8:13){
  Model_dive[[j]]<-Model_dive[[j+3]]
}
Model_dive<-Model_dive[1:13]

#Combining Model_surf by individual
Model_surf[[3]]<-rbind(Model_surf[[3]],Model_surf[[4]])
Model_surf[[4]]<-Model_surf[[5]]
Model_surf[[5]]<-Model_surf[[6]]
Model_surf[[6]]<-Model_surf[[7]]
Model_surf[[7]]<-rbind(Model_surf[[8]],Model_surf[[9]],Model_surf[[10]])
for (j in 8:13){
  Model_surf[[j]]<-Model_surf[[j+3]]
}
Model_surf<-Model_surf[1:13]
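## The Accuracy() helper used throughout Part II is supplied with the authors' companion functions.
## As a hedged illustration of the indicators it is expected to report (overall accuracy plus
## per-behaviour Accuracy, Sensitivity, Precision and Specificity computed from a confusion matrix
## with predictions in rows and observations in columns), a minimal sketch could look like the
## function below; the authors' implementation and return format may differ.
Accuracy_sketch <- function(cf, name.behav) {
  #assumes a square confusion matrix with the same behaviour ordering in rows and columns,
  #and name.behav giving the behaviour labels in that order
  cf <- as.matrix(cf)
  TP <- diag(cf)                       #correctly predicted segments per behaviour
  FP <- rowSums(cf) - TP               #predicted as the behaviour but observed as another
  FN <- colSums(cf) - TP               #observed as the behaviour but predicted as another
  TN <- sum(cf) - TP - FP - FN
  indicators <- data.frame(Behaviour   = name.behav,
                           Accuracy    = (TP + TN) / sum(cf),
                           Sensitivity = TP / (TP + FN),
                           Precision   = TP / (TP + FP),
                           Specificity = TN / (TN + FP))
  list(indicators, sum(TP) / sum(cf))  #per-behaviour indicators and overall accuracy
}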
############################### Part II ##################################
############# Prediction of diving behaviours using supervised machine learning algorithms #################

###Selection of the combinations in which "Feeding" and "Scratching" were not under-represented in the training dataset,
#i.e. in which the training set retains more than 60% of their respective total number of segments
size<-choose(13,4)                #number of possible combinations of 4 test individuals out of the 13 individuals
train<-do.call(rbind, Model_dive) #combines all the segments
total_feeding<-Freq(train[,"Behaviour"])[1]    #retrieves the total number of "Feeding" segments
total_scratching<-Freq(train[,"Behaviour"])[5] #retrieves the total number of "Scratching" segments
name.behav<-unique(train[,"Behaviour"])
index<-c() #to store all the combinations in which "Feeding" and "Scratching" were not under-represented

#For each combination, we check whether the training set retains more than 60% of the total number of "Feeding" and "Scratching" segments
for (k in 1:size){
  test_set<-combn(13,4)[,k]
  training_set=seq(13)[-test_set]
  train<-do.call(rbind, Model_dive[training_set])
  test<- do.call(rbind, Model_dive[test_set])
  test_ind<-test[,c(1:3,5:6)]
  train<-train[,-c(1:3)]
  test<-test[,-c(1:3)]
  X<-Freq(train[,"Behaviour"])
  if (X[1]>(0.60*total_feeding) & X[5]>(0.60*total_scratching)){index<-c(index,k)}
}

###Creation of the data frames to store the results
n<-length(index)
vec_names<-c("CART","SVM","LDA","EGB","RF","VE","WS")
vec_index<-c("Accuracy","Sensitivity","Precision","Specificity")

#Data frame to store the accuracy of each classifier
table_accuracy=data.frame(matrix(0, ncol = 7))
colnames(table_accuracy)=vec_names

#List to store the indicators of accuracy of each classifier for each behaviour
Behaviour<-list()
temp=matrix(1:28, ncol=7)
for (i in 1:7){
  Behaviour[[i]]<-data.frame(matrix(0, nrow = n, ncol = 28))
  for (j in 1:7){
    colnames(Behaviour[[i]])[temp[,j]]<-paste(vec_names[j], vec_index, sep="_")
  }
}

#List of data frames to store the time budgets (one per individual)
budget_WS<-list()
for(i in 1:13){
  budget_WS[[i]]<-data.frame(matrix(0,nrow=n, ncol=7))
  colnames(budget_WS[[i]])<-c("Feeding","Gliding","Resting","Scratching","Swimming","Transition","Other")
}

w<-1 #counter over the selected combinations
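## The classifier wrappers called in the loop below (fCART, fSVM, fLDA, fExtreme_Gradient_Boosting,
## fRandom_forest) are supplied with the authors' companion functions. To indicate the kind of
## object they are expected to return (a confusion matrix and the test-set predictions), here is a
## hypothetical minimal wrapper around randomForest; the third argument is interpreted here as
## mtry, but the authors' wrapper may use it differently.
fRandom_forest_sketch <- function(train, test, mtry) {
  #assumes the class label is in the "Behaviour" column of both data frames
  model <- randomForest::randomForest(Behaviour ~ ., data = train, ntree = 500, mtry = mtry)
  pred  <- predict(model, newdata = test)  #predicted behaviours of the test segments
  cf    <- table(pred, test$Behaviour)     #predictions in rows, observations in columns
  list(cf, pred)
}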
#####Let's go
##For each selected combination, we train and test the seven classifiers and record the time budgets of the tested individuals
for (k in index){
  print(w)

  #Retrieving the indices of the individuals used for the training set and of those kept for the test set
  test_set<-combn(13,4)[,k]
  training_set=seq(13)[-test_set]

  #Combining the data of the individuals into one training set and one test set according to the k-th combination
  train<-do.call(rbind, Model_dive[training_set])
  test<- do.call(rbind, Model_dive[test_set])
  test_ind<-test[,c(1:3,5)]
  train<-train[,-c(1:3)]
  test<-test[,-c(1:3)]
  train$Behaviour<-as.factor(train$Behaviour)
  test$Behaviour<-as.factor(test$Behaviour)

  #Calculation of the number of segments associated with each behaviour (not essential)
  Freq(train$Behaviour)
  Freq(test$Behaviour)
  nclass<-length(unique(train$Behaviour))
  name.behav<-levels(train$Behaviour)

  ###Balancing the training dataset
  #we randomly keep 1000 segments for the over-represented categories
  n.col=length(train)
  transition<-data.frame()
  g<-data.frame(matrix(ncol=(n.col)+1))
  train$num<-as.numeric(train$Behaviour)
  colnames(g)<-colnames(train)
  for (i in 1:length(unique(train$num))){
    transition<-train[train$num==i,]
    n<-length(transition$num)
    if (n>1000){
      alea<-runif(n)
      rang<-rank(alea)
      rang<-rang[1:1000]        #1000 row indices drawn at random without replacement
      transition<-transition[rang,]
    }
    g<-rbind(g,transition)
  }
  g<-na.omit(g)
  train<-g[,-44]                #removes the temporary "num" column
  rm(g,transition)
  train$Behaviour<-as.factor(train$Behaviour)
  test$Behaviour<-as.factor(test$Behaviour)

  ###Prediction of the diving behaviours by the supervised machine learning algorithms
  #For each algorithm:
  #the associated function builds a model from "train" and predicts the behaviours of "test";
  #the predictions of the test dataset are returned and stored in "pred_" for the ensemble classifiers;
  #from the confusion matrix, the function Accuracy calculates the accuracy of the algorithm as well as the indicators Precision, Sensitivity and Specificity associated with each behaviour.

  #Gradient Boosting
  cf_EGB<-fExtreme_Gradient_Boosting(train,test,nclass)
  pred_EGB<-(cf_EGB[[2]])+1     #xgboost class predictions are 0-based, hence the +1 to match the 1-based behaviour numbers
  Gradient<-Accuracy(cf_EGB[[1]],nclass,name.behav)

  #Random Forest
  cf_RF<-fRandom_forest(train,test,14)
  pred_RF<-cf_RF[[2]]
  Random<-Accuracy(cf_RF[[1]],nclass,name.behav)

  #CART
  cf_CART<-fCART(train,test)
  pred_CART<-cf_CART[[2]]
  CART<-Accuracy(cf_CART[[1]],nclass,name.behav)

  #Support Vector Machine
  cf_SVM<-fSVM(train,test)
  pred_SVM<-cf_SVM[[2]]
  SVM<-Accuracy(cf_SVM[[1]],nclass,name.behav)

  #Linear Discriminant Analysis
  cf_LDA<-fLDA(train,test)
  pred_LDA<-cf_LDA[[2]]
  LDA<-Accuracy(cf_LDA[[1]],nclass,name.behav)

  #Storage and comparison of the accuracies of each algorithm
  accuracy<-c(CART[[2]],SVM[[2]],LDA[[2]],Gradient[[2]],Random[[2]])
  print(accuracy)

  ###Creation of the ensemble classifiers: Voting Ensemble and Weighted Sum
  #Combines the predictions (in a numeric format) of the 5 algorithms with the observed behaviours
  Prediction<-as.numeric(as.factor(test$Behaviour))
  Prediction<-cbind(Prediction,pred_CART,pred_SVM,pred_LDA,pred_EGB,pred_RF)
  Prediction<-as.data.frame(Prediction)
  n<-length(Prediction[,1])

  ##The Weighted Sum
  #Combines the Precision values of each behaviour of each algorithm into a matrix
  Precision<-rbind(CART[[1]][,7],SVM[[1]][,7],LDA[[1]][,7],Gradient[[1]][,7],Random[[1]][,7])
  Precision[is.na(Precision)]<- 0
  Precision<-as.matrix(Precision)
  pred_WS<-c() #vector containing the predictions of the Weighted Sum ensemble method

  #for each segment (corresponding to each row)
  for (i in 1:n){
    pred<-as.numeric(Prediction[i,-1]) #the predictions of the five algorithms for the i-th segment
    #Detects which behaviour is predicted by each algorithm
    P=1*outer(1:5,1:8,function (a,b) pred[a]==b)
    P<-t(P)
    P<-as.matrix(P)
    #Calculates the weight of each behaviour according to the Precision indicator of the associated algorithm
    Pred<-P%*%Precision
    a<-diag(Pred)
    #Calculates how many times each behaviour is predicted by the five algorithms for the i-th segment
    sum<-apply(P,1,sum)
    #If several behaviours obtain the same weight, we fall back on how many times each has been predicted by the five algorithms
    if (length(which(a==max(a)))<2){pred_WS<-c(pred_WS,which(a==max(a)))
    }else if (length(which(sum==max(sum) & (a==max(a))))>0){pred_WS<-c(pred_WS,which(sum==max(sum) & (a==max(a))))
    }else{pred_WS<-c(pred_WS,which(sum==max(sum)))}
    #print(c(i,length(pred_WS)))
  }
  Prediction<-cbind(Prediction,pred_WS) #Adds the predictions of the WS method

  ##The Voting Ensemble
  #for each segment
  for (i in 1:n){
    pred<-as.numeric(Prediction[i,c(2:6)]) #the predictions of the five algorithms for the i-th segment
    Prediction[i,8]<-as.numeric(names(sort(table(pred),decreasing=TRUE)[1])) #selects the behaviour most often predicted by the five algorithms
  }
  colnames(Prediction)[8]<-"pred_VE"

  ##Renaming the predictions of the 7 classifiers and the observed behaviours (numbers back to behaviour labels)
  #The observed behaviours
  n<-length(levels(test$Behaviour))
  for (i in 1:n){Prediction[Prediction$Prediction==i,1]<-levels(test$Behaviour)[i]}
  #The predicted behaviours
  n<-length(levels(pred_RF))
  for (i in 1:n){
    Prediction[Prediction$pred_CART==i,2]<-levels(pred_CART)[i]
    Prediction[Prediction$pred_SVM==i,3]<-levels(pred_SVM)[i]
    Prediction[Prediction$pred_LDA==i,4]<-levels(pred_LDA)[i]
    Prediction[Prediction$pred_EGB==i,5]<-levels(pred_RF)[i]
    Prediction[Prediction$pred_RF==i,6]<-levels(pred_RF)[i]
    Prediction[Prediction$pred_WS==i,7]<-levels(pred_RF)[i]
    Prediction[Prediction$pred_VE==i,8]<-levels(pred_RF)[i]
  }

  #Creation of the confusion matrices of the ensemble learners
  #From the confusion matrix, the function Accuracy calculates the accuracy of the ensemble learners as well as the indicators Precision, Sensitivity and Specificity associated with each behaviour.
  cf_VE<-table(Prediction$pred_VE,test$Behaviour)
  cf_WS<-table(Prediction$pred_WS,test$Behaviour)
  Voting<-Accuracy(cf_VE,nclass,name.behav)
  Weighted<-Accuracy(cf_WS,nclass,name.behav)

  #Comparison of the accuracies of each method
  accuracy_full<-c(accuracy,Voting[[2]],Weighted[[2]])
  print(accuracy_full)

  #Storage of the accuracies associated with the k-th combination
  table_accuracy[w,]=accuracy_full

  #Storage of the four indicators (Accuracy, Sensitivity, Precision, Specificity) of each behaviour of each classifier associated with the k-th combination
  for (g in 1:7){
    Behaviour[[g]][w,1:4]<-CART[[1]][g,5:8]
    Behaviour[[g]][w,5:8]<-SVM[[1]][g,5:8]
    Behaviour[[g]][w,9:12]<-LDA[[1]][g,5:8]
    Behaviour[[g]][w,13:16]<-Gradient[[1]][g,5:8]
    Behaviour[[g]][w,17:20]<-Random[[1]][g,5:8]
    Behaviour[[g]][w,21:24]<-Voting[[1]][g,5:8]
    Behaviour[[g]][w,25:28]<-Weighted[[1]][g,5:8]
  }

  #Storage of the time budgets of the tested individuals
  for (l in test_set){
    individu<-test_ind[,1]==l
    prediction<-cbind(test_ind[individu,],pred_WS[individu])
    colnames(prediction)[5]<-"Pred_WS"
    for (i in 1:7){prediction[prediction$Pred_WS==i,5]<-levels(test$Behaviour)[i]}
    budget_WS[[l]][w,"Feeding"]<-sum(prediction[prediction$Pred_WS=="Feeding",4])
    budget_WS[[l]][w,"Gliding"]<-sum(prediction[prediction$Pred_WS=="Gliding",4])
    budget_WS[[l]][w,"Resting"]<-sum(prediction[prediction$Pred_WS=="Resting",4])
    budget_WS[[l]][w,"Scratching"]<-sum(prediction[prediction$Pred_WS=="Scratching",4])
    budget_WS[[l]][w,"Swimming"]<-sum(prediction[prediction$Pred_WS=="Swimming",4])
    budget_WS[[l]][w,"Transition"]<-sum(prediction[prediction$Pred_WS=="Transition",4])
    budget_WS[[l]][w,"Other"]<-sum(prediction[prediction$Pred_WS=="Other",4])
  }

  w<-w+1
}
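## Optional summary (not part of the original workflow): once the loop has finished, the overall
## accuracy of each classifier across the selected combinations can be compared with, for example:
round(rbind(mean = colMeans(table_accuracy),
            sd   = apply(table_accuracy, 2, sd)), 3)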
############################### Part III ##################################
############# Identification of the surface behaviours and calculation of the time budget by individual #################

#Calculation of the mean time budget of the 13 individuals from the selected combinations
freq=20
Budget_WS<-list()
for (i in 1:13){Budget_WS[[i]]<-Budget_fun(budget_WS,i,mean)}
Budget_WS<-do.call(rbind,Budget_WS)
Budget=as.data.frame(matrix(nrow=13,ncol=9))
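## Prediction_surface() (companion functions) assigns "Breathing" or "Staying at the surface" to
## each surface period from its duration. As an illustration of that duration rule only, a
## hypothetical labelling function is sketched below; the 10 s threshold is a placeholder and the
## actual threshold and input/output format are defined in the authors' implementation.
label_surface_sketch <- function(duration_s, threshold_s = 10) {
  ifelse(duration_s < threshold_s, "Breathing", "Staying at the surface")
}
#e.g. label_surface_sketch(c(3, 45)) returns "Breathing" "Staying at the surface"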
#Identification of the surface behaviours of each individual
for (i in 1:13){
  Surface<-Model_surf[[i]] #gets back the surface periods of individual i
  #Calculates the duration of the surface periods and associates the behaviour "Breathing" or
  #"Staying at the surface" with each surface period according to its duration
  surface_behaviours<-Prediction_surface(Surface,freq)
  #Calculates the total time allocated to both behaviours
  budget_breath<-sum(surface_behaviours[surface_behaviours$Pred=="Breathing",3])
  budget_stay<-sum(surface_behaviours[surface_behaviours$Pred=="Staying at the surface",3])
  #Combines the diving and surface time budgets
  Budget[i,1:7]<-Budget_WS[i,]
  Budget[i,8]<-budget_breath
  Budget[i,9]<-budget_stay
}
colnames(Budget)[1:7]<-colnames(Budget_WS)
colnames(Budget)[8:9]<-c("Breathing","Staying at the surface")
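## Optional (not part of the original workflow): assuming all nine columns of Budget are durations
## expressed in the same unit, each individual's time budget can be converted to percentages of its
## total labelled time for easier comparison between individuals:
Budget_percent <- round(100 * Budget / rowSums(Budget), 1)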