##############################################################################################
# 
# Filtering function by pair of OTU prevalence 
# for abundance data and for presence /absence data
#
# INRA Theix, le 15/11/2018
# 
# Rarity of microbial species: In search of reliable associations
# PLOS ONE
# 
# Arnaud Cougoul, Xavier Bailly , Gwenaël Vourch, Patrick Gasqui
#
##############################################################################################


##############################################################################################
# Filtering function for abundance data
# otu_table: a matrix of count data with samples per row and species per column
# risk: alpha level for testability
# return a matrix whose values are equal to 1 for OTU pairs fully testable and 0 if not.

filter_by_pair <- function(otu_table, risk = 0.05) {
  # Flag, for every pair of OTUs, whether the pair passes the prevalence-based
  # testability filter for abundance data.
  #
  # Args:
  #   otu_table: matrix of counts, samples in rows and OTUs in columns.
  #   risk:      alpha level used for the two-sided critical value.
  #
  # Returns:
  #   A d x d numeric matrix (d = number of OTUs) holding 1 for pairs that
  #   pass the filter and 0 otherwise; the diagonal is always 1.

  # Prevalence = fraction of samples in which each OTU has a non-zero count.
  prevalence <- colMeans(otu_table > 0)
  n_samples <- nrow(otu_table)
  n_otus <- ncol(otu_table)

  # The critical value uses n_samples - 2 degrees of freedom, so fewer than
  # three samples cannot yield a usable threshold.
  if (n_samples <= 2) {
    stop("`otu_table` must contain at least 3 samples (rows).", call. = FALSE)
  }
  if (anyNA(prevalence)) {
    stop("`otu_table` must not contain missing values.", call. = FALSE)
  }

  # k = t^2 / (N - 2 + t^2): the squared correlation whose t statistic
  # (N - 2 degrees of freedom) equals the two-sided critical value.
  t_crit <- qt(1 - risk / 2, n_samples - 2, lower.tail = TRUE)
  k <- t_crit^2 / (n_samples - 2 + t_crit^2)

  # Smallest partner prevalence that keeps a pair testable, per OTU.
  min_partner <- (1 - prevalence) / (1 + ((1 - k) / k) * prevalence)

  # passes[i, j] is TRUE when min_partner[i] < prevalence[j]. Only the upper
  # triangle (i < j) is used and then mirrored, which also handles tables
  # with a single OTU column (no pairs at all).
  passes <- outer(min_partner, prevalence, "<")
  res <- matrix(0, nrow = n_otus, ncol = n_otus)
  upper <- upper.tri(res)
  res[upper] <- as.numeric(passes[upper])

  res <- res + t(res)
  diag(res) <- 1
  res
}


##############################################################################################
# Filtering function for presence/absence data
# otu_table: a matrix of count data with samples per row and species per column
# risk: alpha level for testability
# return a symmetric character matrix (NA on the diagonal) labelling each OTU pair as
# "Fully_testable", "Positive_only", "Negative_only" or "Non_testable".

filter_by_pair_binary <- function(otu_table, risk = 0.05) {
  # Label, for every pair of OTUs, which association signs pass the
  # prevalence-based testability filter for presence/absence data.
  #
  # Args:
  #   otu_table: matrix with samples in rows and OTUs in columns; any value
  #              greater than 0 counts as "present".
  #   risk:      alpha level used for the two-sided critical value.
  #
  # Returns:
  #   A symmetric d x d character matrix (d = number of OTUs) whose entries
  #   are "Fully_testable", "Positive_only", "Negative_only" or
  #   "Non_testable"; the diagonal is NA.

  # Prevalence = fraction of samples in which each OTU is present.
  prevalence <- colMeans(otu_table > 0)
  n_samples <- nrow(otu_table)
  n_otus <- ncol(otu_table)

  # The critical value uses n_samples - 2 degrees of freedom, so fewer than
  # three samples cannot yield a usable threshold.
  if (n_samples <= 2) {
    stop("`otu_table` must contain at least 3 samples (rows).", call. = FALSE)
  }
  if (anyNA(prevalence)) {
    stop("`otu_table` must not contain missing values.", call. = FALSE)
  }

  # k = t^2 / (N - 2 + t^2), the same threshold as in the abundance filter.
  # NOTE(review): the previous version first computed a chi-square based
  # threshold, qchisq(1 - risk, df = 1) / N, and immediately overwrote it
  # with this t-based value. The effective (t-based) behaviour is kept here;
  # confirm which threshold was intended.
  t_crit <- qt(1 - risk / 2, n_samples - 2, lower.tail = TRUE)
  k <- t_crit^2 / (n_samples - 2 + t_crit^2)

  # Per-OTU bounds on the partner's prevalence.
  bound1 <- (1 - prevalence) / (1 + ((1 - k) / k) * prevalence)
  bound2 <- (-1 + prevalence) / (-1 + (1 - k) * prevalence)
  bound3 <- prevalence / (k + (1 - k) * prevalence)
  bound4 <- prevalence / (1 / k + ((k - 1) / k) * prevalence)

  # flag[i, j] compares the bound of OTU i with the prevalence of OTU j.
  positive_only <- outer(bound1, prevalence, ">") |
    outer(bound2, prevalence, "<")
  negative_only <- outer(bound3, prevalence, "<") |
    outer(bound4, prevalence, ">")

  # Combine both flags into one label. Previously "Negative_only" was never
  # copied into the result, and "Non_testable" was never assigned because of
  # a misspelled label ("Positive.only") in the comparison.
  res <- matrix("Fully_testable", nrow = n_otus, ncol = n_otus)
  res[positive_only & !negative_only] <- "Positive_only"
  res[negative_only & !positive_only] <- "Negative_only"
  res[positive_only & negative_only] <- "Non_testable"

  # Labels are defined on the upper triangle (i < j); mirror them below.
  lower <- lower.tri(res)
  res[lower] <- t(res)[lower]
  diag(res) <- NA
  res
}