##############################################################################################
#
# Filtering function by pair of OTU prevalence
# for abundance data and for presence/absence data
#
# INRA Theix, 15/11/2018
#
# Rarity of microbial species: In search of reliable associations
# PLOS ONE
#
# Arnaud Cougoul, Xavier Bailly, Gwenaël Vourch, Patrick Gasqui
#
##############################################################################################

##############################################################################################
# Filtering function for abundance data
# otu_table: a matrix of count data with samples per row and species per column
# risk: alpha level for testability
# return a d x d numeric matrix (d = number of columns of otu_table) whose values are
#   equal to 1 for OTU pairs fully testable and 0 if not; the diagonal is set to 1.
# A table with fewer than two OTUs has no pair to evaluate, so only the diagonal is returned.
filter_by_pair <- function(otu_table, risk = 0.05) {
  # Prevalence = fraction of samples in which each OTU has a non-zero count.
  prevalence <- unname(colMeans(otu_table > 0))
  n_samples <- nrow(otu_table)
  n_otus <- ncol(otu_table)

  res <- matrix(0, ncol = n_otus, nrow = n_otus)

  # Guarding on the number of OTUs avoids the backwards 1:(d - 1) sequence that
  # made the loop-based version crash on single-column input.
  if (n_otus >= 2) {
    if (n_samples < 3) {
      stop("otu_table must have at least 3 samples (rows)", call. = FALSE)
    }
    if (anyNA(prevalence)) {
      stop("otu_table must not contain missing values", call. = FALSE)
    }

    # K is derived from the two-sided Student t quantile with n_samples - 2
    # degrees of freedom at level `risk`.
    t_crit <- qt(1 - risk / 2, n_samples - 2, lower.tail = TRUE)
    k <- t_crit^2 / (n_samples - 2 + t_crit^2)

    # A pair (i, j) is flagged testable when prevalence[j] exceeds this bound
    # evaluated at prevalence[i].
    f1 <- function(x) (1 - x) / (1 + ((1 - k) / k) * x)

    testable <- outer(prevalence, prevalence, function(x, y) f1(x) < y)

    # Only pairs with i < j are evaluated; the result is then mirrored.
    upper <- upper.tri(res)
    if (anyNA(testable[upper])) {
      stop("testability could not be evaluated; check `risk` and `otu_table`",
           call. = FALSE)
    }
    res[upper] <- as.numeric(testable[upper])
    res <- res + t(res)
  }

  diag(res) <- 1
  res
}

##############################################################################################
# Filtering function for presence/absence data
# otu_table: a matrix of count data with samples per row and species per column
# risk: alpha level for testability
# return a character matrix of testability labels for each OTU pair (NA on the diagonal).
filter_by_pair_binary <- function(otu_table, risk = 0.05) {
  # Classify every OTU pair by which association sign its prevalences leave
  # testable, for presence/absence data.
  #
  # otu_table: samples per row, species per column; binarised here via `> 0`.
  # risk: alpha level for testability.
  # Returns a symmetric d x d character matrix (d = number of columns) with NA
  # on the diagonal and one of these labels elsewhere:
  #   "Fully_testable" - neither bound is violated
  #   "Positive_only"  - the f1/f2 bound is violated
  #   "Negative_only"  - the f3/f4 bound is violated
  #   "Non_testable"   - both bounds are violated
  prevalence <- unname(colMeans(otu_table > 0))
  n_samples <- nrow(otu_table)
  n_otus <- ncol(otu_table)

  labels <- matrix("Fully_testable", ncol = n_otus, nrow = n_otus)
  diag(labels) <- NA

  # Fewer than two OTUs means no pair to classify. Returning here avoids the
  # backwards 1:(d - 1) sequence that crashed the loop-based version.
  if (n_otus < 2) {
    return(labels)
  }
  if (n_samples < 3) {
    stop("otu_table must have at least 3 samples (rows)", call. = FALSE)
  }
  if (anyNA(prevalence)) {
    stop("otu_table must not contain missing values", call. = FALSE)
  }

  # NOTE(review): the previous code first computed K = qchisq(1 - risk, 1) / N
  # and immediately overwrote it with the t-based value below. The t-based
  # threshold is the one that was effectively used, so it is kept; confirm
  # which was intended.
  t_crit <- qt(1 - risk / 2, n_samples - 2, lower.tail = TRUE)
  k <- t_crit^2 / (n_samples - 2 + t_crit^2)

  # Prevalence bounds; each is evaluated at prevalence[i] and compared with
  # prevalence[j].
  f1 <- function(x) (1 - x) / (1 + ((1 - k) / k) * x)
  f2 <- function(x) (-1 + x) / (-1 + (1 - k) * x)
  f3 <- function(x) x / (k + (1 - k) * x)
  f4 <- function(x) x / (1 / k + ((k - 1) / k) * x)

  positive_only <- outer(
    prevalence, prevalence,
    function(x, y) f1(x) > y | f2(x) < y
  )
  negative_only <- outer(
    prevalence, prevalence,
    function(x, y) f3(x) < y | f4(x) > y
  )

  # Only pairs with i < j are classified; the lower triangle is mirrored after.
  upper <- upper.tri(labels)
  if (anyNA(positive_only[upper]) || anyNA(negative_only[upper])) {
    stop("testability could not be evaluated; check `risk` and `otu_table`",
         call. = FALSE)
  }

  # Merge the two flags. The previous version compared against the misspelled
  # label "Positive.only" and never copied "Negative_only" into the result, so
  # neither "Non_testable" nor "Negative_only" could ever be returned.
  labels[upper & positive_only & !negative_only] <- "Positive_only"
  labels[upper & !positive_only & negative_only] <- "Negative_only"
  labels[upper & positive_only & negative_only] <- "Non_testable"

  lower <- lower.tri(labels)
  labels[lower] <- t(labels)[lower]
  labels
}