Supplemental R code 2: R code adapted from Shade & Stopnisek (2019) to determine core ASVs.

# Save file as .R file and open in R or RStudio
# Required data can be found at https://figshare.com/articles/dataset/ta-object_leaf_side/21524538
# R code used for determining core ASVs, adapted from Shade, Ashley, and Nejc Stopnisek.
# "Abundance-occupancy distributions to prioritize plant core microbiome membership."
# Current Opinion in Microbiology 49 (2019): 50-58.

library(reshape2)
library(tidyamplicons)
library(tidyverse)

theme_set(theme_light())
setwd("H:/Postdoc/Research/Publications/pH project")
set.seed(334)

#-----------------------------------------------------------------------------------------

load("runclean.Rdata")

nReads = 4000   # input dataset needs to be rarefied and the rarefaction depth included

runf <- run %>%
  filter_abundances(abundance > 2) %>%
  add_lib_size() %>%
  filter_samples(lib_size > (nReads - 1)) %>%
  rarefy(nReads) %>%
  add_total_rel_abundance() %>%
  add_rel_occurrence()

otu <- t(as_abundances_matrix(runf$abundances))
map <- runf$samples
#otu <- readRDS('switchgrassOTUtable.rds')
#map <- readRDS('switchgrassMAPtable.rds')

library(vegan)

otu_PA <- 1*((otu > 0) == 1)                                             # presence-absence data
otu_occ <- rowSums(otu_PA)/ncol(otu_PA)                                  # occupancy calculation
otu_rel <- apply(decostand(otu, method = "total", MARGIN = 2), 1, mean)  # mean relative abundance
occ_abun <- add_rownames(as.data.frame(cbind(otu_occ, otu_rel)), 'otu')  # combining occupancy and abundance into one data frame

PresenceSum <- data.frame(otu = as.factor(row.names(otu)), otu) %>%
  gather(sample_id, abun, -otu) %>%
  left_join(map, by = 'sample_id') %>%
  group_by(otu, Plant_sample) %>%
  summarise(site_freq = sum(abun > 0)/length(abun),       # frequency of detection within a plant sample
            coreSite = ifelse(site_freq == 1, 1, 0)) %>%  # 1 only if occupancy is 1 within a specific plant sample, 0 if not
  group_by(otu) %>%
  summarise(sumF = sum(site_freq),
            sumG = sum(coreSite),
            nS = length(Plant_sample)*2,
            Index = (sumF + sumG)/nS)

otu_ranked <- occ_abun %>%
  left_join(PresenceSum, by = 'otu') %>%
  transmute(otu = otu, rank = Index) %>%
  arrange(desc(rank))

# Calculating the contribution of ranked OTUs to the BC similarity
BCaddition <- NULL

# calculating BC dissimilarity based on the 1st ranked OTU
otu_start = otu_ranked$otu[1]
start_matrix <- as.matrix(otu[otu_start, ])
start_matrix <- t(start_matrix)
x <- apply(combn(ncol(start_matrix), 2), 2,
           function(x) sum(abs(start_matrix[, x[1]] - start_matrix[, x[2]]))/(2*nReads))
x_names <- apply(combn(ncol(start_matrix), 2), 2,
                 function(x) paste(colnames(start_matrix)[x], collapse = ' - '))
df_s <- data.frame(x_names, x)
names(df_s)[2] <- 1
BCaddition <- rbind(BCaddition, df_s)
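# Optional check (a minimal sketch, not required for the analysis): because every
# sample is rarefied to nReads, sum(|x_i - y_i|)/(2*nReads) over all OTUs equals the
# Bray-Curtis dissimilarity between two samples, so the terms computed above and in
# the loop below are additive per-OTU contributions to it. Comparing the manual
# formula with vegan::vegdist for the first two samples illustrates this (assumes
# `otu` has samples as columns, as constructed above).
bc_manual <- sum(abs(otu[, 1] - otu[, 2]))/(2*nReads)
bc_vegan  <- as.numeric(vegdist(t(otu[, 1:2]), method = "bray"))
all.equal(bc_manual, bc_vegan)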
# calculating BC dissimilarity based on the addition of ranked OTUs from the 2nd to the 500th.
# Can be set to the entire length of OTUs in the dataset; however, it might take some time
# if more than 5000 OTUs are included.
for(i in 2:500){
  otu_add = otu_ranked$otu[i]
  add_matrix <- as.matrix(otu[otu_add, ])
  add_matrix <- t(add_matrix)
  start_matrix <- rbind(start_matrix, add_matrix)
  x <- apply(combn(ncol(start_matrix), 2), 2,
             function(x) sum(abs(start_matrix[, x[1]] - start_matrix[, x[2]]))/(2*nReads))
  x_names <- apply(combn(ncol(start_matrix), 2), 2,
                   function(x) paste(colnames(start_matrix)[x], collapse = ' - '))
  df_a <- data.frame(x_names, x)
  names(df_a)[2] <- i
  BCaddition <- left_join(BCaddition, df_a, by = c('x_names'))
}

# calculating the BC dissimilarity of the whole dataset
# (not needed if the loop above already includes all OTUs)
x <- apply(combn(ncol(otu), 2), 2, function(x) sum(abs(otu[, x[1]] - otu[, x[2]]))/(2*nReads))
x_names <- apply(combn(ncol(otu), 2), 2, function(x) paste(colnames(otu)[x], collapse = ' - '))
df_full <- data.frame(x_names, x)
names(df_full)[2] <- length(rownames(otu))
BCfull <- left_join(BCaddition, df_full, by = 'x_names')

rownames(BCfull) <- BCfull$x_names
temp_BC <- BCfull
temp_BC$x_names <- NULL
temp_BC_matrix <- as.matrix(temp_BC)

BC_ranked <- data.frame(rank = as.factor(row.names(t(temp_BC_matrix))), t(temp_BC_matrix)) %>%
  gather(comparison, BC, -rank) %>%
  group_by(rank) %>%
  summarise(MeanBC = mean(BC)) %>%            # mean Bray-Curtis dissimilarity
  arrange(desc(-MeanBC)) %>%
  mutate(proportionBC = MeanBC/max(MeanBC))   # proportion of the dissimilarity explained by the n ranked OTUs

Increase = BC_ranked$MeanBC[-1]/BC_ranked$MeanBC[-length(BC_ranked$MeanBC)]
increaseDF <- data.frame(IncreaseBC = c(0, (Increase)), rank = factor(c(1:(length(Increase) + 1))))
BC_ranked <- left_join(BC_ranked, increaseDF)
BC_ranked <- BC_ranked[-nrow(BC_ranked), ]

# Creating thresholds for core inclusion
# Method:
# A) Elbow method (first order difference)
#    (script modified from https://pommevilla.github.io/random/elbows.html)
fo_difference <- function(pos){
  left <- (BC_ranked[pos, 2] - BC_ranked[1, 2]) / pos
  right <- (BC_ranked[nrow(BC_ranked), 2] - BC_ranked[pos, 2]) / (nrow(BC_ranked) - pos)
  return(left - right)
}
BC_ranked$fo_diffs <- sapply(1:nrow(BC_ranked), fo_difference)
elbow <- which.max(BC_ranked$fo_diffs)

# A2) Visual elbow method
elbowvis1 <- 15
elbowvis2 <- 64

# B) Final increase in BC similarity equal to or greater than 2%
lastCall <- last(as.numeric(BC_ranked$rank[(BC_ranked$IncreaseBC >= 1.02)]))

# Creating plot of Bray-Curtis similarity
ggplot(BC_ranked[1:100, ], aes(x = factor(BC_ranked$rank[1:100], levels = BC_ranked$rank[1:100]))) +
  geom_point(aes(y = proportionBC)) +
  theme_classic() +
  theme(strip.background = element_blank(), axis.text.x = element_text(size = 7, angle = 45)) +
  geom_vline(xintercept = elbow, lty = 3, col = 'red', cex = .5) +
  geom_vline(xintercept = elbowvis1, lty = 3, col = 'darkgreen', cex = .5) +
  geom_vline(xintercept = elbowvis2, lty = 3, col = 'darkolivegreen3', cex = .5) +
  geom_vline(xintercept = lastCall, lty = 3, col = 'blue', cex = .5) +
  labs(x = 'ranked OTUs', y = 'Bray-Curtis similarity') +
  annotate(geom = "text", x = elbow + 9, y = .07,
           label = paste("Elbow method", " (", elbow, ")", sep = ''), color = "red") +
  annotate(geom = "text", x = elbowvis1 + 9, y = .12,
           label = paste("Visual elbow 1", " (", elbowvis1, ")", sep = ''), color = "darkgreen") +
  annotate(geom = "text", x = elbowvis2 + 9, y = .12,
           label = paste("Visual elbow 2", " (", elbowvis2, ")", sep = ''), color = "darkolivegreen3") +
  annotate(geom = "text", x = lastCall + 3, y = .5,
           label = paste("Last 2% increase (", lastCall, ")", sep = ''), color = "blue")
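# Illustration (hedged sketch, separate from the analysis objects above): the
# first-order-difference elbow criterion scores each position by the difference
# between the mean slope to its left and to its right; the maximum marks the
# sharpest bend of the curve. A toy vector makes the behaviour easy to inspect.
toy <- c(1, 3, 4.5, 5.5, 6, 6.2, 6.3, 6.35)
toy_fo <- sapply(seq_along(toy), function(pos){
  left  <- (toy[pos] - toy[1])/pos
  right <- (toy[length(toy)] - toy[pos])/(length(toy) - pos)
  left - right
})
which.max(toy_fo)   # position of the sharpest bend in the toy curve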
# Creating occupancy-abundance plot
occ_abun$fill <- 'no'
occ_abun$fill[occ_abun$otu %in% otu_ranked$otu[1:lastCall]] <- 'core'
#occ_abun$fill[occ_abun$otu %in% otu_ranked$otu[1:elbowvis1]] <- 'core'

# Fitting the neutral model (Burns et al., 2016, ISME J - functions are in sncm.fit.R)
#-----------------------------------
# Adam Burns - 2/10/2015
# From Burns et al. "Contribution of neutral processes to the assembly of gut microbial
# communities changes over host development"
# Fits the neutral model from Sloan et al. 2006 to an OTU table and returns several
# fitting statistics. Alternatively, returns predicted occurrence frequencies for each
# OTU based on its abundance in the metacommunity when stats=FALSE.
# spp:   A community table for communities of interest with local communities/samples as
#        rows and taxa as columns. All samples must be rarefied to the same depth.
# pool:  A community table for defining the source community (optional; default = NULL).
# taxon: A table listing the taxonomic calls for each OTU, with OTU ids as row names and
#        taxonomic classifications as columns.
# If stats=TRUE the function returns fitting statistics.
# If stats=FALSE the function returns a table of observed and predicted values for each OTU.
sncm.fit <- function(spp, pool = NULL, stats = TRUE, taxon = NULL){
  require(minpack.lm)
  require(Hmisc)
  require(stats4)
  options(warn = -1)

  # Calculate the number of individuals per community
  N <- mean(apply(spp, 1, sum))

  # Calculate the average relative abundance of each taxon across communities
  if(is.null(pool)){
    p.m <- apply(spp, 2, mean)
    p.m <- p.m[p.m != 0]
    p <- p.m/N
  } else {
    p.m <- apply(pool, 2, mean)
    p.m <- p.m[p.m != 0]
    p <- p.m/N
  }

  # Calculate the occurrence frequency of each taxon across communities
  spp.bi <- 1*(spp > 0)
  freq <- apply(spp.bi, 2, mean)
  freq <- freq[freq != 0]

  # Combine
  C <- merge(p, freq, by = 0)
  C <- C[order(C[, 2]), ]
  C <- as.data.frame(C)
  C.0 <- C[!(apply(C, 1, function(y) any(y == 0))), ]  # removes rows with any zero (absent in either the source pool or the local communities)
  p <- C.0[, 2]
  freq <- C.0[, 3]
  names(p) <- C.0[, 1]
  names(freq) <- C.0[, 1]

  # Calculate the limit of detection
  d = 1/N

  ## Fit model parameter m (or Nm) using non-linear least squares (NLS)
  m.fit <- nlsLM(freq ~ pbeta(d, N*m*p, N*m*(1-p), lower.tail = FALSE), start = list(m = 0.1))
  m.ci <- confint(m.fit, 'm', level = 0.95)

  ## Fit neutral model parameter m (or Nm) using maximum likelihood estimation (MLE)
  sncm.LL <- function(m, sigma){
    R = freq - pbeta(d, N*m*p, N*m*(1-p), lower.tail = FALSE)
    R = dnorm(R, 0, sigma)
    -sum(log(R))
  }
  m.mle <- mle(sncm.LL, start = list(m = 0.1, sigma = 0.1), nobs = length(p))

  ## Calculate Akaike's Information Criterion (AIC)
  aic.fit <- AIC(m.mle, k = 2)
  bic.fit <- BIC(m.mle)

  ## Calculate goodness of fit (R-squared and root mean squared error)
  freq.pred <- pbeta(d, N*coef(m.fit)*p, N*coef(m.fit)*(1-p), lower.tail = FALSE)
  Rsqr <- 1 - (sum((freq - freq.pred)^2))/(sum((freq - mean(freq))^2))
  RMSE <- sqrt(sum((freq - freq.pred)^2)/(length(freq) - 1))
  pred.ci <- binconf(freq.pred*nrow(spp), nrow(spp), alpha = 0.05, method = "wilson", return.df = TRUE)

  ## Calculate AIC for the binomial model
  bino.LL <- function(mu, sigma){
    R = freq - pbinom(d, N, p, lower.tail = FALSE)
    R = dnorm(R, mu, sigma)
    -sum(log(R))
  }
  bino.mle <- mle(bino.LL, start = list(mu = 0, sigma = 0.1), nobs = length(p))
  aic.bino <- AIC(bino.mle, k = 2)
  bic.bino <- BIC(bino.mle)

  ## Goodness of fit for the binomial model
  bino.pred <- pbinom(d, N, p, lower.tail = FALSE)
  Rsqr.bino <- 1 - (sum((freq - bino.pred)^2))/(sum((freq - mean(freq))^2))
  RMSE.bino <- sqrt(sum((freq - bino.pred)^2)/(length(freq) - 1))
  bino.pred.ci <- binconf(bino.pred*nrow(spp), nrow(spp), alpha = 0.05, method = "wilson", return.df = TRUE)

  ## Calculate AIC for the Poisson model
  pois.LL <- function(mu, sigma){
    R = freq - ppois(d, N*p, lower.tail = FALSE)
    R = dnorm(R, mu, sigma)
    -sum(log(R))
  }
  pois.mle <- mle(pois.LL, start = list(mu = 0, sigma = 0.1), nobs = length(p))
  aic.pois <- AIC(pois.mle, k = 2)
  bic.pois <- BIC(pois.mle)

  ## Goodness of fit for the Poisson model
  pois.pred <- ppois(d, N*p, lower.tail = FALSE)
  Rsqr.pois <- 1 - (sum((freq - pois.pred)^2))/(sum((freq - mean(freq))^2))
  RMSE.pois <- sqrt(sum((freq - pois.pred)^2)/(length(freq) - 1))
  pois.pred.ci <- binconf(pois.pred*nrow(spp), nrow(spp), alpha = 0.05, method = "wilson", return.df = TRUE)

  ## Results
  if(stats == TRUE){
    fitstats <- data.frame(m = numeric(), m.ci = numeric(), m.mle = numeric(), maxLL = numeric(),
                           binoLL = numeric(), poisLL = numeric(), Rsqr = numeric(),
                           Rsqr.bino = numeric(), Rsqr.pois = numeric(), RMSE = numeric(),
                           RMSE.bino = numeric(), RMSE.pois = numeric(), AIC = numeric(),
                           BIC = numeric(), AIC.bino = numeric(), BIC.bino = numeric(),
                           AIC.pois = numeric(), BIC.pois = numeric(), N = numeric(),
                           Samples = numeric(), Richness = numeric(), Detect = numeric())
    fitstats[1, ] <- c(coef(m.fit), coef(m.fit) - m.ci[1], m.mle@coef['m'], m.mle@details$value,
                       bino.mle@details$value, pois.mle@details$value, Rsqr, Rsqr.bino, Rsqr.pois,
                       RMSE, RMSE.bino, RMSE.pois, aic.fit, bic.fit, aic.bino, bic.bino,
                       aic.pois, bic.pois, N, nrow(spp), length(p), d)
    return(fitstats)
  } else {
    A <- cbind(p, freq, freq.pred, pred.ci[, 2:3], bino.pred, bino.pred.ci[, 2:3])
    A <- as.data.frame(A)
    colnames(A) <- c('p', 'freq', 'freq.pred', 'pred.lwr', 'pred.upr', 'bino.pred', 'bino.lwr', 'bino.upr')
    if(is.null(taxon)){
      B <- A[order(A[, 1]), ]
    } else {
      B <- merge(A, taxon, by = 0, all = TRUE)
      row.names(B) <- B[, 1]
      B <- B[, -1]
      B <- B[order(B[, 1]), ]
    }
    return(B)
  }
}
#-----------------------------------

spp = t(otu)
taxon = as.vector(rownames(otu))

library(stats4)

# Models for the whole community
obs.np = sncm.fit(spp, taxon, stats = FALSE, pool = NULL)
sta.np = sncm.fit(spp, taxon, stats = TRUE, pool = NULL)
sta.np$m.mle
sta.np$Rsqr

above.pred = sum(obs.np$freq > (obs.np$pred.upr), na.rm = TRUE)/sta.np$Richness
below.pred = sum(obs.np$freq < (obs.np$pred.lwr), na.rm = TRUE)/sta.np$Richness
ap = obs.np$freq > (obs.np$pred.upr)
bp = obs.np$freq < (obs.np$pred.lwr)

ggplot() +
  geom_point(data = occ_abun[occ_abun$fill == 'no', ], aes(x = log10(otu_rel), y = otu_occ),
             pch = 21, fill = 'white', alpha = .2) +
  geom_point(data = occ_abun[occ_abun$fill != 'no', ], aes(x = log10(otu_rel), y = otu_occ),
             pch = 21, fill = 'blue', size = 1.8) +
  #geom_point(data=occ_abun, aes(x=log10(otu_rel), y=otu_occ), pch=21, fill='white', alpha=.2)+
  geom_line(color = 'black', data = obs.np, size = 1,
            aes(y = obs.np$freq.pred, x = log10(obs.np$p)), alpha = .25) +
  geom_line(color = 'black', lty = 'twodash', size = 1, data = obs.np,
            aes(y = obs.np$pred.upr, x = log10(obs.np$p)), alpha = .25) +
  geom_line(color = 'black', lty = 'twodash', size = 1, data = obs.np,
            aes(y = obs.np$pred.lwr, x = log10(obs.np$p)), alpha = .25) +
  labs(x = "log10(mean relative abundance)", y = "Occupancy")
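# Optional summary (a sketch, assuming ap/bp above flag taxa above/below the 95%
# prediction interval of the neutral fit): tabulating the three partitions makes the
# above.pred/below.pred fractions explicit; fractions are taken over the fitted
# richness, as in above.pred and below.pred.
neutral_partition <- data.frame(
  partition = c("above prediction", "below prediction", "within prediction"),
  n_taxa = c(sum(ap, na.rm = TRUE),
             sum(bp, na.rm = TRUE),
             sum(!ap & !bp, na.rm = TRUE))
)
neutral_partition$fraction <- neutral_partition$n_taxa/sta.np$Richness
neutral_partition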
# Creating a plot of core taxa occupancy by plant sample
core <- occ_abun$otu[occ_abun$fill == 'core']

coretaxa <- as_tibble(core) %>%
  rename(taxon_id = value) %>%
  left_join(runf$taxa) %>%
  mutate(core = "core")

otu_relabun <- decostand(otu, method = "total", MARGIN = 2)

plotDF <- data.frame(otu = as.factor(row.names(otu_relabun)), otu_relabun) %>%
  gather(sample_id, relabun, -otu) %>%
  left_join(map, by = 'sample_id') %>%
  left_join(otu_ranked, by = 'otu') %>%
  filter(otu %in% core) %>%
  group_by(otu, Plant_sample) %>%
  summarise(site_freq = sum(relabun > 0)/length(relabun),
            coreSite = ifelse(site_freq == 1, 1, 0),
            detect = ifelse(site_freq > 0, 1, 0))

plotDF$otu <- factor(plotDF$otu, levels = otu_ranked$otu[1:34])

ggplot(plotDF, aes(x = otu, site_freq, fill = factor(Plant_sample))) +
  geom_bar(stat = 'identity', position = 'dodge') +
  coord_flip() +
  scale_x_discrete(limits = rev(levels(plotDF$otu))) +
  theme(axis.text = element_text(size = 6)) +
  labs(x = 'Ranked OTUs', y = 'Occupancy by site')

tax <- left_join(runf$taxa, coretaxa) %>%
  select(family, taxon_name, ASV_id, total_rel_abundance, rel_occurrence, core) %>%
  arrange(desc(total_rel_abundance)) %>%
  arrange(core)

taxc <- tax %>%
  filter(core == "core")

write.table(taxc, file = "coreASV.txt", sep = "\t", row.names = FALSE)

#############
#----------------------------------
# For the top and bottom leaf sides separately:
runb <- runf %>% filter_samples(Side == "Bottom")
runt <- runf %>% filter_samples(Side == "Top")

# Top
otu <- t(as_abundances_matrix(runt$abundances))
map <- runt$samples

library(vegan)

otu_PA <- 1*((otu > 0) == 1)                                              # presence-absence data
otu_occt <- rowSums(otu_PA)/ncol(otu_PA)                                  # occupancy calculation
otu_relt <- apply(decostand(otu, method = "total", MARGIN = 2), 1, mean)  # mean relative abundance
occ_abunt <- add_rownames(as.data.frame(cbind(otu_occt, otu_relt)), 'otu')

otulist <- runt$taxa %>%
  select(otu = taxon_id, taxon_name) %>%
  left_join(tax) %>%
  select(otu, taxon_name, fill = core) %>%
  replace_na(list(fill = "no"))
occ_abunt <- left_join(occ_abunt, otulist, by = "otu")

spp = t(otu)
taxon = as.vector(rownames(otu))

library(stats4)

# Models for the top leaf side
obs.npt = sncm.fit(spp, taxon, stats = FALSE, pool = NULL)
sta.npt = sncm.fit(spp, taxon, stats = TRUE, pool = NULL)
sta.npt$m.mle
sta.npt$Rsqr

above.predt = sum(obs.npt$freq > (obs.npt$pred.upr), na.rm = TRUE)/sta.npt$Richness
below.predt = sum(obs.npt$freq < (obs.npt$pred.lwr), na.rm = TRUE)/sta.npt$Richness
ap = obs.npt$freq > (obs.npt$pred.upr)
bp = obs.npt$freq < (obs.npt$pred.lwr)
above.predt
below.predt

ggplot() +
  geom_point(data = occ_abunt[occ_abunt$fill == 'no', ], aes(x = log10(otu_relt), y = otu_occt),
             pch = 21, fill = 'white', alpha = .2, size = 1.2) +
  geom_point(data = occ_abunt[occ_abunt$fill != 'no', ], aes(x = log10(otu_relt), y = otu_occt),
             pch = 21, fill = 'blue', size = 1) +
  #geom_point(data=occ_abunt, aes(x=log10(otu_relt), y=otu_occt), pch=21, fill='white', alpha=.2)+
  geom_line(color = 'darkred', data = obs.npt, size = 1,
            aes(y = obs.npt$freq.pred, x = log10(obs.npt$p)), alpha = .75) +
  geom_line(color = 'darkred', lty = 'twodash', size = 1, data = obs.npt,
            aes(y = obs.npt$pred.upr, x = log10(obs.npt$p)), alpha = .75) +
  geom_line(color = 'darkred', lty = 'twodash', size = 1, data = obs.npt,
            aes(y = obs.npt$pred.lwr, x = log10(obs.npt$p)), alpha = .75) +
  labs(x = "log10(mean relative abundance)", y = "Occupancy")
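# Quick cross-check (a sketch; `fill` in occ_abunt carries the whole-dataset core
# labels joined in above): counts how many ASVs in the top-side table were flagged
# as core versus non-core in the plot above.
table(occ_abunt$fill)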
# Bottom
otu <- t(as_abundances_matrix(runb$abundances))
map <- runb$samples

otu_PA <- 1*((otu > 0) == 1)                                              # presence-absence data
otu_occb <- rowSums(otu_PA)/ncol(otu_PA)                                  # occupancy calculation
otu_relb <- apply(decostand(otu, method = "total", MARGIN = 2), 1, mean)  # mean relative abundance
occ_abunb <- add_rownames(as.data.frame(cbind(otu_occb, otu_relb)), 'otu')

otulist <- runb$taxa %>%
  select(otu = taxon_id, taxon_name) %>%
  left_join(tax) %>%
  select(otu, taxon_name, fill = core) %>%
  replace_na(list(fill = "no"))
occ_abunb <- left_join(occ_abunb, otulist, by = "otu")

spp = t(otu)
taxon = as.vector(rownames(otu))

# Models for the bottom leaf side
obs.npb = sncm.fit(spp, taxon, stats = FALSE, pool = NULL)
sta.npb = sncm.fit(spp, taxon, stats = TRUE, pool = NULL)
sta.npb$m.mle
sta.npb$Rsqr

above.predb = sum(obs.npb$freq > (obs.npb$pred.upr), na.rm = TRUE)/sta.npb$Richness
below.predb = sum(obs.npb$freq < (obs.npb$pred.lwr), na.rm = TRUE)/sta.npb$Richness
ap = obs.npb$freq > (obs.npb$pred.upr)
bp = obs.npb$freq < (obs.npb$pred.lwr)
above.predb
below.predb

ggplot() +
  geom_point(data = occ_abunb[occ_abunb$fill == 'no', ], aes(x = log10(otu_relb), y = otu_occb),
             pch = 21, fill = 'white', alpha = .2, size = 1.2) +
  geom_point(data = occ_abunb[occ_abunb$fill != 'no', ], aes(x = log10(otu_relb), y = otu_occb),
             pch = 21, fill = 'blue', size = 1) +
  #geom_point(data=occ_abunb, aes(x=log10(otu_relb), y=otu_occb), pch=21, fill='white', alpha=.2)+
  geom_line(color = '#003300', data = obs.npb, size = 1,
            aes(y = obs.npb$freq.pred, x = log10(obs.npb$p)), alpha = .75) +
  geom_line(color = '#003300', lty = 'twodash', size = 1, data = obs.npb,
            aes(y = obs.npb$pred.upr, x = log10(obs.npb$p)), alpha = .75) +
  geom_line(color = '#003300', lty = 'twodash', size = 1, data = obs.npb,
            aes(y = obs.npb$pred.lwr, x = log10(obs.npb$p)), alpha = .75) +
  labs(x = "log10(mean relative abundance)", y = "Occupancy")

# Combined plot of the neutral-model fits for the top and bottom leaf sides
ggplot() +
  #geom_point(data=occ_abun, aes(x=log10(otu_rel), y=otu_occ), pch=21, fill='white', alpha=.2)+
  #geom_line(color='black', data=obs.np, size=1, aes(y=obs.np$freq.pred, x=log10(obs.np$p)), alpha=.75) +
  #geom_line(color='black', lty='twodash', size=1, data=obs.np, aes(y=obs.np$pred.upr, x=log10(obs.np$p)), alpha=.75)+
  #geom_line(color='black', lty='twodash', size=1, data=obs.np, aes(y=obs.np$pred.lwr, x=log10(obs.np$p)), alpha=.75)+
  geom_line(color = '#003300', data = obs.npb, size = 0.8,
            aes(y = obs.npb$freq.pred, x = log10(obs.npb$p)), alpha = .75) +
  geom_line(color = '#003300', lty = 'twodash', size = 0.8, data = obs.npb,
            aes(y = obs.npb$pred.upr, x = log10(obs.npb$p)), alpha = .75) +
  geom_line(color = '#003300', lty = 'twodash', size = 0.8, data = obs.npb,
            aes(y = obs.npb$pred.lwr, x = log10(obs.npb$p)), alpha = .75) +
  geom_line(color = 'darkred', data = obs.npt, size = 0.8,
            aes(y = obs.npt$freq.pred, x = log10(obs.npt$p)), alpha = .75) +
  geom_line(color = 'darkred', lty = 'twodash', size = 0.8, data = obs.npt,
            aes(y = obs.npt$pred.upr, x = log10(obs.npt$p)), alpha = .75) +
  geom_line(color = 'darkred', lty = 'twodash', size = 0.8, data = obs.npt,
            aes(y = obs.npt$pred.lwr, x = log10(obs.npt$p)), alpha = .75) +
  labs(x = "log10(mean relative abundance)", y = "Occupancy")
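# Optional side-by-side summary (a sketch, assuming sta.np, sta.npt and sta.npb were
# all fitted above): collects the estimated migration rate m and the goodness of fit
# R^2 of the three neutral-model fits in one small table for easier comparison.
fit_summary <- data.frame(
  community = c("all samples", "top side", "bottom side"),
  m         = c(sta.np$m, sta.npt$m, sta.npb$m),
  Rsqr      = c(sta.np$Rsqr, sta.npt$Rsqr, sta.npb$Rsqr)
)
fit_summary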