<!--
 * @Descripttion: Parasitic taxa are key to the vertical stratification and community variation of pelagic ciliates from the surface to the abyssopelagic zone
 * @Author: Yuanyuan Wan
 * @Date: 2023-11-24 09:19:35
 * @LastEditors: Yuanyuan Wan
 * @LastEditTime: 2023-11-27 14:42:34
-->
# Parasitic taxa are key to the vertical stratification and community variation of pelagic ciliates from the surface to the abyssopelagic zone
## Wan et al. "Parasitic taxa are key to the vertical stratification and community variation of pelagic ciliates from the surface to the abyssopelagic zone"

This document contains the statistical analyses conducted in R (v.4.1.3) for the manuscript. Please note that raw sequencing data processing, ASV table generation, and the alpha diversity, beta diversity, and taxonomic analyses were carried out with the EasyAmplicon pipeline (https://github.com/YongxinLiu/EasyAmplicon). 

## Bray-Curtis (dis)similarity
```r
# load package
library(vegan)
# read the abundance table and transpose it so that the file's columns become rows;
# vegdist() computes dissimilarities between rows
spe <- data.frame(t(read.delim("otutab.txt", sep = '\t', row.names = 1, check.names = FALSE)))
# pairwise Bray-Curtis dissimilarity, expanded from a `dist` object to a full square matrix
comm_dis <- as.matrix(vegdist(spe, method = "bray"))
# Bray-Curtis similarity is the complement of the dissimilarity
comm_sim <- 1 - comm_dis
```

## Clustering
```r
# load packages
pacman::p_load(vegan, phangorn, factoextra, eclust)
# read the abundance table and transpose it so that the file's columns become rows
data <- t(read.table("otutab.txt", header = TRUE, row.names = 1))

# UPGMA tree built from pairwise Bray-Curtis dissimilarities
up <- upgma(vegdist(data, method = "bray"))
plot(up)

# hierarchical clustering; dist() and hclust() are called with their defaults
hc <- hclust(dist(data))
plot(hc)
plot(hc, hang = -1) # a negative `hang` makes all labels hang down from 0
rect.hclust(hc, k = 2) # draw boxes around the clusters for readability; k is the number of clusters to outline

# k-means: pick k from the silhouette plot, then cluster with that k
fviz_nbclust(data, kmeans, method = "silhouette")
km.res <- kmeans(data, centers = 2) # centers = k, the optimal number of clusters shown in the previous step
fviz_cluster(km.res, data = data)
# if this fails with "cannot rescale a constant/zero column to unit variance",
# drop the zero-variance columns by running the line below instead
# fviz_cluster(km.res, data[ , which(apply(data, 2, var) != 0)])
```

## Distance-decay
```r
# load packages
pacman::p_load(vegan, geosphere)
# import site data
site <- read.delim('site.txt', sep = '\t', row.names = 1, check.names = FALSE)
# distm() returns metres; dividing by 1000 gives kilometres
site_dis <- geosphere::distm(site[c("Longitude", "Latitude")]) / 1000
rownames(site_dis) <- rownames(site)
colnames(site_dis) <- rownames(site)
# remove only the self-pairs on the diagonal; filtering on `value != 0` would also
# discard distinct samples that happen to share the same coordinates
diag(site_dis) <- NA
site_dis <- reshape2::melt(site_dis, na.rm = TRUE)
head(site_dis)
# import community data
spe <- read.delim("otutab.txt", sep = '\t', row.names = 1, check.names = FALSE)
spe <- data.frame(t(spe))
comm_sim <- 1 - as.matrix(vegan::vegdist(spe, method = "bray"))
# keep every sample pair once (strict lower triangle). Masking with NA rather than 0
# keeps pairs whose similarity is genuinely 0 (no shared taxa)
comm_sim[upper.tri(comm_sim, diag = TRUE)] <- NA
comm_sim <- reshape2::melt(comm_sim, na.rm = TRUE)
head(comm_sim)
# merge distance and similarity results
comm_dis <- merge(comm_sim, site_dis, by = c('Var1', 'Var2'))
names(comm_dis) <- c('site1', 'site2', 'comm_sim', 'site_dis/km')
head(comm_dis)
# write output file
write.table(comm_dis, 'comm_dis.txt', sep = '\t', row.names = FALSE, quote = FALSE)
# construct distance-decay linear model
# check.names = FALSE keeps the 'site_dis/km' header as written above
comm_dis <- read.delim('comm_dis.txt', sep = '\t', check.names = FALSE)
# the stored distance is already in km, so it is copied without further rescaling
comm_dis$site_dis_km <- comm_dis[['site_dis/km']]
fit <- lm(comm_sim ~ site_dis_km, data = comm_dis)
summary(fit)
```

## Mantel test
```r
# load packages
pacman::p_load(ecodist, vegan, geosphere)
# import data
spe <- read.table("otutab.txt", header = TRUE, row.names = 1) # otu/asv table
spe <- t(spe)
group <- read.table("group.txt", header = TRUE, row.names = 1) # group info
env <- read.table("env.txt", header = TRUE, row.names = 1) # environmental table
# The Mantel tests below pair the distance matrices entry by entry, so the three
# tables must list the same samples in the same order. Warn (without stopping)
# when the row names do not line up.
if (!identical(rownames(spe), rownames(env))) {
  warning("Row names of 'otutab.txt' (after transposing) and 'env.txt' differ; ",
          "check that both list the same samples in the same order", call. = FALSE)
}
if (!identical(rownames(spe), rownames(group))) {
  warning("Row names of 'otutab.txt' (after transposing) and 'group.txt' differ; ",
          "check that both list the same samples in the same order", call. = FALSE)
}
# calculate matrices
spe.dist <- vegdist(spe, method = "bray") # Bray-Curtis matrix
env.dist <- vegdist(env, method = "euclidean") # environmental matrix (Euclidean, all columns of env)
# Euclidean distance on the single column `Tur`; the tests further down use the
# geographic matrix `dist.geo`, not this object
ocean.dist <- vegdist(env$Tur, method = "euclidean")
## Mantel Test: Dissimilarity vs. Environmental Distance
env.mantel <- vegan::mantel(spe.dist, env.dist, method = "pearson", permutations = 999)
env.mantel
# Single Mantel Test
# Run one Mantel test per environmental variable against the community matrix.
#
# Arguments:
#   spe          community table passed to vegan::vegdist() (one row per sample)
#   env          data frame of environmental variables; each column is tested on
#                its own using Euclidean distance
#   sim.method   dissimilarity index for `spe`, passed to vegdist(method = )
#   correlation  correlation method passed to vegan::mantel(method = )
#   p.adjust.m   multiple-testing correction passed to p.adjust(method = )
#   permutations number of permutations for each Mantel test (default 999)
#
# Returns a data frame with one row per column of `env`:
#   factor (column name), r (Mantel statistic), p.value, p.adjusted.
# A zero-column `env` yields a zero-row result instead of an error.
single.mantel <- function(spe, env, sim.method, correlation, p.adjust.m,
                          permutations = 999) {
  # the community distance matrix is the same for every variable: compute it once
  spe.dist <- vegan::vegdist(spe, method = sim.method)

  vars <- colnames(env)
  n_vars <- length(vars)
  r <- numeric(n_vars)
  p.value <- numeric(n_vars)

  # seq_len() (not 1:n) so that nothing runs when env has no columns
  for (i in seq_len(n_vars)) {
    env.dist <- vegan::vegdist(env[i], method = "euclidean")
    fit <- vegan::mantel(spe.dist, env.dist,
                         method = correlation, permutations = permutations)
    r[i] <- fit$statistic
    p.value[i] <- fit$signif
  }

  data.frame(
    factor = as.character(vars),
    r = r,
    p.value = p.value,
    p.adjusted = p.adjust(p.value, method = p.adjust.m)
  )
}

# one Mantel test per environmental variable, with FDR-adjusted p-values
sin.mantel <- single.mantel(
  spe, env,
  sim.method = "bray",
  correlation = "pearson",
  p.adjust.m = "fdr"
)
sin.mantel

## Mantel Test: Dissimilarity vs. Oceanic Distance
# pairwise Haversine distances between the sample coordinates stored in `group`
geo <- data.frame(group$Longitude, group$Latitude)
d.geo <- distm(geo, fun = distHaversine)
dist.geo <- as.dist(d.geo)
ocean.mantel <- vegan::mantel(
  spe.dist, dist.geo,
  method = "pearson",
  permutations = 999
)
ocean.mantel

## Partial Mantel
# Dissimilarity vs. environmental distance (controlling for oceanic distance)
mantel.partial(
  spe.dist, env.dist, dist.geo,
  method = "pearson",
  permutations = 999
)
# Dissimilarity vs. oceanic distance (controlling for environment)
mantel.partial(
  spe.dist, dist.geo, env.dist,
  method = "pearson",
  permutations = 999
)

## MRM
# multiple regression on distance matrices; the result is stored under the same
# name as the function that produced it
MRM <- MRM(
  spe.dist ~ env.dist + dist.geo,
  nperm = 1000,
  method = "linear"
)
MRM


```

## Spearman correlation
```r
# load packages
pacman::p_load(psych, reshape2)
# import data
env <- read.delim('env.txt', sep = '\t', row.names = 1)
spe <- read.delim('otutab.txt', sep = '\t', row.names = 1)
# NOTE(review): rows of `spe` are selected by the row names of `env` and the table
# is not transposed here, so 'otutab.txt' is expected to have samples in rows for
# this section — confirm against the input file
spe <- spe[rownames(env), ]
# calculate spearman correlation between every env column and every spe column
spearman <- corr.test(env, spe, method = 'spearman', adjust = 'fdr')
r <- data.frame(spearman$r)
p <- data.frame(spearman$p)
r$env <- rownames(r)
p$env <- rownames(p)
# reshape both matrices to long format (one row per env/spe combination)
r <- melt(r, id = 'env')
p <- melt(p, id = 'env')
spearman <- cbind(r, p$value)
colnames(spearman) <- c('env', 'spe', 'spearman_correlation', 'p.value')
# keep the taxa in their original column order on the x axis
spearman$spe <- factor(spearman$spe, levels = colnames(spe))
head(spearman)
# visualization
library(ggplot2)
p1 <- ggplot() +
  geom_tile(data = spearman, aes(x = spe, y = env, fill = spearman_correlation)) +
  scale_fill_gradientn(colors = c('#2D6DB1', 'white', '#DC1623'), limits = c(-1, 1)) +
  theme(panel.grid = element_blank(), panel.background = element_rect(color = 'black'), legend.key = element_blank(), 
        axis.text.x = element_text(color = 'black', angle = 45, hjust = 1, vjust = 1), axis.text.y = element_text(color = 'black'), axis.ticks = element_line(color = 'black')) +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  labs(y = '', x = '', fill = 'Correlation')
p1

# significance marks: '*' p < 0.05, '**' p < 0.01, '***' p < 0.001.
# The column is created up front and the marks are assigned from the weakest to
# the strongest threshold, so values exactly equal to 0.01 or 0.001 still fall
# into a bin and the column exists even when nothing is significant.
spearman$sig <- NA_character_
spearman$sig[which(spearman$p.value < 0.05)] <- '*'
spearman$sig[which(spearman$p.value < 0.01)] <- '**'
spearman$sig[which(spearman$p.value < 0.001)] <- '***'

# na.rm = TRUE: tiles without a mark are skipped silently
p2 <- p1 +
  geom_text(data = spearman, aes(x = spe, y = env, label = sig), size = 3, na.rm = TRUE)
p2
```

## Normalized Stochasticity Ratios (NST)
```r
# load packages
pacman::p_load(NST, ape, iCAMP, picante)
# import data
otu <- t(read.table("oli.txt", row.names = 1, header = TRUE)) # otu/asv table, transposed after reading
group <- read.table("group.txt", row.names = 1, header = TRUE) # group info
# check whether the otu table corresponds to the metadata
samp.ck <- NST::match.name(rn.list = list(otu = otu, group = group))
group <- samp.ck$group
otu <- samp.ck$otu
# drop columns whose total is zero
otu <- otu[, colSums(otu) > 0, drop = FALSE]
# grouping and metacommunity setting: the first column of `group` defines the groups
groupi <- group[, 1, drop = FALSE]
prefix <- "oli"
prefixi <- paste0(prefix, ".Group")
# if treatment and control are from different metacommunities, you may set
# meta.groupi = groupi; the default is NULL
# meta.groupi = NULL
meta.groupi <- groupi

# calculate NST
# record running time
t1 <- Sys.time()
tnst <- tNST(
  comm = otu,
  group = groupi,
  meta.group = meta.groupi,
  meta.com = NULL,
  dist.method = "bray",
  abundance.weighted = TRUE,
  rand = 999,
  output.rand = TRUE,
  nworker = 8,
  LB = FALSE,
  null.model = "PF",
  between.group = TRUE,
  SES = TRUE,
  RC = TRUE
)
# save and write the results
save(tnst, file = paste0(prefixi, ".tNST.rda"))
write.table(tnst$index.grp, file = paste0(prefixi, ".tNST.summary.txt"),
            quote = FALSE, sep = "\t")
write.table(tnst$index.pair.grp, file = paste0(prefixi, ".tNST.pairwise.txt"),
            quote = FALSE, sep = "\t")
write.table(tnst$index.pair, file = paste0(prefixi, ".tNST.pairwise.index.txt"),
            quote = FALSE, sep = "\t")
write.table(tnst$index.between, file = paste0(prefixi, ".tNST.between.summary.txt"),
            quote = FALSE, sep = "\t")
write.table(tnst$index.pair.between, file = paste0(prefixi, ".tNST.pairwise.between.txt"),
            quote = FALSE, sep = "\t")
# elapsed time since t1
format(Sys.time() - t1)

# bootstrapping test on the tNST result
tnstbt <- nst.boot(
  nst.result = tnst,
  group = groupi,
  rand = 999,
  trace = TRUE,
  two.tail = FALSE,
  out.detail = TRUE,
  between.group = FALSE,
  nworker = 8
)
save(tnstbt, file = paste0(prefixi, ".tNST.boot.rda"))
write.table(tnstbt$summary, file = paste0(prefixi, ".tNST.boot.summary.txt"),
            quote = FALSE, sep = "\t")
write.table(tnstbt$compare, file = paste0(prefixi, ".tNST.boot.compare.txt"),
            quote = FALSE, sep = "\t")
# total elapsed time since t1, stored in `t` and printed
(t <- format(Sys.time() - t1))
```

## Neutral Community Model (NCM)
```r
pacman::p_load(Hmisc, minpack.lm, stats4)
# Fit the neutral community model with non-linear least squares (NLS) and compute R2.
# spp: community table read from 'otu.txt' and transposed, so that the rows of the
# file become the columns of `spp`
spp <- t(read.table('otu.txt', header = TRUE, row.names = 1))

# N: mean of the row totals of spp
N <- mean(apply(spp, 1, sum))
# p: column means divided by N, zero-mean columns removed
p.m <- apply(spp, 2, mean)
p.m <- p.m[p.m != 0]
p <- p.m / N
# freq: fraction of rows in which each column is non-zero, all-zero columns removed
spp.bi <- 1 * (spp > 0)
freq <- apply(spp.bi, 2, mean)
freq <- freq[freq != 0]
# join p and freq by name, sort by p and drop rows that contain a zero
C <- merge(p, freq, by = 0)
C <- C[order(C[, 2]), ]
C <- as.data.frame(C)
C.0 <- C[!(apply(C, 1, function(y) any(y == 0))), ]
p <- C.0[, 2]
freq <- C.0[, 3]
names(p) <- C.0[, 1]
names(freq) <- C.0[, 1]
d <- 1 / N

## Fit model parameter m (or Nm) using non-linear least squares (NLS)
m.fit <- nlsLM(
  freq ~ pbeta(d, N * m * p, N * m * (1 - p), lower.tail = FALSE),
  start = list(m = 0.1)
)
m.fit # get the m value
m.ci <- confint(m.fit, 'm', level = 0.95)
# frequencies predicted by the fitted model, with Wilson confidence intervals
freq.pred <- pbeta(d, N * coef(m.fit) * p, N * coef(m.fit) * (1 - p), lower.tail = FALSE)
pred.ci <- binconf(freq.pred * nrow(spp), nrow(spp), alpha = 0.05, method = "wilson", return.df = TRUE)
# R2 = 1 - residual sum of squares / total sum of squares
Rsqr <- 1 - (sum((freq - freq.pred)^2)) / (sum((freq - mean(freq))^2))
Rsqr # get the R2 value

# Optional: write 3 files: p.csv, freq.csv and freq.pred.csv
# write.csv(p, file = "d:/Work/WPO/result/NCM/D5_p.csv")
# write.csv(freq, file = "d:/Work/WPO/result/NCM/D5_freq.csv")
# write.csv(freq.pred, file = "d:/Work/WPO/result/NCM/D5_freq.pred.csv")
```

## Dispersal ability
```r
# load packages
pacman::p_load(tidyverse, geosphere, vegan)

# Obtain distance matrix
# site <- read.delim('site.txt', sep = '\t', row.names = 1, check.names = FALSE)
# site_dis <- geosphere::distm(site[c("Longitude", "Latitude")])/1000
# rownames(site_dis) <- rownames(site)
# colnames(site_dis) <- rownames(site)
# site_dis <- reshape2::melt(site_dis)
# site_dis <- subset(site_dis, value != 0)
# head(site_dis)
# 
# dis <- spread(site_dis, key = Var2, value = value)
# dis[is.na(dis)] <- 0
# write.table(dis, "dis.txt", sep = "\t", row.names = F)

# Import data
dis <- read.table('dis.txt', row.names = 1, header = TRUE)
# NOTE(review): if dis.txt was produced by the commented-out code above it is
# already in km, so this rescales it a second time — confirm this is intended
dis <- dis/1000
data <- read.csv('otutab_rare_D5.csv', header = TRUE, row.names = 1)
# Obtain subsets
# `last_row` and `cut_row` are column positions: the last six columns of `data`
# are removed from the selection below
last_row <- ncol(data)
last_row <- as.numeric(last_row)
cut_row <- last_row - 5

prefix <- 'Colpodea' #change taxa group
prefixi <- paste0(prefix,".D5") # change group name
spe <- subset(data, Class == prefix, select = -c(cut_row:last_row))
# Transform the OTU table to 1-0 format
spe[spe > 0] <- 1

# Calculate connectivity for each patch
patch <- names(spe)
n <- ncol(spe)
a <- nrow(spe)
# preallocate one connectivity value per patch
con <- numeric(length(patch))

for (idx in seq_along(patch)) {
  i <- patch[idx]
  ijk <- 0
  for (j in setdiff(patch, i)) {
    #exp_dij <- exp(-dis[i,j])  #active
    exp_dij <- 1 / exp(-dis[i,j])  #passive
    # m: number of taxa present in patch i or patch j
    m <- sum((spe[, i] + spe[, j]) > 0)
    # occurrences in patch j weighted by the distance term; summing the column
    # at once replaces the row-by-row loop and also works when `spe` has no rows
    jk <- sum(spe[, j]) * exp_dij
    ijk <- 1/m * 1/(n-1) * jk + ijk
  }
  con[idx] <- ijk
}

con <- data.frame(patch, con)
con
# patches whose connectivity is NA/NaN (e.g. m = 0 for some pair) are dropped
con <- na.omit(con)

# Calculate average connectivity for metacommunity
ave.con <- mean(con[ ,2])
ave.con
# append the average as an extra row labelled 'ave.con'
ac <- c('ave.con', ave.con)
output <- rbind(con, ac)

# Write the output
write.csv(output, paste0(prefixi, '.con.csv'), quote = FALSE)
```

## Correlations between subcommunity (Spirotrichea as example) and whole community
```r
# load packages
pacman::p_load(ggplot2, spaa, EcolUtils, vegan, permute, lattice, tidyverse, dplyr, ggpmisc, patchwork)
# calculating BC result
otu <- read.delim('spi.txt', row.names = 1)
otu <- t(otu)
dis <- vegan::vegdist(otu, method = 'bray')
dis <- as.matrix(dis)
write.table(dis, 'Bray-curtis.spi.txt', sep = '\t', col.names = NA, quote = FALSE)

dis <- read.delim('Bray-curtis.spi.txt', row.names = 1)
# group.txt must provide the columns `samples` and `group`
group <- read.delim('group.txt', stringsAsFactors = FALSE)

# depth layers, in the order used for the factor levels; the labels must match
# the values in the `group` column of group.txt
depth_levels <- c("D5", "D25", "DCM", "D200", "D300", "D500", "D750",
                  "D1000", "D2000", "D3000", "Bottom")

# for every layer: the pairwise Bray-Curtis values among that layer's samples
# (lower triangle of the within-layer sub-matrix, as a plain vector)
pair_dis <- lapply(depth_levels, function(layer) {
  samples <- group$samples[which(group$group == layer)]
  as.vector(as.dist(dis[samples, samples]))
})

dat <- data.frame(
  dis = unlist(pair_dis),
  group = factor(rep(depth_levels, times = lengths(pair_dis)), levels = depth_levels))

dat <- na.omit(dat)
write.table(dat, 'Bray-curtis_pair.spi.txt', sep = '\t', col.names = NA, quote = FALSE)

# 'Bray-curtis_pair.all.txt' is the same table computed for the whole community;
# it is not created in this section and must already exist
all <- read.table('Bray-curtis_pair.all.txt', header = TRUE, row.names = 1)
spi <- read.table('Bray-curtis_pair.spi.txt', header = TRUE, row.names = 1)

# making import data for linear models: one '<layer>.lm.txt' file per depth layer,
# pairing whole-community ('all') and subcommunity ('dis') values by position.
# Existing '<layer>.lm.txt' files are overwritten.
for (layer in depth_levels) {
  all.sub <- subset(all, group == layer)
  dis.all.sub <- all.sub$dis
  spi.sub <- subset(spi, group == layer)
  dis.spi.sub <- spi.sub$dis
  if (length(dis.all.sub) != length(dis.spi.sub)) {
    stop("Layer '", layer, "': ", length(dis.all.sub), " whole-community pairs but ",
         length(dis.spi.sub), " subcommunity pairs; the two tables cannot be paired",
         call. = FALSE)
  }
  dat <- data.frame(dis.all.sub, dis.spi.sub)
  colnames(dat) <- c('all', 'dis')
  write.table(dat, paste0(layer, '.lm.txt'), sep = '\t', quote = FALSE, row.names = FALSE, col.names = TRUE)
}

# lm
lm_d5 <- read.table('D5.lm.txt', header = TRUE)
lm_d25 <- read.table('D25.lm.txt', header = TRUE)
lm_dcm <- read.table('DCM.lm.txt', header = TRUE)
lm_d200 <- read.table('D200.lm.txt', header = TRUE)
lm_d300 <- read.table('D300.lm.txt', header = TRUE)
lm_d500 <- read.table('D500.lm.txt', header = TRUE)
lm_d750 <- read.table('D750.lm.txt', header = TRUE)
lm_d1000 <- read.table('D1000.lm.txt', header = TRUE)
lm_d2000 <- read.table('D2000.lm.txt', header = TRUE)
lm_d3000 <- read.table('D3000.lm.txt', header = TRUE)
lm_bot <- read.table('Bottom.lm.txt', header = TRUE)

# fit one of the tables above, e.g. the D5 layer:
# fit <- lm(all ~ dis, data = lm_d5)
# summary(fit) # to get R-squared value
```