### Load the packages
#install.packages("tidyr")
#install_github("jfq3/QsRutils", force = TRUE)
library(QsRutils)
#library(dada2)
library(DESeq2)
#library(phangorn)
#library(plotly)
#library(gplots)
library(decontam)
library(DescTools)
library(multtest)
library(foreach)
library(stringi)
library(Hmisc)
library(parallel)
library(iterators)
library(phyloseq)
library(ape)
library(ggplot2)
library(plyr)
library(gridExtra)
library(lsmeans)
library(multcompView)
library(vegan)
library(dplyr)
library(devtools)
library(metagMisc)
library(metagenomeSeq)
library(microbiome)
library(btools)
library(plyr)
library(reshape2)
library(theseus)
#library(phangorn)
library(microbiomeSeq)
library(tidyr)
#library(DECIPHER)

## Load the decontaminated dataset. The .RData file is expected to provide the
## phyloseq object `my_phyloseq_filt` that is used below.
load("/Users/francois-etiennesylvain/Documents/Doctorat/full_decont_phyloseq.RData")


## Remove singleton ASVs: keep only ASVs whose total count over all samples
## is greater than 1.
my_phyloseq_filt_no_single <- filter_taxa(
  my_phyloseq_filt,
  function(x) sum(x) > 1,
  prune = TRUE
)

## Add the Ecosystem variable
meta_my_phyloseq_filt_no_single <- sample_data(my_phyloseq_filt_no_single)
#write.table(meta_my_phyloseq_filt_no_single, file = "/Users/francois-etiennesylvain/Documents/Doctorat/meta_bacterioplankton_extract.txt") 
ecosystem_variable <- read.table(
  "/Users/francois-etiennesylvain/Documents/Doctorat/meta_bacterioplankton_extract_mod.txt",
  sep = "\t",
  header = TRUE
)
# The Ecosystem labels are read from the 11th column of the edited table.
ecosystem_variable1 <- ecosystem_variable[, 11]
# The labels are attached to the samples by position. Stop here if the table
# does not have exactly one row per sample: a shorter vector whose length
# divides the sample count would otherwise be recycled silently.
# NOTE(review): the rows of the edited file are assumed to be in the same
# order as the samples of the phyloseq object - confirm.
stopifnot(
  length(ecosystem_variable1) == nsamples(my_phyloseq_filt_no_single)
)
sample_data(my_phyloseq_filt_no_single)$Ecosystem <- unlist(ecosystem_variable1)

## Separate DNA from RNA samples
new_DNA <- subset_samples(my_phyloseq_filt_no_single, TYPE %in% "DNA")
new_RNA <- subset_samples(my_phyloseq_filt_no_single, TYPE %in% "RNA")

### After the separation, remove once again the ASVs whose total count is
### not greater than 1 within each subset.
new_DNA <- filter_taxa(new_DNA, function(x) sum(x) > 1, prune = TRUE)
new_RNA <- filter_taxa(new_RNA, function(x) sum(x) > 1, prune = TRUE)

## Normalize data: convert the counts of each sample to relative abundances

new_DNA.relative <- transform_sample_counts(new_DNA, function(otu) otu / sum(otu))
new_RNA.relative <- transform_sample_counts(new_RNA, function(otu) otu / sum(otu))
#my_phyloseq_filt_no_single.relative <- transform_sample_counts(my_phyloseq_filt_no_single.relative, function(otu) otu/sum(otu))

### Polynucleobacter barplot

# Keep only the taxa assigned to the genus Polynucleobacter
poly.DNA <- subset_taxa(new_DNA.relative, Genus == "Polynucleobacter")
poly.RNA <- subset_taxa(new_RNA.relative, Genus == "Polynucleobacter")

# Stacked bar plot of a phyloseq object, drawn without the black outlines
# around the bar segments.
#
# physeq     object handed to psmelt() to obtain the long-format table
# x, y, fill names (strings) of the melted columns mapped to the x axis,
#            the bar height and the fill colour; fill may be NULL
# title      optional plot title
# facet_grid optional facetting specification passed on to facet_grid()
#
# Returns the ggplot object.
my_plot_bar <- function(physeq, x = "Sample", y = "Abundance", fill = NULL,
                        title = NULL, facet_grid = NULL) {
  melted <- psmelt(physeq)
  bar_plot <- ggplot(melted, aes_string(x = x, y = y, fill = fill)) +
    geom_bar(stat = "identity", position = "stack") +
    theme(axis.text.x = element_text(angle = -90, hjust = 0))
  # The argument `facet_grid` holds the specification; the call below still
  # resolves to the facet_grid() function.
  if (!is.null(facet_grid)) {
    bar_plot <- bar_plot + facet_grid(facet_grid)
  }
  if (!is.null(title)) {
    bar_plot <- bar_plot + ggtitle(title)
  }
  bar_plot
}

# Pool the Polynucleobacter relative abundances of the DNA samples by
# Water_color group and draw them as a stacked bar plot.
# NOTE(review): the NMDS sections further down re-enter Water_color by hand
# after merge_samples(), which suggests categorical sample variables are not
# preserved by the merge - confirm that fill = "Water_color" still shows the
# water types here.
poly.DNA_merged = merge_samples(poly.DNA, "Water_color")
my_plot_bar(poly.DNA_merged,  fill ="Water_color") #+ scale_fill_manual(values = palette_2)
# Row totals of the merged table (presumably one total per Water_color group).
a <- rowSums(otu_table(poly.DNA_merged))


# Same plot and totals for the RNA samples.
poly.RNA_merged = merge_samples(poly.RNA, "Water_color")
my_plot_bar(poly.RNA_merged,  fill ="Water_color") #+ scale_fill_manual(values = palette_2)
# NOTE: this overwrites the DNA totals stored in `a` above.
a <- rowSums(otu_table(poly.RNA_merged))


### Ecosystem variable
# PERMANOVA of the Bray-Curtis distances against the Ecosystem variable,
# within black waters, within white waters and over all samples.
# adonis2() is used because adonis() is deprecated in vegan.
# `metadata` is overwritten by every test. The tests are permutation based and
# no seed is set, so p-values can differ slightly between runs.

# In black waters

new_DNA_black <- subset_samples(new_DNA, Water_color %in% c("Black_water"))
metadata <- as(sample_data(new_DNA_black), "data.frame")
adonis2(phyloseq::distance(new_DNA_black, method = "bray") ~ Ecosystem,
        data = metadata)

new_RNA_black <- subset_samples(new_RNA, Water_color %in% c("Black_water"))
metadata <- as(sample_data(new_RNA_black), "data.frame")
adonis2(phyloseq::distance(new_RNA_black, method = "bray") ~ Ecosystem,
        data = metadata)

# In white waters

new_DNA_white <- subset_samples(new_DNA, Water_color %in% c("White_water"))
metadata <- as(sample_data(new_DNA_white), "data.frame")
adonis2(phyloseq::distance(new_DNA_white, method = "bray") ~ Ecosystem,
        data = metadata)

new_RNA_white <- subset_samples(new_RNA, Water_color %in% c("White_water"))
metadata <- as(sample_data(new_RNA_white), "data.frame")
adonis2(phyloseq::distance(new_RNA_white, method = "bray") ~ Ecosystem,
        data = metadata)

# In all waters

metadata <- as(sample_data(new_DNA), "data.frame")
adonis2(phyloseq::distance(new_DNA, method = "bray") ~ Ecosystem,
        data = metadata)

metadata <- as(sample_data(new_RNA), "data.frame")
adonis2(phyloseq::distance(new_RNA, method = "bray") ~ Ecosystem,
        data = metadata)


### PCoA - Bray-Curtis distance
# One PCoA per dataset, coloured and shaped by Water_color, with a filled
# normal ellipse per water type. Every section overwrites `ordu` (the
# ordination) and `p` (the plot).

# DNA samples
ordu <- ordinate(new_DNA, method = "PCoA", distance = "bray")
p <- plot_ordination(new_DNA, ordu,
                     color = "Water_color", shape = "Water_color") +
  geom_point(size = 5, alpha = 0.75) +
  scale_colour_brewer(type = "qual", palette = "Set1") +
  ggtitle("PCoA - BrayCurtis - new_DNA") +
  stat_ellipse(aes(fill = Water_color),
               geom = "polygon", type = "norm", alpha = 0.04) +
  theme_bw()
p

# RNA samples
ordu <- ordinate(new_RNA, method = "PCoA", distance = "bray")
p <- plot_ordination(new_RNA, ordu,
                     color = "Water_color", shape = "Water_color") +
  geom_point(size = 5, alpha = 0.75) +
  scale_colour_brewer(type = "qual", palette = "Set1") +
  ggtitle("PCoA - BrayCurtis - new_RNA") +
  stat_ellipse(aes(fill = Water_color),
               geom = "polygon", type = "norm", alpha = 0.04) +
  theme_bw()
p

# Full dataset (DNA and RNA samples together)
ordu <- ordinate(my_phyloseq_filt_no_single, method = "PCoA", distance = "bray")
p <- plot_ordination(my_phyloseq_filt_no_single, ordu,
                     color = "Water_color", shape = "Water_color") +
  geom_point(size = 5, alpha = 0.75) +
  scale_colour_brewer(type = "qual", palette = "Set1") +
  ggtitle("PCoA - BrayCurtis - my_phyloseq_filt_no_single") +
  stat_ellipse(aes(fill = Water_color),
               geom = "polygon", type = "norm", alpha = 0.04) +
  theme_bw()
p

### PERMANOVA tests (adonis2)
# Bray-Curtis distances against Water_color for the DNA samples, the RNA
# samples and the full dataset. adonis2() is used because adonis() is
# deprecated in vegan. `metadata` is overwritten by every test.

metadata <- as(sample_data(new_DNA), "data.frame")
adonis2(phyloseq::distance(new_DNA, method = "bray") ~ Water_color,
        data = metadata)

metadata <- as(sample_data(new_RNA), "data.frame")
adonis2(phyloseq::distance(new_RNA, method = "bray") ~ Water_color,
        data = metadata)

metadata <- as(sample_data(my_phyloseq_filt_no_single), "data.frame")
adonis2(phyloseq::distance(my_phyloseq_filt_no_single, method = "bray") ~ Water_color,
        data = metadata)


### NMDS - Bray-Curtis distance
# Relative abundances of the full (DNA + RNA) dataset. This object is needed
# below but is not created anywhere in the visible part of the script.
my_phyloseq_filt_no_single.relative <- transform_sample_counts(
  my_phyloseq_filt_no_single,
  function(otu) otu / sum(otu)
)

# Ordinate. All three ordinations use Bray-Curtis so that they match their
# object names and the plot titles below.
# NOTE(review): the RNA ordination previously used weighted UniFrac although
# it was named and plotted as Bray - confirm Bray-Curtis is the intended one.
new_DNA.nmds.bray <- ordinate(new_DNA.relative, method = "NMDS", distance = "bray")
new_RNA.nmds.bray <- ordinate(new_RNA.relative, method = "NMDS", distance = "bray")
my_phyloseq_filt_no_single.nmds.bray <- ordinate(
  my_phyloseq_filt_no_single.relative,
  method = "NMDS",
  distance = "bray"
)

# Plot ordination by site and water color
plot_ordination(new_DNA.relative, new_DNA.nmds.bray,
                color = "Site", shape = "Water_color",
                title = "Bray NMDS by sample")
plot_ordination(new_RNA.relative, new_RNA.nmds.bray,
                color = "Site", shape = "Water_color",
                title = "Bray NMDS by sample")
plot_ordination(my_phyloseq_filt_no_single.relative,
                my_phyloseq_filt_no_single.nmds.bray,
                color = "Site", shape = "Water_color",
                title = "Bray NMDS by sample")

# Plot ordination by water color, one plot per dataset. The former copies of
# this plot referred to `DNA.relative` and `ordination_DNA.nmds.bray`, which
# are not defined in the visible part of the script.
plot_ordination(new_DNA.relative, new_DNA.nmds.bray,
                color = "Water_color", title = "Bray NMDS by sample") +
  stat_ellipse()
plot_ordination(new_RNA.relative, new_RNA.nmds.bray,
                color = "Water_color", title = "Bray NMDS by sample") +
  stat_ellipse(level = 0.90)
plot_ordination(my_phyloseq_filt_no_single.relative,
                my_phyloseq_filt_no_single.nmds.bray,
                color = "Water_color", title = "Bray NMDS by sample") +
  stat_ellipse()

### Pairwise black versus white
# PERMANOVA of the Bray-Curtis distances against Water_color, restricted to
# the black-water and white-water samples. adonis2() is used because adonis()
# is deprecated in vegan.

new_DNA_blackwhite <- subset_samples(new_DNA, Water_color %in% c("Black_water", "White_water"))
new_RNA_blackwhite <- subset_samples(new_RNA, Water_color %in% c("Black_water", "White_water"))

metadata <- as(sample_data(new_DNA_blackwhite), "data.frame")
adonis2(phyloseq::distance(new_DNA_blackwhite, method = "bray") ~ Water_color,
        data = metadata)

metadata <- as(sample_data(new_RNA_blackwhite), "data.frame")
adonis2(phyloseq::distance(new_RNA_blackwhite, method = "bray") ~ Water_color,
        data = metadata)

### Network
# Export the abundant DNA ASVs, then compute pairwise Spearman correlations on
# a table read back from disk and keep the strong, significant positive ones.

# Select the most abundant ASVs (to speed the correlation calculation):
# keep taxa whose mean relative abundance across samples exceeds 0.0001.

Abund_DNA = filter_taxa(new_DNA.relative, function(x) mean(x) > 0.0001, TRUE)

### Export bacterioplankton ASV/OTU tables, tax tables and sample metadata in .csv files

Export_ASV_table_DNA <- as.data.frame(otu_table(Abund_DNA))
write.csv(Export_ASV_table_DNA, file="/Users/francois-etiennesylvain/Documents/Doctorat/new_ASV_table_abund_bacterioplankton_DNA.csv")
Export_tax_table_DNA <- as.data.frame(tax_table(Abund_DNA))
write.csv(Export_tax_table_DNA, file="/Users/francois-etiennesylvain/Documents/Doctorat/new_Tax_table_abund_bacterioplankton_DNA.csv")
Export_metadata_table_DNA <- as.data.frame(sample_data(Abund_DNA))
write.csv(Export_metadata_table_DNA, file="/Users/francois-etiennesylvain/Documents/Doctorat/new_Metadata_table_abund_bacterioplankton_DNA.csv")

### Compute Spearman correlations.
# NOTE(review): the table read here is not written by this script; presumably
# it was assembled by hand from the exports above - confirm.

DNA.network<-read.table(file="/Users/francois-etiennesylvain/Documents/Doctorat/new_Combined_ASV_table_abund_bacterioplankton_DNA.txt",  header=TRUE, row.names=1)

## Correct the dataframe format (the number in [] is the number of columns (i.e. ASVs) in the ASV table)
# NOTE(review): 1884 is hard-coded and must be updated by hand whenever the
# abundance filter above changes the number of retained ASVs.

DNA.network=sapply(DNA.network[1:1884],as.numeric)

# Calculate Spearman correlation values

corr_DNA <- rcorr(DNA.network, type="spearman")

# Extract, correct and filter p.values + correlations
# Each pair of columns is labelled "<row name>.<column name>"; only the lower
# triangle of the matrices is kept, so every pair appears once and the
# diagonal is excluded. P-values are Bonferroni-adjusted over those pairs.

gene <- colnames(corr_DNA$P)
Sample.comparison <- paste(gene[row(corr_DNA$P)], gene[col(corr_DNA$P)], sep=".")
i <- lower.tri(corr_DNA$P)
Dat <- data.frame(Sample.comparison[i], p.value=corr_DNA$P[i])
Dat$Bonferroni <- p.adjust(Dat$p.value, method="bonferroni")
# The labels and the triangle index are rebuilt from the correlation matrix.
gene <- colnames(corr_DNA$r)
Sample.comparison <- paste(gene[row(corr_DNA$r)], gene[col(corr_DNA$r)], sep=".")
i <- lower.tri(corr_DNA$r)
# The unnamed argument Dat$Bonferroni becomes the column `Dat.Bonferroni`,
# which the filter on the next line relies on.
Correlation <- data.frame(Sample.comparison[i], correlation.spearman=corr_DNA$r[i], Dat$Bonferroni)
# Keep positive correlations above 0.6 with an adjusted p-value below 0.05.
Filt_Bonferroni <- subset(Correlation, Dat.Bonferroni < 0.05 & correlation.spearman > 0.6)
write.csv(Filt_Bonferroni, file="/Users/francois-etiennesylvain/Documents/Doctorat/new_combined_p_values_Network_abund_DNA.csv")


### Make the list of functions found in Polynucleobacter sinensis from the total list of possible KEGG pathways
### + the KEGG pathways found in P. sinensis

KEGG_ALL <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/KEGG_ALL.txt", header = TRUE)
KEGG_POLY <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/KEGG_POLY.txt", header = TRUE)

# Keep the first row of every identifier. unique(x, by = ...) is data.table
# syntax: on a plain data.frame the `by` argument is silently ignored and
# only fully identical rows are dropped, so duplicated() on the key column is
# used instead.
KEGG_ALL_unique <- KEGG_ALL[!duplicated(KEGG_ALL$KEGG_ID), , drop = FALSE]
KEGG_POLY_unique <- KEGG_POLY[!duplicated(KEGG_POLY$KEGG_POLY), , drop = FALSE]

# Rows of the complete KEGG list whose identifier occurs in the P. sinensis list
POLY_list <- subset(KEGG_ALL_unique, KEGG_ID %in% KEGG_POLY_unique$KEGG_POLY)
write.csv(POLY_list, file = "/Users/francois-etiennesylvain/Documents/Doctorat/POLY_list.csv")


### Now check which of the lignin degradation pathways are found in the overall dataset

ALL_LIGNIN_DEGRAD <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/lignin_degradation_pathways.txt",
  header = TRUE
)
ALL_MY_PATHWAYS <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/DNARNA_KOtable_GENUS_wabundance.tsv",
  header = TRUE
)

# Rows of the dataset whose KO identifier is listed among the lignin
# degradation pathways.
MY_LIGNIN <- ALL_MY_PATHWAYS[
  ALL_MY_PATHWAYS$KO_ID %in% ALL_LIGNIN_DEGRAD$KEGG_ID, ,
  drop = FALSE
]
write.csv(MY_LIGNIN, file = "/Users/francois-etiennesylvain/Documents/Doctorat/MY_LIGNIN_PATHWAYS.csv")

### If we assume that pathways actively playing a role in HUMIC degradation should be correlated to humic abundance,
### let's make a list of these pathways...

MY_LIGNIN_HUMIC_DNA <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/MY_LIGNIN_HUMIC_DNA.txt", header = TRUE, row.names = 1)
MY_LIGNIN_HUMIC_RNA <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/MY_LIGNIN_HUMIC_RNA.txt", header = TRUE, row.names = 1)

# Remove columns with a sum of zero (possible because DNA and RNA samples were
# computed together at first and then separated)
MY_LIGNIN_HUMIC_DNA <- MY_LIGNIN_HUMIC_DNA[, which(colSums(MY_LIGNIN_HUMIC_DNA) != 0)]
MY_LIGNIN_HUMIC_RNA <- MY_LIGNIN_HUMIC_RNA[, which(colSums(MY_LIGNIN_HUMIC_RNA) != 0)]

# Correlations DNA samples
# NOTE(review): 481 is hard-coded and must equal the number of columns that
# remain after the zero-sum filter above.
MY_LIGNIN_HUMIC_DNA <- sapply(MY_LIGNIN_HUMIC_DNA[1:481], as.numeric)
corr_DNA <- rcorr(MY_LIGNIN_HUMIC_DNA, type = "spearman")
# Label each pair of columns "<row name>.<column name>" and keep the lower
# triangle only, so every pair appears once and the diagonal is excluded.
gene <- colnames(corr_DNA$P)
Sample.comparison <- paste(gene[row(corr_DNA$P)], gene[col(corr_DNA$P)], sep = ".")
i <- lower.tri(corr_DNA$P)
Dat <- data.frame(Sample.comparison[i], p.value = corr_DNA$P[i])
Dat$Bonferroni <- p.adjust(Dat$p.value, method = "bonferroni")
gene <- colnames(corr_DNA$r)
Sample.comparison <- paste(gene[row(corr_DNA$r)], gene[col(corr_DNA$r)], sep = ".")
i <- lower.tri(corr_DNA$r)
# The unnamed argument Dat$Bonferroni becomes the column `Dat.Bonferroni`.
Correlation <- data.frame(Sample.comparison[i], correlation.spearman = corr_DNA$r[i], Dat$Bonferroni)
# NOTE(review): subset() without a condition keeps every row, so the DNA
# results are exported without the Bonferroni / correlation filter that is
# applied to the RNA results below - confirm this is intended.
Filt_Bonferroni <- subset(Correlation)
# Drop the pairs with a correlation of exactly 1. Pairs whose correlation is
# NA are kept as they are; the plain `!(r == 1)` index turned each of them
# into a row made entirely of NA.
Filt_Bonferroni <- Filt_Bonferroni[
  is.na(Filt_Bonferroni$correlation.spearman) |
    Filt_Bonferroni$correlation.spearman != 1,
]
write.csv(Filt_Bonferroni, file = "/Users/francois-etiennesylvain/Documents/Doctorat/combined_p_values_HUMIC_PATHWAYS_DNA.csv")

# Correlations RNA samples
# NOTE: the RNA result is stored in `corr_DNA` as well and therefore
# overwrites the DNA correlations computed above.
# NOTE(review): 580 is hard-coded in the same way as 481 above.
MY_LIGNIN_HUMIC_RNA <- sapply(MY_LIGNIN_HUMIC_RNA[1:580], as.numeric)
corr_DNA <- rcorr(MY_LIGNIN_HUMIC_RNA, type = "spearman")
gene <- colnames(corr_DNA$P)
Sample.comparison <- paste(gene[row(corr_DNA$P)], gene[col(corr_DNA$P)], sep = ".")
i <- lower.tri(corr_DNA$P)
Dat <- data.frame(Sample.comparison[i], p.value = corr_DNA$P[i])
Dat$Bonferroni <- p.adjust(Dat$p.value, method = "bonferroni")
gene <- colnames(corr_DNA$r)
Sample.comparison <- paste(gene[row(corr_DNA$r)], gene[col(corr_DNA$r)], sep = ".")
i <- lower.tri(corr_DNA$r)
Correlation <- data.frame(Sample.comparison[i], correlation.spearman = corr_DNA$r[i], Dat$Bonferroni)
# Keep positive correlations above 0.6 with an adjusted p-value below 0.05.
Filt_Bonferroni <- subset(Correlation, Dat.Bonferroni < 0.05 & correlation.spearman > 0.6)
write.csv(Filt_Bonferroni, file = "/Users/francois-etiennesylvain/Documents/Doctorat/new_combined_p_values_HUMIC_PATHWAYS_RNA.csv")

### Now we illustrate these results with a heatmap

# First load the correlation matrix (first column holds the row names)
LIGNIN_TAX_FUNC_CORR_DNA <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/LIGNIN_TAX_FUNC_CORR_DNA_MATRIX.txt",
  header = TRUE,
  row.names = 1
)
LIGNIN_TAX_FUNC_CORR_DNA_mat <- as.matrix(LIGNIN_TAX_FUNC_CORR_DNA)

# Then build the heatmap: values are not rescaled and neither rows nor
# columns are reordered.

# The transposed matrix is computed here but the heatmap below is drawn from
# the untransposed one.
LIGNIN_TAX_FUNC_CORR_DNA_mat_t <- t(LIGNIN_TAX_FUNC_CORR_DNA_mat)
heatmap(
  LIGNIN_TAX_FUNC_CORR_DNA_mat,
  Rowv = NA,
  Colv = NA,
  scale = "none",
  margins = c(11, 4),
  cexRow = 1.5,
  cexCol = 1
)


### We want to understand how the different water types affect the clustering
### of the microbiome: structure, expression profile and functional profile.
### To do so we compute and plot NMDS analyses.

# Combine the samples per site

new_DNA_merged <- merge_samples(new_DNA, group = "Site")
new_RNA_merged <- merge_samples(new_RNA, group = "Site")

# Relative abundances of the merged samples
new_DNA_merged.relative <- transform_sample_counts(new_DNA_merged, function(otu) otu / sum(otu))
new_RNA_merged.relative <- transform_sample_counts(new_RNA_merged, function(otu) otu / sum(otu))

# If the NMDS does not converge, simply re-run the command until it does
ordination_DNA.NMDS.unifrac <- ordinate(new_DNA_merged.relative, method = "NMDS", distance = "unifrac")
ordination_RNA.NMDS.unifrac <- ordinate(new_RNA_merged.relative, method = "NMDS", distance = "unifrac")

# Environmental variables fitted onto both ordinations
envfit_variables <- c(
  "DOC", "DOC_SAC340", "DOC_SUVA254", "DOC_abs254.365", "fulvic_like_DOC",
  "humic_like_DOC", "protein_like_DOC", "Na", "Mg", "K",
  "Ca", "Cl", "Nitrite", "Nitrate", "Silicate", "Chl_a",
  "Pheopigments", "Chla.DOC", "Temperature", "Conductivity",
  "pH", "Humic.tot", "fulvic.tot", "Al", "V", "Cr", "Mn", "Fe", "Co",
  "Ni", "Cu", "Zn", "As", "Cd", "Pb"
)

metadf_DNA_filt5 <- meta(new_DNA_merged)
metadata_fit_DNA <- envfit(ordination_DNA.NMDS.unifrac,
                           metadf_DNA_filt5[envfit_variables])

metadf_RNA_filt5 <- meta(new_RNA_merged)
metadata_fit_RNA <- envfit(ordination_RNA.NMDS.unifrac,
                           metadf_RNA_filt5[envfit_variables])

# Water colour of each merged DNA sample, entered by hand in the order of the
# rows of meta(new_DNA_merged).
meta_new_DNA <- meta(new_DNA_merged)
meta_new_DNA$Water_color <- c(
  "Black", "Clear", "Black", "White", "White",
  "Black", "White", "White", "Black", "White",
  "White", "Clear", "Clear", "Black", "White"
)

# NMDS DNA, 10 most impactful parameters
plot(ordination_DNA.NMDS.unifrac, type = "n", main = "Unifrac NMDS DNA")
#rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "gray98")
#legend(-3,0, legend=c("Black_water","Clear_water","White_water"), col=c("gray25", "lightskyblue", "burlywood"), pch=20, yjust = 0, y.intersp = 0.15, x.intersp = 0.15, bty = "n")
# Ellipses: one filled ellipse per water colour, drawn in the order
# Black, Clear, White.
water_palette <- c(Black = "gray25", Clear = "lightskyblue", White = "burlywood")
for (water_group in names(water_palette)) {
  ordiellipse(
    ordination_DNA.NMDS.unifrac,
    groups = meta_new_DNA$Water_color,
    display = "sites",
    kind = "se",
    conf = 0.95,
    label = FALSE,
    draw = "polygon",
    alpha = 150,
    lwd = 1,
    col = water_palette[[water_group]],
    border = water_palette[[water_group]],
    show.groups = water_group
  )
}
# Site points, filled with the colour of their water type
points(
  ordination_DNA.NMDS.unifrac,
  display = "sites",
  pch = 21,
  lwd = 1,
  cex = 2,
  col = "black",
  bg = unname(water_palette)[factor(meta_new_DNA$Water_color)]
)
#ordiellipse(ordination_DNA.nmds.bray, groups=metadata_df_DNA$Water_color, display="sites", kind="se", conf=0.99, label=FALSE, col="gray", draw="polygon", alpha=100, show.groups = c("NA"), border=FALSE)
# Add the fitted variables (this replaces any earlier `metadata_fit_DNA`)
metadf_DNA_filt5 <- meta(new_DNA_merged)
metadata_fit_DNA <- envfit(
  ordination_DNA.NMDS.unifrac,
  metadf_DNA_filt5[c("fulvic_like_DOC", "DOC", "DOC_SAC340", "humic_like_DOC",
                     "DOC_SUVA254", "Chl_a", "Chla.DOC", "pH", "Al", "Pb")]
)
plot(metadata_fit_DNA, cex = 0.0001, col = "black")


# Water colour of each merged RNA sample, entered by hand in the order of the
# rows of meta(new_RNA_merged).
meta_new_RNA <- meta(new_RNA_merged)
meta_new_RNA$Water_color <- c(
  "Black", "Clear", "Black", "White", "White",
  "Black", "White", "White", "Black", "White",
  "White", "Clear", "Clear", "Black", "White"
)

# NMDS RNA, 10 most impactful parameters
plot(ordination_RNA.NMDS.unifrac, type = "n", main = "Unifrac NMDS RNA")
#rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "gray98")
#legend(-3,0, legend=c("Black_water","Clear_water","White_water"), col=c("gray25", "lightskyblue", "burlywood"), pch=20, yjust = 0, y.intersp = 0.15, x.intersp = 0.15, bty = "n")
# Ellipses: one filled ellipse per water colour, drawn in the order
# Black, Clear, White.
water_palette <- c(Black = "gray25", Clear = "lightskyblue", White = "burlywood")
for (water_group in names(water_palette)) {
  ordiellipse(
    ordination_RNA.NMDS.unifrac,
    groups = meta_new_RNA$Water_color,
    display = "sites",
    kind = "se",
    conf = 0.95,
    label = FALSE,
    draw = "polygon",
    alpha = 150,
    lwd = 1,
    col = water_palette[[water_group]],
    border = water_palette[[water_group]],
    show.groups = water_group
  )
}
# Site points, filled with the colour of their water type
points(
  ordination_RNA.NMDS.unifrac,
  display = "sites",
  pch = 21,
  lwd = 1,
  cex = 2,
  col = "black",
  bg = unname(water_palette)[factor(meta_new_RNA$Water_color)]
)
#ordiellipse(ordination_RNA.nmds.bray, groups=metadata_df_RNA$Water_color, display="sites", kind="se", conf=0.99, label=FALSE, col="gray", draw="polygon", alpha=100, show.groups = c("NA"), border=FALSE)
# Add the fitted variables (this replaces any earlier `metadata_fit_RNA`)
metadf_RNA_filt5 <- meta(new_RNA_merged)
metadata_fit_RNA <- envfit(
  ordination_RNA.NMDS.unifrac,
  metadf_RNA_filt5[c("pH", "Cd", "DOC", "Ca", "Chla.DOC", "Conductivity",
                     "Al", "Ni", "Pb", "K")]
)
plot(metadata_fit_RNA, cex = 0.0001, col = "black")


# Before computing the NMDS for the functions, we need a functions table that
# can be used for this type of analysis. So we load a KEGG function per genus
# table to begin with and wrap it in a phyloseq object.

KEGG_ALL <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/KEGG_ALL.txt", header = TRUE)
# Keep the first row of every KEGG_ID. unique(x, by = ...) is data.table
# syntax: on a plain data.frame the `by` argument is silently ignored and
# only fully identical rows are dropped, so duplicated() on the key column is
# used instead.
KEGG_ALL_unique <- KEGG_ALL[!duplicated(KEGG_ALL$KEGG_ID), , drop = FALSE]

# The same file is read twice: once with KEGG_ID as a regular column (used to
# select the annotations) and once with the first column as row names (used
# as the abundance table).
function_genus <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/DNARNA_KOtable_GENUS.txt", header = TRUE)

function_genus_true <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/DNARNA_KOtable_GENUS.txt", header = TRUE, row.names = 1, check.names = FALSE)
metadata_function <- meta(my_phyloseq_filt_no_single)

# Annotation rows for the functions present in the abundance table; each
# KEGG_ID occurs once because KEGG_ALL_unique is already deduplicated.
function_taxonomy <- subset(KEGG_ALL_unique, KEGG_ID %in% function_genus$KEGG_ID)
# Use the first column as row names, then drop it from the table.
rownames(function_taxonomy) <- make.names(function_taxonomy[, 1], unique = TRUE)
function_taxonomy[, 1] <- NULL

functions_phyloseq <- phyloseq(
  otu_table(function_genus_true, taxa_are_rows = TRUE),
  sample_data(metadata_function),
  tax_table(as.matrix(function_taxonomy))
)

## Separate DNA from RNA in the functions dataset

DNA_functions <- subset_samples(functions_phyloseq, TYPE %in% "DNA")
#RNA_functions = subset_samples(functions_phyloseq, TYPE %in% "RNA") #we will use the DNA dataset in this project
DNA_functions_merged <- merge_samples(DNA_functions, group = "Site")
#RNA_functions_merged = merge_samples(new_RNA, group = "Site")  #we will use the DNA dataset in this project

# Re-run the next command if no solution is reached.
# Despite the "unifrac" in its name, this ordination uses Bray-Curtis.
#ordination_functions.NMDS.unifrac <- ordinate(DNA_functions_merged, method="NMDS", distance="bray")
ordination_functions.NMDS.unifrac <- ordinate(DNA_functions_merged, method = "NMDS", distance = "bray")

# Environmental variables fitted onto the functions ordination
functions_envfit_variables <- c(
  "DOC", "DOC_SAC340", "DOC_SUVA254", "DOC_abs254.365", "fulvic_like_DOC",
  "humic_like_DOC", "protein_like_DOC", "Na", "Mg", "K",
  "Ca", "Cl", "Nitrite", "Nitrate", "Silicate", "Chl_a",
  "Pheopigments", "Chla.DOC", "Temperature", "Conductivity",
  "pH", "Humic.tot", "fulvic.tot", "Al", "V", "Cr", "Mn", "Fe", "Co",
  "Ni", "Cu", "Zn", "As", "Cd", "Pb"
)

metadf_functions <- meta(DNA_functions_merged)
# NOTE: this reuses the name `metadata_fit_DNA` of the taxonomic DNA fit.
metadata_fit_DNA <- envfit(ordination_functions.NMDS.unifrac,
                           metadf_functions[functions_envfit_variables])

# Water colour of each merged functions sample, entered by hand in the order
# of the rows of metadf_functions.
metadf_functions$Water_color <- c(
  "Black", "Clear", "Black", "White", "White",
  "Black", "White", "White", "Black", "White",
  "White", "Clear", "Clear", "Black", "White"
)

# NMDS functions, 10 most impactful parameters
plot(
  ordination_functions.NMDS.unifrac$points,
  type = "n",
  main = "Bray NMDS functions",
  xlim = c(-0.5, 0.35),
  ylim = c(-0.20, 0.20)
)
#rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "gray98")
#legend(-3,0, legend=c("Black_water","Clear_water","White_water"), col=c("gray25", "lightskyblue", "burlywood"), pch=20, yjust = 0, y.intersp = 0.15, x.intersp = 0.15, bty = "n")
# Ellipses: one filled ellipse per water colour, drawn in the order
# Black, Clear, White.
water_palette <- c(Black = "gray25", Clear = "lightskyblue", White = "burlywood")
for (water_group in names(water_palette)) {
  ordiellipse(
    ordination_functions.NMDS.unifrac,
    groups = metadf_functions$Water_color,
    display = "sites",
    kind = "se",
    conf = 0.95,
    label = FALSE,
    draw = "polygon",
    alpha = 150,
    lwd = 1,
    col = water_palette[[water_group]],
    border = water_palette[[water_group]],
    show.groups = water_group
  )
}
# Site points, filled with the colour of their water type
points(
  ordination_functions.NMDS.unifrac,
  display = "sites",
  pch = 21,
  lwd = 1,
  cex = 2,
  col = "black",
  bg = unname(water_palette)[factor(metadf_functions$Water_color)]
)
#ordiellipse(ordination_DNA.nmds.bray, groups=metadata_df_DNA$Water_color, display="sites", kind="se", conf=0.99, label=FALSE, col="gray", draw="polygon", alpha=100, show.groups = c("NA"), border=FALSE)
# Add the fitted variables (this replaces any earlier `metadata_fit_DNA`)
metadata_fit_DNA <- envfit(
  ordination_functions.NMDS.unifrac,
  metadf_functions[c("DOC_SAC340", "fulvic_like_DOC", "humic_like_DOC", "Mg",
                     "K", "Ca", "Silicate", "Conductivity", "Al", "Pb")]
)
plot(metadata_fit_DNA, cex = 0.0001, col = "black")


# PERMANOVA TESTS (adonis2) FOR THE NMDS PLOTS
# Bray-Curtis distances against Water_color for the functions, DNA and RNA
# datasets (unmerged samples). adonis2() is used because adonis() is
# deprecated in vegan. The DNA and RNA tests repeat those of the earlier
# "###tests ADONIS" section. `metadata` is overwritten by every test.

metadata <- as(sample_data(DNA_functions), "data.frame")
adonis2(phyloseq::distance(DNA_functions, method = "bray") ~ Water_color,
        data = metadata)

metadata <- as(sample_data(new_DNA), "data.frame")
adonis2(phyloseq::distance(new_DNA, method = "bray") ~ Water_color,
        data = metadata)

metadata <- as(sample_data(new_RNA), "data.frame")
adonis2(phyloseq::distance(new_RNA, method = "bray") ~ Water_color,
        data = metadata)

### RANDOM FOREST TO DETECT ASVS ASSOCIATED TO BLACK WATER

library(randomForest)

# Format data for random forest: black versus others, DNA1
new_DNA1 <- new_DNA
# Recode Water_color into two classes: "Black_water" and "White_or_Clear"
# (every other value, including missing ones). The former
# factor(levels = list(...)) call only produced these labels because the
# unmatched values became NA and were then overwritten.
sample_data(new_DNA1)$Water_color <- ifelse(
  as.character(sample_data(new_DNA1)$Water_color) %in% "Black_water",
  "Black_water",
  "White_or_Clear"
)
## Implement Random forest algorithm DNA1
# Prepare data for RF: keep the taxa whose mean count per sample exceeds
# prunescale * minlib (= 5).
ntaxa(new_DNA1)
prunescale <- 0.001
minlib <- 5000
tax.mean <- taxa_sums(new_DNA1) / nsamples(new_DNA1)
all_muc_prune <- prune_taxa(tax.mean > prunescale * minlib, new_DNA1)
ntaxa(all_muc_prune)
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
# NOTE(review): data.frame() makes column names syntactically valid; if any
# taxon name is changed by that step it will not match the tax_table row
# names used further down - confirm the taxa names are unaffected.
rf.data <- data.frame(response, predictors)
# Compute RF. Implements Breiman's random forest algorithm (based on Breiman and Cutler's original Fortran code) for classification and regression.
set.seed(2)
all_muc.classify <- randomForest(response ~ ., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)
# Isolate the most important variables (importance is measured by the mean
# decrease in Gini coefficient, a measure of node purity, due to that
# variable) and make a phyloseq object with the 40 most important ones.
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# Top 40 predictors (the object keeps its historical name imp.50). head()
# returns fewer rows when fewer than 40 predictors exist instead of padding
# the table with NA rows.
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_DNA1.txt")
otunames <- imp.50$predictors
r <- rownames(tax_table(new_DNA1)) %in% otunames
purif_otus <- tax_table(new_DNA1)[r, ]
write.table(tax_table(new_DNA1)[r, ], file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_DNA1.txt")
RF_DNA1 <- subset_taxa(new_DNA1, row.names(tax_table(new_DNA1)) %in% row.names(purif_otus))

# Format data for random forest: black versus others, RNA1
new_RNA1 <- new_RNA
# Recode Water_color into two classes: "Black_water" and "White_or_Clear"
# (every other value, including missing ones). The former
# factor(levels = list(...)) call only produced these labels because the
# unmatched values became NA and were then overwritten.
sample_data(new_RNA1)$Water_color <- ifelse(
  as.character(sample_data(new_RNA1)$Water_color) %in% "Black_water",
  "Black_water",
  "White_or_Clear"
)
## Implement Random forest algorithm RNA1
# Prepare data for RF: keep the taxa whose mean count per sample exceeds
# prunescale * minlib (= 5).
ntaxa(new_RNA1)
prunescale <- 0.001
minlib <- 5000
tax.mean <- taxa_sums(new_RNA1) / nsamples(new_RNA1)
all_muc_prune <- prune_taxa(tax.mean > prunescale * minlib, new_RNA1)
ntaxa(all_muc_prune)
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
# NOTE(review): data.frame() makes column names syntactically valid; if any
# taxon name is changed by that step it will not match the tax_table row
# names used further down - confirm the taxa names are unaffected.
rf.data <- data.frame(response, predictors)
# Compute RF. Implements Breiman's random forest algorithm (based on Breiman and Cutler's original Fortran code) for classification and regression.
set.seed(2)
all_muc.classify <- randomForest(response ~ ., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)
# Isolate the most important variables (importance is measured by the mean
# decrease in Gini coefficient, a measure of node purity, due to that
# variable) and make a phyloseq object with the 40 most important ones.
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# Top 40 predictors (the object keeps its historical name imp.50). head()
# returns fewer rows when fewer than 40 predictors exist instead of padding
# the table with NA rows.
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_RNA1.txt")
otunames <- imp.50$predictors
r <- rownames(tax_table(new_RNA1)) %in% otunames
purif_otus <- tax_table(new_RNA1)[r, ]
write.table(tax_table(new_RNA1)[r, ], file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_RNA1.txt")
RF_RNA1 <- subset_taxa(new_RNA1, row.names(tax_table(new_RNA1)) %in% row.names(purif_otus))

# Format data for random forest: black versus others, DETAILED FUNCTIONS
DNA_functions1 <- DNA_functions
# Recode Water_color into two classes: "Black_water" and "White_or_Clear"
# (every other value, including missing ones). The former
# factor(levels = list(...)) call only produced these labels because the
# unmatched values became NA and were then overwritten.
sample_data(DNA_functions1)$Water_color <- ifelse(
  as.character(sample_data(DNA_functions1)$Water_color) %in% "Black_water",
  "Black_water",
  "White_or_Clear"
)
## Implement Random forest algorithm for the DNA functions
# Prepare data for RF: keep the functions whose mean count per sample exceeds
# prunescale * minlib (= 5).
ntaxa(DNA_functions1)
prunescale <- 0.001
minlib <- 5000
tax.mean <- taxa_sums(DNA_functions1) / nsamples(DNA_functions1)
all_muc_prune <- prune_taxa(tax.mean > prunescale * minlib, DNA_functions1)
ntaxa(all_muc_prune)
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
# NOTE(review): data.frame() makes column names syntactically valid; if any
# function name is changed by that step it will not match the tax_table row
# names used further down - confirm the names are unaffected.
rf.data <- data.frame(response, predictors)
# Compute RF. Implements Breiman's random forest algorithm (based on Breiman and Cutler's original Fortran code) for classification and regression.
set.seed(2)
all_muc.classify <- randomForest(response ~ ., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)
# Isolate the most important variables (importance is measured by the mean
# decrease in Gini coefficient, a measure of node purity, due to that
# variable) and make a phyloseq object with the 40 most important ones.
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# Top 40 predictors (the object keeps its historical name imp.50). head()
# returns fewer rows when fewer than 40 predictors exist instead of padding
# the table with NA rows.
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_DNA_functions.txt")
otunames <- imp.50$predictors
r <- rownames(tax_table(DNA_functions1)) %in% otunames
purif_otus <- tax_table(DNA_functions1)[r, ]
write.table(tax_table(DNA_functions1)[r, ], file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_DNA_functions.txt")
RF_functions1 <- subset_taxa(DNA_functions1, row.names(tax_table(DNA_functions1)) %in% row.names(purif_otus))

### Format data for random forest: black versus others, LARGE SCALE FUNCTIONS
function_genus_true <- read.table(file="/Users/francois-etiennesylvain/Documents/Doctorat/large_scale_functions_DNA2.txt",  header=TRUE, row.names = 1, check.names = FALSE)
# Write the sample metadata and the function names to disk; both files are
# read back below to assemble the phyloseq object.
Export_Nsum_OTU_table_all_mucus <- as.data.frame(sample_data(new_DNA))
write.table(Export_Nsum_OTU_table_all_mucus, file="/Users/francois-etiennesylvain/Documents/Doctorat/meta_new_DNA.txt")
write.table(row.names(function_genus_true), file="/Users/francois-etiennesylvain/Documents/Doctorat/taxa_new_DNA.txt")

# function_genus_true is not modified by the export above, so the abundance
# table is not read from disk a second time.
metadata_function <- read.table(file="/Users/francois-etiennesylvain/Documents/Doctorat/meta_new_DNA.txt",  header=TRUE, row.names = 1, check.names = FALSE)
function_taxonomy <- read.table(file="/Users/francois-etiennesylvain/Documents/Doctorat/taxa_new_DNA.txt",  header=TRUE, row.names = 1, check.names = FALSE)
# Give the abundance table the row names of the taxonomy table so that both
# carry identical taxa identifiers.
rownames(function_genus_true) <- rownames(function_taxonomy)

large_functions_phyloseq <- phyloseq(otu_table(as.matrix(function_genus_true), taxa_are_rows=TRUE), sample_data(metadata_function), tax_table(as.matrix(function_taxonomy)))

large_functions_phyloseq1 <- large_functions_phyloseq
# Collapse Water_color to two classes: "Black_water" is kept, every other
# value (missing values included) becomes "White_or_Clear".
# The previous code passed a named list to factor(levels = ...), which is the
# syntax of `levels(x) <- list(...)`, not of factor(); it only produced these
# two classes because unmatched values turned into NA and were relabelled.
# The explicit recode below yields the same result.
water_color_recoded <- as.character(sample_data(large_functions_phyloseq1)$Water_color)
water_color_recoded[is.na(water_color_recoded) | water_color_recoded != "Black_water"] <- "White_or_Clear"
sample_data(large_functions_phyloseq1)$Water_color <- water_color_recoded

# Compute random forest analysis on the large scale functions dataset
# Keep the taxa whose mean count per sample exceeds prunescale * minlib
ntaxa(large_functions_phyloseq1)
prunescale <- 0.001
minlib <- 5000
tax.mean <- taxa_sums(large_functions_phyloseq1)/nsamples(large_functions_phyloseq1)
all_muc_prune <- prune_taxa(tax.mean > prunescale*minlib, large_functions_phyloseq1)
ntaxa(all_muc_prune)
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
rf.data <- data.frame(response, predictors)
# Compute RF. Implements Breiman's random forest algorithm (based on Breiman and Cutler's original Fortran code) for classification and regression.
set.seed(2)
all_muc.classify <- randomForest(response~., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)
# Rank the predictors by importance (mean decrease in Gini coefficient, a
# measure of node purity) and keep the 40 top-ranked predictors.
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# head() stops at the available rows, whereas `[1:40, ]` pads the table with
# all-NA rows when fewer than 40 predictors exist.
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file="/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_DNA_functions_large_scale.txt")
otunames <- imp.50$predictors
#r <- rownames(tax_table(large_functions_phyloseq1)) %in% otunames
#purif_otus = tax_table(large_functions_phyloseq1)[r, ]
#write.table(tax_table(large_functions_phyloseq1)[r, ], file="/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_DNA_functions_large_scale.txt") 
#RF_DNA1 = subset_taxa(large_functions_phyloseq1, row.names(tax_table(large_functions_phyloseq1)) %in% row.names(purif_otus))

### Sort through the results: several functions belong to humans, so select the relevant ones manually and build an ASV table from them

# Pull the OTU table, taxonomy table and sample metadata out of each
# random-forest phyloseq object, then write every table to a text file.

# DNA
otu_table_RF_DNA1 <- as.data.frame(otu_table(RF_DNA1))
tax_table_RF_DNA1 <- as.data.frame(tax_table(RF_DNA1))
meta_table_RF_DNA1 <- as.data.frame(sample_data(RF_DNA1))
write.table(otu_table_RF_DNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_otu_DNA.txt")
write.table(tax_table_RF_DNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_DNA.txt")
write.table(meta_table_RF_DNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_meta_DNA.txt")

# RNA
otu_table_RF_RNA1 <- as.data.frame(otu_table(RF_RNA1))
tax_table_RF_RNA1 <- as.data.frame(tax_table(RF_RNA1))
meta_table_RF_RNA1 <- as.data.frame(sample_data(RF_RNA1))
write.table(otu_table_RF_RNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_otu_RNA.txt")
write.table(tax_table_RF_RNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_RNA.txt")
write.table(meta_table_RF_RNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_meta_RNA.txt")

# Detailed functions
otu_table_RF_functions <- as.data.frame(otu_table(RF_functions1))
tax_table_RF_functions <- as.data.frame(tax_table(RF_functions1))
meta_table_RF_functions <- as.data.frame(sample_data(RF_functions1))
write.table(otu_table_RF_functions, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_otu_functions.txt")
write.table(tax_table_RF_functions, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_functions.txt")
write.table(meta_table_RF_functions, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_meta_functions.txt")

# The exported OTU tables were reformatted by hand before this step: water type
# and ASV taxonomy were added, and site 2 was changed from black to clear water.

# Build heatmaps (25-step grey palette; rows scaled; columns not clustered)
library(RColorBrewer)
coul <- colorRampPalette(brewer.pal(8, "Greys"))(25)

# DNA
combined_RF_DNA <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_combined_otu_tax_DNA.txt",
  header = TRUE, row.names = 1, check.names = FALSE
)
combined_RF_DNA_mat <- as.matrix(t(combined_RF_DNA))
heatmap(
  combined_RF_DNA_mat,
  scale = "row", Colv = NA, cexCol = 0.5, cexRow = 0.5,
  margins = c(11, 11), col = coul
)

# RNA
combined_RF_RNA <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/combined_RF_otu_RNA.txt",
  header = TRUE, row.names = 1, check.names = FALSE
)
combined_RF_RNA_mat <- as.matrix(t(combined_RF_RNA))
heatmap(
  combined_RF_RNA_mat,
  scale = "row", Colv = NA, cexCol = 0.5, cexRow = 0.5,
  margins = c(11, 11), col = coul
)

# Functions
combined_RF_functions <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/combined_RF_otu_functions.txt",
  header = TRUE, row.names = 1, check.names = FALSE
)
combined_RF_functions_mat <- as.matrix(t(combined_RF_functions))
heatmap(
  combined_RF_functions_mat,
  scale = "row", Colv = NA, cexCol = 0.5, cexRow = 0.5,
  margins = c(11, 11), col = coul
)

### Now we want to check whether DOC is a major driver of community structure / transcription
### Correlation with the functional profile is done in the heatmap/lignin cycle analysis

### first we compute ordisurfs

# ordisurf DNA

meta_new_DNA <- meta(new_DNA_merged)
# Water colour of each sample, hard-coded.
# NOTE(review): assumes one entry per row of meta_new_DNA, in row order - confirm whenever the sample set changes.
meta_new_DNA$Water_color <- c("Black","Clear", "Black", "White", "White", "Black", "White", "White", "Black", "White", "White", "Clear", "Clear", "Black", "White")
groups <- meta_new_DNA$Water_color # grouping information used for the point fill

# NOTE(review): `$point` presumably resolves to the `points` element through partial matching - confirm.
df <- data.frame(x=ordination_DNA.NMDS.unifrac$point[,1],y=ordination_DNA.NMDS.unifrac$point[,2],Groups=groups)
# Add a dummy variable corresponding to the selected variable
meta_new_DNA$var <- meta_new_DNA$Humic.tot # FES: maybe his format is better

# Fit a surface for the selected variable onto the ordination
ordi <- vegan::ordisurf(ordination_DNA.NMDS.unifrac,meta_new_DNA$var ,plot = FALSE, bs="ds")
ordi.grid <- ordi$grid # fitted grid stored in the ordisurf result
#str(ordi.grid) #it's a list though - cannot be plotted as is
ordi.mite <- expand.grid(x = ordi.grid$x, y = ordi.grid$y) # all x/y combinations
ordi.mite$z <- as.vector(ordi.grid$z) # unravel the z matrix into one column
ordi.mite.na <- data.frame(na.omit(ordi.mite)) # drop grid cells without a fitted value

# Make the plot
# `position` was misspelled `positon`, so the argument was not recognised;
# spelled correctly it is passed to stat_contour as intended.
p <- ggplot2::ggplot()+stat_contour(data = ordi.mite.na, aes(x = x, y = y, z = z, colour = ..level..), size = 1.8, position="identity") #can change the binwidth depending on how many contours you want
p <- p+ ggplot2::geom_point(data=df,aes(x,y,fill=Groups),pch=21,size=12)
p <- p+ ggplot2::scale_fill_manual(values = c("gray25", "lightskyblue", "burlywood"))
p <- p+ ggplot2::scale_colour_continuous(high = "red4", low = "yellow2") #here we set the high and low of the colour scale.  Can delete to go back to the standard blue, or specify others
#p<-p+ ggplot2::labs(colour = paste(env.variable)) #another way to set the labels, in this case, for the colour legend
p <- p+ ggplot2::theme_bw()
p

# ordisurf RNA

meta_new_RNA <- meta(new_RNA_merged)
# Water colour of each sample, hard-coded.
# NOTE(review): assumes one entry per row of meta_new_RNA, in row order - confirm whenever the sample set changes.
meta_new_RNA$Water_color <- c("Black","Clear", "Black", "White", "White", "Black", "White", "White", "Black", "White", "White", "Clear", "Clear", "Black", "White")
groups <- meta_new_RNA$Water_color # grouping information used for the point fill

# NOTE(review): `$point` presumably resolves to the `points` element through partial matching - confirm.
df <- data.frame(x=ordination_RNA.NMDS.unifrac$point[,1],y=ordination_RNA.NMDS.unifrac$point[,2],Groups=groups)
# Add a dummy variable corresponding to the selected variable
meta_new_RNA$var <- meta_new_RNA$humic_like_DOC # FES: maybe his format is better

# Fit a surface for the selected variable onto the ordination
ordi <- vegan::ordisurf(ordination_RNA.NMDS.unifrac,meta_new_RNA$var ,plot = FALSE, bs="ds")
ordi.grid <- ordi$grid # fitted grid stored in the ordisurf result
#str(ordi.grid) #it's a list though - cannot be plotted as is
ordi.mite <- expand.grid(x = ordi.grid$x, y = ordi.grid$y) # all x/y combinations
ordi.mite$z <- as.vector(ordi.grid$z) # unravel the z matrix into one column
ordi.mite.na <- data.frame(na.omit(ordi.mite)) # drop grid cells without a fitted value

# Make the plot
# `position` was misspelled `positon`, so the argument was not recognised;
# spelled correctly it is passed to stat_contour as intended.
p <- ggplot2::ggplot()+stat_contour(data = ordi.mite.na, aes(x = x, y = y, z = z, colour = ..level..), size = 1.8, position="identity") #can change the binwidth depending on how many contours you want
p <- p+ ggplot2::geom_point(data=df,aes(x,y,fill=Groups),pch=21,size=12)
p <- p+ ggplot2::scale_fill_manual(values = c("gray25", "lightskyblue", "burlywood"))
p <- p+ ggplot2::scale_colour_continuous(high = "red4", low = "yellow2") #here we set the high and low of the colour scale.  Can delete to go back to the standard blue, or specify others
#p<-p+ ggplot2::labs(colour = paste(env.variable)) #another way to set the labels, in this case, for the colour legend
p <- p+ ggplot2::theme_bw()
p


# Convert counts to within-sample proportions
DNA.relative <- transform_sample_counts(new_DNA, function(x) x / sum(x))
RNA.relative <- transform_sample_counts(new_RNA, function(x) x / sum(x))

### Keep only the most abundant ASVs (speeds up the correlation calculation):
### taxa whose mean relative abundance exceeds the threshold
Abund_DNA <- filter_taxa(DNA.relative, function(x) mean(x) > 0.00075, TRUE)
Abund_RNA <- filter_taxa(RNA.relative, function(x) mean(x) > 0.00025, TRUE)

### Export the bacterioplankton ASV/OTU, taxonomy and metadata tables as .csv files

# DNA
Export_ASV_table_DNA <- as.data.frame(otu_table(Abund_DNA))
Export_tax_table_DNA <- as.data.frame(tax_table(Abund_DNA))
Export_metadata_table_DNA <- as.data.frame(sample_data(Abund_DNA))
write.csv(Export_ASV_table_DNA, file = "ASV_table_abund_bacterioplankton_DNA_MARCH2021.csv")
write.csv(Export_tax_table_DNA, file = "Tax_table_abund_bacterioplankton_DNA_MARCH2021.csv")
write.csv(Export_metadata_table_DNA, file = "Metadata_table_abund_bacterioplankton_DNA_MARCH2021.csv")

# RNA
Export_ASV_table_RNA <- as.data.frame(otu_table(Abund_RNA))
Export_tax_table_RNA <- as.data.frame(tax_table(Abund_RNA))
Export_metadata_table_RNA <- as.data.frame(sample_data(Abund_RNA))
write.csv(Export_ASV_table_RNA, file = "ASV_table_abund_bacterioplankton_RNA_MARCH2021.csv")
write.csv(Export_tax_table_RNA, file = "Tax_table_abund_bacterioplankton_RNA_MARCH2021.csv")
write.csv(Export_metadata_table_RNA, file = "Metadata_table_abund_bacterioplankton_RNA_MARCH2021.csv")


### Spearman correlation networks

# Read the combined tables (file paths are relative to the working directory)
DNA.network <- read.table(
  file = "Combined_ASV_table_abund_bacterioplankton_DNA_MARCH2021.txt",
  header = TRUE, row.names = 1
)
RNA.network <- read.table(
  file = "combined_ASV_table_abund_bacterioplankton_RNA_MARCH2021.txt",
  header = TRUE, row.names = 1
)

## Coerce the leading columns to numeric. The upper index is the number of
## columns (i.e. ASVs) in the ASV table and must be updated with the input.
DNA.network <- sapply(DNA.network[1:133], as.numeric)
RNA.network <- sapply(RNA.network[1:646], as.numeric)

# Pairwise Spearman correlations
corr_DNA <- rcorr(DNA.network, type = "spearman")
corr_RNA <- rcorr(RNA.network, type = "spearman")

# For each data set: label every pair from the lower triangle, apply a
# Bonferroni correction to the p-values, then keep the pairs with an adjusted
# p-value below 0.05 and a correlation above 0.5.

# --- DNA ---
gene <- colnames(corr_DNA$P)
Sample.comparison <- paste(
  gene[row(corr_DNA$P)], gene[col(corr_DNA$P)],
  sep = "."
)
i <- lower.tri(corr_DNA$P)
Dat <- data.frame(Sample.comparison[i], p.value = corr_DNA$P[i])
Dat$Bonferroni <- p.adjust(Dat$p.value, method = "bonferroni")
gene <- colnames(corr_DNA$r)
Sample.comparison <- paste(
  gene[row(corr_DNA$r)], gene[col(corr_DNA$r)],
  sep = "."
)
i <- lower.tri(corr_DNA$r)
# The unnamed `Dat$Bonferroni` argument becomes column `Dat.Bonferroni`,
# which the subset() call below relies on.
Correlation <- data.frame(
  Sample.comparison[i],
  correlation.spearman = corr_DNA$r[i],
  Dat$Bonferroni
)
Filt_Bonferroni <- subset(
  Correlation,
  Dat.Bonferroni < 0.05 & correlation.spearman > 0.5
)
write.csv(Filt_Bonferroni, file = "combined_p_values_Network_abund_DNA_MARCH2021.csv")


# --- RNA ---
gene <- colnames(corr_RNA$P)
Sample.comparison <- paste(
  gene[row(corr_RNA$P)], gene[col(corr_RNA$P)],
  sep = "."
)
i <- lower.tri(corr_RNA$P)
Dat <- data.frame(Sample.comparison[i], p.value = corr_RNA$P[i])
Dat$Bonferroni <- p.adjust(Dat$p.value, method = "bonferroni")
gene <- colnames(corr_RNA$r)
Sample.comparison <- paste(
  gene[row(corr_RNA$r)], gene[col(corr_RNA$r)],
  sep = "."
)
i <- lower.tri(corr_RNA$r)
Correlation <- data.frame(
  Sample.comparison[i],
  correlation.spearman = corr_RNA$r[i],
  Dat$Bonferroni
)
Filt_Bonferroni <- subset(
  Correlation,
  Dat.Bonferroni < 0.05 & correlation.spearman > 0.5
)
write.csv(Filt_Bonferroni, file = "combined_p_values_Network_abund_RNA_MARCH2021.csv")


#### RAREFACTION ANALYSIS

### Modified script from https://github.com/joey711/phyloseq/issues/143

psdata <- new_RNA

#set.seed(42)

#' Compute alpha-diversity values over a series of rarefaction depths.
#'
#' For every entry of `depths`, the samples with fewer reads than that depth
#' are dropped, the remaining samples are rarefied to the depth, and the
#' requested richness measures are estimated. A depth that no sample reaches
#' contributes no rows.
#'
#' @param psdata A phyloseq object.
#' @param measures Measure name(s) passed to `estimate_richness()`.
#' @param depths Numeric vector of rarefaction depths; repeat a value to get
#'   several rarefaction replicates at that depth.
#' @return A data frame in long format with columns `Depth` (numeric),
#'   `Sample`, `Measure` and `Alpha_diversity`.
calculate_rarefaction_curves <- function(psdata, measures, depths) {
  # library() stops with an error when a package is missing, whereas
  # require() only returns FALSE and lets the failure surface later.
  library('plyr') # ldply
  library('reshape2') # melt

  # Alpha diversity of all sufficiently deep samples at one rarefaction depth
  estimate_rarified_richness <- function(psdata, measures, depth) {
    # No sample reaches this depth: contribute nothing
    if (max(sample_sums(psdata)) < depth) {
      return()
    }
    psdata <- prune_samples(sample_sums(psdata) >= depth, psdata)

    rarified_psdata <- rarefy_even_depth(psdata, depth, verbose = FALSE)

    alpha_diversity <- estimate_richness(rarified_psdata, measures = measures)

    # as.matrix forces the use of melt.array, which includes the Sample names (rownames)
    melt(as.matrix(alpha_diversity), varnames = c('Sample', 'Measure'), value.name = 'Alpha_diversity')
  }

  names(depths) <- depths # this enables automatic addition of the Depth to the output by ldply
  # Show a text progress bar only in interactive sessions
  progress_style <- if (interactive()) 'text' else 'none'
  rarefaction_curve_data <- ldply(depths, estimate_rarified_richness, psdata = psdata, measures = measures, .id = 'Depth', .progress = progress_style)

  # convert Depth from factor to numeric
  rarefaction_curve_data$Depth <- as.numeric(levels(rarefaction_curve_data$Depth))[rarefaction_curve_data$Depth]

  rarefaction_curve_data
}

# Shannon diversity at depths 1, 10, 100, 1000 and 10000..1000000 (steps of
# 10000), with 10 rarefaction replicates per depth
#rarefaction_curve_data <- calculate_rarefaction_curves(psdata, c('Observed', 'Shannon'), rep(c(1, 10, 100, 1000, 1:100 * 10000), each = 10))
rarefaction_curve_data <- calculate_rarefaction_curves(psdata, 'Shannon', rep(c(1, 10, 100, 1000, 1:100 * 10000), each = 10))
summary(rarefaction_curve_data)

# Mean and standard deviation over the replicates of each depth/sample pair
rarefaction_curve_data_summary <- ddply(rarefaction_curve_data, c('Depth', 'Sample'), summarise, Alpha_diversity_mean = mean(Alpha_diversity), Alpha_diversity_sd = sd(Alpha_diversity))

# Strip only a leading "X" from the sample names. The former gsub("X", "")
# removed every "X" anywhere in a name, so a sample with an "X" elsewhere no
# longer matched its metadata row and dropped out of the merge below.
rarefaction_curve_data_summary$Sample <- sub("^X", "", rarefaction_curve_data_summary$Sample)

data1 <- data.frame(sample_data(psdata))
# NOTE(review): this substitution replaces '-' with '-' and therefore changes
# nothing; confirm whether another replacement character was intended.
row.names(data1) <- gsub('-', '-', row.names(data1))
data1

# Attach the sample metadata; samples without a matching row name are dropped
rarefaction_curve_data_summary_verbose <- merge(rarefaction_curve_data_summary, data1, by.x = 'Sample', by.y = 'row.names')

# One panel per site: mean alpha diversity +/- one standard deviation against depth
g <- ggplot(
  data = rarefaction_curve_data_summary_verbose,
  mapping = aes(
    x = Depth,
    y = Alpha_diversity_mean,
    ymin = Alpha_diversity_mean - Alpha_diversity_sd,
    ymax = Alpha_diversity_mean + Alpha_diversity_sd,
    colour = Site,
    group = Sample
  )
) + geom_line(
) + geom_pointrange(
) + facet_wrap(
  facets = ~Site,
  scales = 'free_y'
)

ggsave("/Users/francois-etiennesylvain/Documents/Doctorat/rarefaction_plot_bacterioplankton_RNA.jpg", plot = g, width = 24, height = 24, units = "in")


#### Look for lignin degradation pathways among the fungi of the metagenomic database

ALL_LIGNIN_DEGRAD <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/lignin_degradation_pathways.txt",
  header = TRUE
)
FUNGI_PATHWAYS <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/AMZN_TAX_GO_KO_FUNGI.txt",
  header = TRUE
)

# Keep the fungal records whose KO_ID appears in the KEGG_ID column of the
# lignin degradation table, then export them
MY_LIGNIN_FUNGI <- subset(FUNGI_PATHWAYS, KO_ID %in% ALL_LIGNIN_DEGRAD$KEGG_ID)
write.csv(MY_LIGNIN_FUNGI, file = "/Users/francois-etiennesylvain/Documents/Doctorat/FUNGI_LIGNIN_PATHWAYS.csv")


#### Compute RDA instead of NMDS = more accurate + we can control for VIF (variance inflation factors)

# Source = http://www.hiercourse.com/docs/microbial/04_betaDiversity_multTables.html

# RDA with the DNA data

# extract the OTU table (response matrix Y)
Y <- veganotu(new_DNA.relative)
# extract the sample data from the 'phyloseq' object
dat <- data.frame(sample_data(new_DNA.relative))
#dat <- select(dat, -SampleID)
# select the continuous variables to use as constraints (X), then standardise
X <- select_if(dat, is.numeric)
X <- decostand(X, method='standardize')
# RDA
#res <- rda(Y ~ ., data=X)
#rda_plot = plot(res, xlab=ord_labels(res)[1], ylab=ord_labels(res)[2])

# also test dbRDA
# note the result is the same as for RDA when using euclidean distances, so we will use bray here
res2 <- capscale(Y ~ ., data=X, distance='bray')
# The plot, the permutation test and the VIF filter below previously used
# `res`, which is only created further down (its rda() call above is commented
# out). They now use the dbRDA fit `res2`, whose axis labels were already
# being used for the plot.
dbrda_plot <- plot(res2, xlab=ord_labels(res2)[1], ylab=ord_labels(res2)[2])

# permutation test of statistical significance
anova(res2)

# calculate variance inflation factors; variables with scores >10 are redundant
sort(vif.cca(res2))
# calculate fit
envfit(Y ~ ., data=X)
# set up full and null models for 'ordistep': full model
rda1 <- rda(Y ~ ., data=X)
# intercept-only (null) model
rda0 <- rda(Y ~ 1, data=X)
# perform forward and backward selection of explanatory variables (output not shown)
step.env <- ordistep(rda0, scope=formula(rda1), direction='both')   ###REALLY LONG TO COMPUTE ####

# look at the significant variables
step.env$anova
# variable names retained by 'ordistep' (leading sign stripped) and variables with VIF <= 10
vars.ordistep <- gsub('^. ', '', rownames(step.env$anova))
vars.envfit <- names(which(vif.cca(res2) <= 10))
vars <- unique(c(vars.ordistep, vars.envfit))
# select the variables to keep from table 'X'
X1 <- X[, vars]
str(X1)

# RDA with fewer variables (selected variables)
res <- rda(Y ~ ., data=X1)
# summary of the results
summary(res, display=NULL)
# permutation test of statistical significance
anova(res)

# gridExtra is loaded here for the plotting section
library(gridExtra)

# Data frames used by the plot: site scores joined to the sample data,
# species scores joined to the taxonomy, and the biplot arrows
sit <- cbind(dat, scores(res, display = 'sites'))
spp <- cbind(data.frame(tax_table(new_DNA.relative)), scores(res, display = 'species'))
vec <- data.frame(scores(res, display = 'bp'))

# Multipliers controlling arrow length and arrow-label position
adj.vec <- 0.30  # 0.4
adj.txt <- 0.33  # 0.45

# Sites coloured and shaped by water colour, with constraint arrows on top.
# NOTE(review): the label vector is hard-coded and presumably has to follow the
# row order of `vec` - confirm after any change in the selected variables.
p1 <- ggplot(sit, aes(x = RDA1, y = RDA2, color = Water_color, shape = Water_color)) +
  scale_colour_manual(
    values = c("Black_water" = "gray25", "Clear_water" = "lightskyblue", "White_water" = "burlywood")
  ) +
  geom_point(size = 3) +
  geom_segment(
    data = vec, inherit.aes = FALSE, alpha = 0.5,
    mapping = aes(x = 0, y = 0, xend = adj.vec * RDA1, yend = adj.vec * RDA2),
    arrow = arrow(length = unit(0.2, 'cm'))
  ) +
  geom_text(
    data = vec, inherit.aes = FALSE, cex = 0.001,
    mapping = aes(
      x = adj.txt * RDA1, y = adj.txt * RDA2,
      label = c('Humic_DOC', 'DOC_Abs254/365', 'pH', 'Co', 'Zn', 'Cd', 'Mg', 'Total_DOC', 'Fulvic_DOC', 'DOC_SAC340', 'Conductivity', 'NO3', 'Na')
    )
  ) +
  theme_bw()
# Axis titles taken from the ordination
p1$labels$x <- ord_labels(res)[1]
p1$labels$y <- ord_labels(res)[2]
p1


###### RDA with RNA data

# extract the OTU table (response matrix Y2)
Y2 <- veganotu(new_RNA.relative)
# extract the sample data from the 'phyloseq' object
dat2 <- data.frame(sample_data(new_RNA.relative))
#dat <- select(dat, -SampleID)
# select the continuous variables to use as constraints (X2), then standardise
X2 <- select_if(dat2, is.numeric)
X2 <- decostand(X2, method='standardize')
# RDA
#res2 <- rda(Y2 ~ ., data=X2)
#rda_plot2 = plot(res2, xlab=ord_labels(res2)[1], ylab=ord_labels(res2)[2])

# also test dbRDA
# note the result is the same as for RDA when using euclidean distances, so we will use bray here
res3 <- capscale(Y2 ~ ., data=X2, distance='bray')
dbrda_plot <- plot(res3, xlab=ord_labels(res3)[1], ylab=ord_labels(res3)[2])

# permutation test of statistical significance
anova(res3)

# calculate variance inflation factors; variables with scores >10 are redundant
sort(vif.cca(res3))
# calculate fit
envfit(Y2 ~ ., data=X2)
# set up full and null models for 'ordistep': full model
rda12 <- rda(Y2 ~ ., data=X2)
# intercept-only (null) model
rda02 <- rda(Y2 ~ 1, data=X2)
# perform forward and backward selection of explanatory variables (output not shown)
# The ordistep() call was commented out while its result was still used
# below, so a fresh session failed on a missing `step.env2`. It is now run
# only when `step.env2` is not already available.  ###REALLY LONG TO COMPUTE ####
if (!exists("step.env2")) {
  step.env2 <- ordistep(rda02, scope=formula(rda12), direction='both')
}

# look at the significant variables
step.env2$anova
# variable names retained by 'ordistep' (leading sign stripped) and variables with VIF <= 10
vars.ordistep2 <- gsub('^. ', '', rownames(step.env2$anova))
vars.envfit2 <- names(which(vif.cca(res3) <= 10))
vars2 <- unique(c(vars.ordistep2, vars.envfit2))
# select the variables to keep from table 'X2'
X12 <- X2[, vars2]
str(X12)

# dbRDA with fewer variables (selected variables)
res3 <- capscale(Y2 ~ ., data=X12, distance='bray')
# summary of the results
summary(res3, display=NULL)
# permutation test of statistical significance
anova(res3)

# Data frames used by the plot: site scores joined to the sample data,
# species scores joined to the taxonomy, and the biplot arrows
sit2 <- cbind(dat2, scores(res3, display = 'sites'))
spp2 <- cbind(data.frame(tax_table(new_RNA.relative)), scores(res3, display = 'species'))
vec2 <- data.frame(scores(res3, display = 'bp'))

# Multipliers controlling arrow length and arrow-label position
adj.vec <- 3.0
adj.txt <- 3.3

# Sites coloured and shaped by water colour, with constraint arrows on top.
# NOTE(review): the label vector is hard-coded and presumably has to follow the
# row order of `vec2` - confirm after any change in the selected variables.
p2 <- ggplot(sit2, aes(x = CAP1, y = CAP2, color = Water_color, shape = Water_color)) +
  scale_colour_manual(
    values = c("Black_water" = "gray25", "Clear_water" = "lightskyblue", "White_water" = "burlywood")
  ) +
  geom_point(size = 3) +
  geom_segment(
    data = vec2, inherit.aes = FALSE, alpha = 0.5,
    mapping = aes(x = 0, y = 0, xend = adj.vec * CAP1, yend = adj.vec * CAP2),
    arrow = arrow(length = unit(0.2, 'cm'))
  ) +
  geom_text(
    data = vec2, inherit.aes = FALSE, cex = 0.00001,
    mapping = aes(
      x = adj.txt * CAP1, y = adj.txt * CAP2,
      label = c('Ca', 'DOC_Abs254/365', 'K', 'Fulvic_DOC', 'Mg', 'Protein_DOC', 'Co', 'Cd', 'Pb' )
    )
  ) +
  theme_bw()
# Axis titles taken from the ordination
p2$labels$x <- ord_labels(res3)[1]
p2$labels$y <- ord_labels(res3)[2]
p2


##### RDA with FUNCTIONS data

# Within-sample proportions of the functional table
DNA_functions.relative <- transform_sample_counts(DNA_functions, function(otu) otu / sum(otu))

# Response matrix (Y4) and sample data
Y4 <- veganotu(DNA_functions.relative)
dat4 <- data.frame(sample_data(DNA_functions.relative))
#dat <- select(dat, -SampleID)
# Constraints (X4): the numeric sample variables, standardised
X4 <- select_if(dat4, is.numeric)
X4 <- decostand(X4, method = 'standardize')
# RDA
#res4 <- rda(Y4 ~ ., data=X4)
#rda_plot4 = plot(res4, xlab=ord_labels(res4)[1], ylab=ord_labels(res4)[2])

# dbRDA on Bray distances (with euclidean distances the result would equal RDA)
res4 <- capscale(Y4 ~ ., data = X4, distance = 'bray')
dbrda_plot <- plot(
  res4,
  xlab = ord_labels(res4)[1],
  ylab = ord_labels(res4)[2]
)

# Permutation test of statistical significance
anova(res4)

# Variance inflation factors; variables scoring above 10 are redundant
sort(vif.cca(res4))
# Fit of the variables
envfit(Y4 ~ ., data = X4)
# Full and intercept-only (null) models for 'ordistep'
rda14 <- rda(Y4 ~ ., data = X4)
rda04 <- rda(Y4 ~ 1, data = X4)
# Forward and backward selection of explanatory variables (output not shown)
step.env4 <- ordistep(rda04, scope = formula(rda14), direction = 'both')   ###REALLY LONG TO COMPUTE ####

# Significant variables
step.env4$anova
# Variable names retained by 'ordistep' (leading sign stripped) and variables with VIF <= 10
vars.ordistep4 <- gsub('^. ', '', rownames(step.env4$anova))
vars.envfit4 <- names(which(vif.cca(res4) <= 10))
vars4 <- unique(c(vars.ordistep4, vars.envfit4))
# Variables to keep from table 'X4'
X14 <- X4[, vars4]
str(X14)

# dbRDA restricted to the selected variables
res4 <- capscale(Y4 ~ ., data = X14, distance = 'bray')
# Summary of the results
summary(res4, display = NULL)
# Permutation test of statistical significance
anova(res4)

# Data frames used by the plot: site scores joined to the sample data,
# species scores joined to the taxonomy, and the biplot arrows
sit4 <- cbind(dat4, scores(res4, display = 'sites'))
spp4 <- cbind(data.frame(tax_table(DNA_functions.relative)), scores(res4, display = 'species'))
vec4 <- data.frame(scores(res4, display = 'bp'))

# Multipliers controlling arrow length and arrow-label position
adj.vec <- 2.0
adj.txt <- 2.2

# Sites coloured and shaped by water colour, with constraint arrows on top.
# The plot is stored in `p2`, the same name the RNA plot uses.
# NOTE(review): the label vector is hard-coded and presumably has to follow the
# row order of `vec4` - confirm after any change in the selected variables.
p2 <- ggplot(sit4, aes(x = CAP1, y = CAP2, color = Water_color, shape = Water_color)) +
  scale_colour_manual(
    values = c("Black_water" = "gray25", "Clear_water" = "lightskyblue", "White_water" = "burlywood")
  ) +
  geom_point(size = 3) +
  geom_segment(
    data = vec4, inherit.aes = FALSE, alpha = 0.5,
    mapping = aes(x = 0, y = 0, xend = adj.vec * CAP1, yend = adj.vec * CAP2),
    arrow = arrow(length = unit(0.2, 'cm'))
  ) +
  geom_text(
    data = vec4, inherit.aes = FALSE, cex = 0.0001,
    mapping = aes(
      x = adj.txt * CAP1, y = adj.txt * CAP2,
      label = c('Ca', 'pH', 'Fe', 'DOC_Abs254/365', 'K', 'Cd', 'Silicate', 'Mn')
    )
  ) +
  theme_bw()
# Axis titles taken from the ordination
p2$labels$x <- ord_labels(res4)[1]
p2$labels$y <- ord_labels(res4)[2]
p2


### RANDOM FOREST TO DETECT ASVS ASSOCIATED TO WATER COLORS (WITHOUT COMBINING WHITE/CLEAR)

library(randomForest)

# Format data for random forest: DNA, original Water_color classes
new_DNA1 <- new_DNA

## Implement Random forest algorithm DNA1
# Prepare data for RF: keep the taxa whose mean count per sample exceeds
# prunescale * minlib
ntaxa(new_DNA1)
prunescale <- 0.001
minlib <- 5000
tax.mean <- taxa_sums(new_DNA1)/nsamples(new_DNA1)
all_muc_prune <- prune_taxa(tax.mean > prunescale*minlib, new_DNA1)
ntaxa(all_muc_prune)
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
rf.data <- data.frame(response, predictors)
# Compute RF. Implements Breiman's random forest algorithm (based on Breiman and Cutler's original Fortran code) for classification and regression.
set.seed(2)
all_muc.classify <- randomForest(response~., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)
# Rank the predictors by importance (mean decrease in Gini coefficient, a
# measure of node purity) and build a phyloseq object restricted to the 40
# top-ranked predictors.
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# head() stops at the available rows, whereas `[1:40, ]` pads the table with
# all-NA rows when fewer than 40 predictors exist.
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file="/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_DNA1_NOT_COMBINED.txt")
otunames <- imp.50$predictors
r <- rownames(tax_table(new_DNA1)) %in% otunames
purif_otus <- tax_table(new_DNA1)[r, ]
write.table(purif_otus, file="/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_DNA1_NOT_COMBINED.txt")
RF_DNA1 <- subset_taxa(new_DNA1, row.names(tax_table(new_DNA1)) %in% row.names(purif_otus))

# Format data for random forest: RNA, original Water_color classes
new_RNA1 <- new_RNA
## Implement Random forest algorithm RNA1
# Prepare data for RF: keep the taxa whose mean count per sample exceeds
# prunescale * minlib
ntaxa(new_RNA1)
prunescale <- 0.001
minlib <- 5000
tax.mean <- taxa_sums(new_RNA1)/nsamples(new_RNA1)
all_muc_prune <- prune_taxa(tax.mean > prunescale*minlib, new_RNA1)
ntaxa(all_muc_prune)
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
rf.data <- data.frame(response, predictors)
# Compute RF. Implements Breiman's random forest algorithm (based on Breiman and Cutler's original Fortran code) for classification and regression.
set.seed(2)
all_muc.classify <- randomForest(response~., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)
# Rank the predictors by importance (mean decrease in Gini coefficient, a
# measure of node purity) and build a phyloseq object restricted to the 40
# top-ranked predictors.
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# head() stops at the available rows, whereas `[1:40, ]` pads the table with
# all-NA rows when fewer than 40 predictors exist.
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file="/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_RNA1_NOT_COMBINED.txt")
otunames <- imp.50$predictors
r <- rownames(tax_table(new_RNA1)) %in% otunames
purif_otus <- tax_table(new_RNA1)[r, ]
write.table(purif_otus, file="/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_RNA1_NOT_COMBINED.txt")
RF_RNA1 <- subset_taxa(new_RNA1, row.names(tax_table(new_RNA1)) %in% row.names(purif_otus))

# Format data for random forest: DETAILED FUNCTIONS, original Water_color classes
DNA_functions1 <- DNA_functions
## Implement Random forest algorithm (detailed functions)
# Prepare data for RF: keep the taxa whose mean count per sample exceeds
# prunescale * minlib
ntaxa(DNA_functions1)
prunescale <- 0.001
minlib <- 5000
tax.mean <- taxa_sums(DNA_functions1)/nsamples(DNA_functions1)
all_muc_prune <- prune_taxa(tax.mean > prunescale*minlib, DNA_functions1)
ntaxa(all_muc_prune)
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
rf.data <- data.frame(response, predictors)
# Compute RF. Implements Breiman's random forest algorithm (based on Breiman and Cutler's original Fortran code) for classification and regression.
set.seed(2)
all_muc.classify <- randomForest(response~., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)
# Rank the predictors by importance (mean decrease in Gini coefficient, a
# measure of node purity) and build a phyloseq object restricted to the 40
# top-ranked predictors.
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# head() stops at the available rows, whereas `[1:40, ]` pads the table with
# all-NA rows when fewer than 40 predictors exist.
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file="/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_DNA_functions_NOT_COMBINED.txt")
otunames <- imp.50$predictors
r <- rownames(tax_table(DNA_functions1)) %in% otunames
purif_otus <- tax_table(DNA_functions1)[r, ]
write.table(purif_otus, file="/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_DNA_functions_NOT_COMBINED.txt")
RF_functions1 <- subset_taxa(DNA_functions1, row.names(tax_table(DNA_functions1)) %in% row.names(purif_otus))

# Export the abundance, taxonomy and sample-metadata tables of each
# random-forest-selected phyloseq object (DNA, RNA, functions) to text files.

## DNA
otu_table_RF_DNA1 <- as.data.frame(otu_table(RF_DNA1))
write.table(
  otu_table_RF_DNA1,
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_otu_DNA_NOT_COMBINED.txt"
)
tax_table_RF_DNA1 <- as.data.frame(tax_table(RF_DNA1))
write.table(
  tax_table_RF_DNA1,
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_DNA_NOT_COMBINED.txt"
)
meta_table_RF_DNA1 <- as.data.frame(sample_data(RF_DNA1))
write.table(
  meta_table_RF_DNA1,
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_meta_DNA_NOT_COMBINED.txt"
)

## RNA
otu_table_RF_RNA1 <- as.data.frame(otu_table(RF_RNA1))
write.table(
  otu_table_RF_RNA1,
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_otu_RNA_NOT_COMBINED.txt"
)
tax_table_RF_RNA1 <- as.data.frame(tax_table(RF_RNA1))
write.table(
  tax_table_RF_RNA1,
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_RNA_NOT_COMBINED.txt"
)
meta_table_RF_RNA1 <- as.data.frame(sample_data(RF_RNA1))
write.table(
  meta_table_RF_RNA1,
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_meta_RNA_NOT_COMBINED.txt"
)

## Functions
otu_table_RF_functions <- as.data.frame(otu_table(RF_functions1))
write.table(
  otu_table_RF_functions,
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_otu_functions_NOT_COMBINED.txt"
)
tax_table_RF_functions <- as.data.frame(tax_table(RF_functions1))
write.table(
  tax_table_RF_functions,
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_functions_NOT_COMBINED.txt"
)
meta_table_RF_functions <- as.data.frame(sample_data(RF_functions1))
write.table(
  meta_table_RF_functions,
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_meta_functions_NOT_COMBINED.txt"
)

# The exported OTU tables were manually reformatted outside of R so that the
# ASV abundance table also carries the water type and the ASV taxonomy. The
# files read below are those hand-edited versions.

# Build heatmaps: one per dataset, rows scaled, no clustering/reordering of
# rows or columns, on a 25-step grey palette.
library(RColorBrewer)
coul <- colorRampPalette(brewer.pal(8, "Greys"))(25)

## DNA
# NOTE(review): this file name follows a different pattern ("RF_combined_otu_tax_...")
# from the RNA and functions files ("combined_RF_otu_...") - confirm it is the
# intended file.
combined_RF_DNA <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_combined_otu_tax_DNA_NOT_COMBINED.txt",
  header = TRUE,
  row.names = 1,
  check.names = FALSE
)
combined_RF_DNA_mat <- as.matrix(combined_RF_DNA)
heatmap(
  combined_RF_DNA_mat,
  scale = "row",
  Rowv = NA,
  Colv = NA,
  cexCol = 0.5,
  cexRow = 0.75,
  margins = c(11, 11),
  col = coul
)

## RNA
combined_RF_RNA <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/combined_RF_otu_RNA_NOT_COMBINED.txt",
  header = TRUE,
  row.names = 1,
  check.names = FALSE
)
combined_RF_RNA_mat <- as.matrix(combined_RF_RNA)
heatmap(
  combined_RF_RNA_mat,
  scale = "row",
  Rowv = NA,
  Colv = NA,
  cexCol = 0.5,
  cexRow = 0.75,
  margins = c(11, 11),
  col = coul
)

## Functions
combined_RF_functions <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/combined_RF_otu_functions_NOT_COMBINED.txt",
  header = TRUE,
  row.names = 1,
  check.names = FALSE
)
combined_RF_functions_mat <- as.matrix(combined_RF_functions)
heatmap(
  combined_RF_functions_mat,
  scale = "row",
  Rowv = NA,
  Colv = NA,
  cexCol = 0.5,
  cexRow = 0.75,
  margins = c(11, 11),
  col = coul
)

#### Alpha diversity (Chao1 and Shannon) by water colour, points coloured by site
# NOTE(review): new_DNA / new_RNA have had taxa with a total of <= 1 read
# removed (see the filter_taxa() calls at the top of the file); Chao1 depends
# on singleton counts, so confirm that estimating it on filtered data is
# intended.

## DNA
DNA_div <- plot_richness(
  new_DNA,
  x = "Water_color",
  color = "Site",
  measures = c("Chao1", "Shannon")
)
DNA_div + geom_point(size = 3, alpha = 0.7)

## RNA
RNA_div <- plot_richness(
  new_RNA,
  x = "Water_color",
  color = "Site",
  measures = c("Chao1", "Shannon")
)
RNA_div + geom_point(size = 3, alpha = 0.7)

#### Stacked barplots of relative abundance at Phylum level
# position = "fill" rescales each bar to 1, so the bars show proportions per
# water colour.

## DNA: agglomerate to Phylum, melt to long format, plot
DNA.aglo <- tax_glom(new_DNA, taxrank = "Phylum")
DNA.dataframe <- psmelt(DNA.aglo)
ggplot(DNA.dataframe, aes(x = Water_color, y = Abundance, fill = Phylum)) +
  geom_bar(stat = "identity", position = "fill") +
  ggtitle("Taxonomic structure, Phylum Level")

## RNA: same procedure
RNA.aglo <- tax_glom(new_RNA, taxrank = "Phylum")
RNA.dataframe <- psmelt(RNA.aglo)
ggplot(RNA.dataframe, aes(x = Water_color, y = Abundance, fill = Phylum)) +
  geom_bar(stat = "identity", position = "fill") +
  ggtitle("Transcriptional activity, Phylum Level")
#+ facet_grid(~Water_color, scale="free")


### Recompute ordisurf with RDA instead of NMDS

# ordisurf DNA: fit a smooth surface of an environmental variable over the
# site scores of the ordination `res2` (defined earlier in the file) and draw
# it as contour lines underneath the samples.

meta_new_DNA <- meta(new_DNA)
groups <- meta_new_DNA$Water_color # grouping information from the metadata

sit1 <- scores(res2, display = "sites")

# Site coordinates on the first two ordination axes plus the grouping factor.
df <- data.frame(x = sit1[, 1], y = sit1[, 2], Groups = groups)
# Add a dummy variable corresponding to the selected environmental variable.
# (FES: maybe his format is better.)
# NOTE(review): the RNA section further down uses `humic_like_DOC` while this
# one uses `Humic.tot`; both metadata tables come from the same phyloseq
# object, so confirm that both columns exist and that the difference is
# intended.
meta_new_DNA$var <- meta_new_DNA$Humic.tot

# Fit a surface for the selected variable onto the ordination.
ordi <- vegan::ordisurf(res2, meta_new_DNA$var, plot = FALSE, bs = "ds")
ordi.grid <- ordi$grid # grid of the fitted surface (a list with x, y and z)
# str(ordi.grid) # it is a list, so it cannot be plotted as is
ordi.mite <- expand.grid(x = ordi.grid$x, y = ordi.grid$y) # all x/y pairs
ordi.mite$z <- as.vector(ordi.grid$z) # unravel the z matrix to match the pairs
ordi.mite.na <- data.frame(na.omit(ordi.mite)) # drop grid cells without a value

# Make the plot.
# `position` was previously misspelled as `positon`, which ggplot2 ignores as
# an unknown parameter (with a warning); "identity" is the intended value.
# The binwidth can be changed depending on how many contours are wanted.
p_DNA <- ggplot2::ggplot() +
  stat_contour(
    data = ordi.mite.na,
    aes(x = x, y = y, z = z, colour = ..level..),
    size = 1.8,
    position = "identity"
  )
p_DNA <- p_DNA +
  ggplot2::geom_point(data = df, aes(x, y, fill = Groups), pch = 21, size = 4)
p_DNA <- p_DNA +
  ggplot2::scale_fill_manual(values = c("gray25", "lightskyblue", "burlywood"))
# High and low ends of the contour colour scale. Delete this layer to go back
# to the default blue scale, or specify other colours.
p_DNA <- p_DNA +
  ggplot2::scale_colour_continuous(high = "red4", low = "yellow2")
#p<-p+ ggplot2::labs(colour = paste(env.variable)) #another way to set the labels, in this case, for the colour legend
p_DNA <- p_DNA + ggplot2::theme_bw()
p_DNA


# ordisurf RNA: fit a smooth surface of an environmental variable over the
# site scores of the ordination `res3` (defined earlier in the file) and draw
# it as contour lines underneath the samples.

meta_new_RNA <- meta(new_RNA)
groups <- meta_new_RNA$Water_color # grouping information from the metadata

sit2 <- scores(res3, display = "sites")

# Site coordinates on the first two ordination axes plus the grouping factor.
df <- data.frame(x = sit2[, 1], y = sit2[, 2], Groups = groups)
# Add a dummy variable corresponding to the selected environmental variable.
# (FES: maybe his format is better.)
# NOTE(review): the DNA section above uses `Humic.tot` while this one uses
# `humic_like_DOC`; both metadata tables come from the same phyloseq object,
# so confirm that both columns exist and that the difference is intended.
meta_new_RNA$var <- meta_new_RNA$humic_like_DOC

# Fit a surface for the selected variable onto the ordination.
ordi <- vegan::ordisurf(res3, meta_new_RNA$var, plot = FALSE, bs = "ds")
ordi.grid <- ordi$grid # grid of the fitted surface (a list with x, y and z)
# str(ordi.grid) # it is a list, so it cannot be plotted as is
ordi.mite <- expand.grid(x = ordi.grid$x, y = ordi.grid$y) # all x/y pairs
ordi.mite$z <- as.vector(ordi.grid$z) # unravel the z matrix to match the pairs
ordi.mite.na <- data.frame(na.omit(ordi.mite)) # drop grid cells without a value

# Make the plot.
# `position` was previously misspelled as `positon`, which ggplot2 ignores as
# an unknown parameter (with a warning); "identity" is the intended value.
# The binwidth can be changed depending on how many contours are wanted.
p_RNA <- ggplot2::ggplot() +
  stat_contour(
    data = ordi.mite.na,
    aes(x = x, y = y, z = z, colour = ..level..),
    size = 1.8,
    position = "identity"
  )
p_RNA <- p_RNA +
  ggplot2::geom_point(data = df, aes(x, y, fill = Groups), pch = 21, size = 4)
p_RNA <- p_RNA +
  ggplot2::scale_fill_manual(values = c("gray25", "lightskyblue", "burlywood"))
# High and low ends of the contour colour scale. Delete this layer to go back
# to the default blue scale, or specify other colours.
p_RNA <- p_RNA +
  ggplot2::scale_colour_continuous(high = "red4", low = "yellow2")
#p<-p+ ggplot2::labs(colour = paste(env.variable)) #another way to set the labels, in this case, for the colour legend
p_RNA <- p_RNA + ggplot2::theme_bw()
p_RNA

#### Multivariate dispersion (betadisper) per water colour
# For each dataset: compute Bray-Curtis distances between samples, measure the
# dispersion of each Water_color group with betadisper(), then run a
# permutation test (1000 permutations) including pairwise group comparisons.

## DNA
Distances_DNA <- phyloseq::distance(new_DNA, method = "bray")
metadf_DNA <- data.frame(sample_data(new_DNA))
disp.bray.watercolor.DNA <- betadisper(
  Distances_DNA,
  metadf_DNA$Water_color
)
permutest(
  disp.bray.watercolor.DNA,
  pairwise = TRUE,
  permutations = 1000
)

## RNA
Distances_RNA <- phyloseq::distance(new_RNA, method = "bray")
metadf_RNA <- data.frame(sample_data(new_RNA))
disp.bray.watercolor.RNA <- betadisper(
  Distances_RNA,
  metadf_RNA$Water_color
)
permutest(
  disp.bray.watercolor.RNA,
  pairwise = TRUE,
  permutations = 1000
)

## Functions
Distances_functions <- phyloseq::distance(functions_phyloseq, method = "bray")
metadf_functions <- data.frame(sample_data(functions_phyloseq))
disp.bray.watercolor.functions <- betadisper(
  Distances_functions,
  metadf_functions$Water_color
)
permutest(
  disp.bray.watercolor.functions,
  pairwise = TRUE,
  permutations = 1000
)