### Load the packages ----
# install.packages("tidyr")
# install_github("jfq3/QsRutils", force = TRUE)
library(QsRutils)
# library(dada2)
library(DESeq2)
# library(phangorn)
# library(plotly)
# library(gplots)
library(decontam)
library(DescTools)
library(multtest)
library(foreach)
library(stringi)
library(Hmisc)
library(parallel)
library(iterators)
library(phyloseq)
library(ape)
library(ggplot2)
library(plyr)
library(gridExtra)
library(lsmeans)
library(multcompView)
library(vegan)
library(dplyr)
library(devtools)
library(metagMisc)
library(metagenomeSeq)
library(microbiome)
library(btools)
library(plyr)
library(reshape2)
library(theseus)
# library(phangorn)
library(microbiomeSeq)
library(tidyr)
# library(DECIPHER)

## Load the new dataset (expected to provide `my_phyloseq_filt`)
load("/Users/francois-etiennesylvain/Documents/Doctorat/full_decont_phyloseq.RData")

## Remove ASVs whose total count is 1 read or less (singletons and empty ASVs)
my_phyloseq_filt_no_single <- filter_taxa(
  my_phyloseq_filt,
  function(x) sum(x) > 1,
  prune = TRUE
)

## Add ecosystem variable
meta_my_phyloseq_filt_no_single <- sample_data(my_phyloseq_filt_no_single)
# write.table(meta_my_phyloseq_filt_no_single, file = "/Users/francois-etiennesylvain/Documents/Doctorat/meta_bacterioplankton_extract.txt")
ecosystem_variable <- read.table(
  "/Users/francois-etiennesylvain/Documents/Doctorat/meta_bacterioplankton_extract_mod.txt",
  sep = "\t",
  header = TRUE
)
# The Ecosystem column is attached by position, so the edited table must have
# one row per sample. NOTE(review): row ORDER is assumed to match the sample
# order of the phyloseq object -- it cannot be verified from here.
stopifnot(
  ncol(ecosystem_variable) >= 11,
  nrow(ecosystem_variable) == nsamples(my_phyloseq_filt_no_single)
)
ecosystem_variable1 <- ecosystem_variable[, 11]
sample_data(my_phyloseq_filt_no_single)$Ecosystem <- unlist(ecosystem_variable1)

## Separate DNA from RNA in new dataset
new_DNA <- subset_samples(my_phyloseq_filt_no_single, TYPE %in% "DNA")
new_RNA <- subset_samples(my_phyloseq_filt_no_single, TYPE %in% "RNA")

### After separation, remove once again ASVs with 1 read or less
new_DNA <- filter_taxa(new_DNA, function(x) sum(x) > 1, prune = TRUE)
new_RNA <- filter_taxa(new_RNA, function(x) sum(x) > 1, prune = TRUE)

## Normalize data (counts -> per-sample relative abundance)
new_DNA.relative <- transform_sample_counts(new_DNA, function(otu) otu / sum(otu))
new_RNA.relative <- transform_sample_counts(new_RNA, function(otu) otu / sum(otu))
# my_phyloseq_filt_no_single.relative <- transform_sample_counts(my_phyloseq_filt_no_single.relative, function(otu) otu/sum(otu))

### Polynucleobacter barplot ----
poly.DNA <- subset_taxa(new_DNA.relative, Genus == "Polynucleobacter")
poly.RNA <- subset_taxa(new_RNA.relative, Genus == "Polynucleobacter")

#' Stacked bar plot of a phyloseq object without black segment outlines
#'
#' Melts `physeq` with `psmelt()` and draws stacked bars.
#'
#' @param physeq A phyloseq object.
#' @param x,y Names (strings) of the melted columns mapped to the axes.
#' @param fill Name (string) of the column mapped to the fill, or `NULL`.
#' @param title Optional plot title.
#' @param facet_grid Optional faceting formula passed to `facet_grid()`.
#' @return A ggplot object.
my_plot_bar <- function(physeq, x = "Sample", y = "Abundance", fill = NULL,
                        title = NULL, facet_grid = NULL) {
  mdf <- psmelt(physeq)
  p <- ggplot(mdf, aes_string(x = x, y = y, fill = fill)) +
    geom_bar(stat = "identity", position = "stack") +
    theme(axis.text.x = element_text(angle = -90, hjust = 0))
  if (!is.null(facet_grid)) {
    # The argument shadows the ggplot2 function name; R still resolves the
    # function in call position, the namespace prefix only makes it explicit.
    p <- p + ggplot2::facet_grid(facet_grid)
  }
  if (!is.null(title)) {
    p <- p + ggtitle(title)
  }
  p
}

poly.DNA_merged <- merge_samples(poly.DNA, "Water_color")
my_plot_bar(poly.DNA_merged, fill = "Water_color") # + scale_fill_manual(values = palette_2)
a <- rowSums(otu_table(poly.DNA_merged))

poly.RNA_merged <- merge_samples(poly.RNA, "Water_color")
my_plot_bar(poly.RNA_merged, fill = "Water_color") # + scale_fill_manual(values = palette_2)
a <- rowSums(otu_table(poly.RNA_merged))

### Ecosystem variable ----
# PERMANOVA on Bray-Curtis distances. adonis2() replaces adonis(), which is
# deprecated and no longer available in recent vegan releases.

# In black waters
new_DNA_black <- subset_samples(new_DNA, Water_color %in% c("Black_water"))
metadata <- as(sample_data(new_DNA_black), "data.frame")
adonis2(phyloseq::distance(new_DNA_black, method = "bray") ~ Ecosystem, data = metadata)

new_RNA_black <- subset_samples(new_RNA, Water_color %in% c("Black_water"))
metadata <- as(sample_data(new_RNA_black), "data.frame")
adonis2(phyloseq::distance(new_RNA_black, method = "bray") ~ Ecosystem, data = metadata)

# In white waters
new_DNA_white <- subset_samples(new_DNA, Water_color %in% c("White_water"))
metadata <- as(sample_data(new_DNA_white), "data.frame")
adonis2(phyloseq::distance(new_DNA_white, method = "bray") ~ Ecosystem, data = metadata)

new_RNA_white <- subset_samples(new_RNA, Water_color %in% c("White_water"))
metadata <- as(sample_data(new_RNA_white), "data.frame")
adonis2(phyloseq::distance(new_RNA_white, method = "bray") ~ Ecosystem, data = metadata)

# In all waters
metadata <- as(sample_data(new_DNA), "data.frame")
adonis2(phyloseq::distance(new_DNA, method = "bray") ~ Ecosystem, data = metadata)

metadata <- as(sample_data(new_RNA), "data.frame")
adonis2(phyloseq::distance(new_RNA, method = "bray") ~ Ecosystem, data = metadata)

### PCoA - Bray-Curtis distance ----

# Build the water-colour PCoA plot for one dataset; `label` is appended to the
# title so each figure names the dataset it shows.
bray_pcoa_plot <- function(physeq, ordination, label) {
  plot_ordination(physeq, ordination, color = "Water_color", shape = "Water_color") +
    geom_point(size = 5, alpha = 0.75) +
    scale_colour_brewer(type = "qual", palette = "Set1") +
    ggtitle(paste("PCoA - BrayCurtis -", label)) +
    stat_ellipse(geom = "polygon", type = "norm", alpha = 0.04, aes(fill = Water_color)) +
    theme_bw()
}

ordu <- ordinate(new_DNA, "PCoA", "bray")
p <- bray_pcoa_plot(new_DNA, ordu, "new_DNA")
p

ordu <- ordinate(new_RNA, "PCoA", "bray")
p <- bray_pcoa_plot(new_RNA, ordu, "new_RNA")
p

ordu <- ordinate(my_phyloseq_filt_no_single, "PCoA", "bray")
p <- bray_pcoa_plot(my_phyloseq_filt_no_single, ordu, "my_phyloseq_filt_no_single")
p

### Tests ADONIS (PERMANOVA on water colour) ----
metadata <- as(sample_data(new_DNA), "data.frame")
adonis2(phyloseq::distance(new_DNA, method = "bray") ~ Water_color, data = metadata)

metadata <- as(sample_data(new_RNA), "data.frame")
adonis2(phyloseq::distance(new_RNA, method = "bray") ~ Water_color, data = metadata)

metadata <- as(sample_data(my_phyloseq_filt_no_single), "data.frame")
adonis2(phyloseq::distance(my_phyloseq_filt_no_single, method = "bray") ~ Water_color, data = metadata)

### NMDS - Bray-Curtis distance ----
# Ordinate
# The combined relative-abundance object is never created earlier in the
# script (its only definition is commented out and self-referential), so it is
# built here from the filtered counts before being ordinated.
my_phyloseq_filt_no_single.relative <- transform_sample_counts(
  my_phyloseq_filt_no_single,
  function(otu) otu / sum(otu)
)

new_DNA.nmds.bray <- ordinate(new_DNA.relative, method = "NMDS", distance = "bray")
# NOTE(review): despite the ".bray" suffix this RNA ordination uses weighted
# UniFrac. The computation is kept as written; the plot titles below are
# corrected so the figures are labelled truthfully. Confirm the metric wanted.
new_RNA.nmds.bray <- ordinate(new_RNA.relative, method = "NMDS", distance = "unifrac", weighted = TRUE)
my_phyloseq_filt_no_single.nmds.bray <- ordinate(
  my_phyloseq_filt_no_single.relative,
  method = "NMDS",
  distance = "bray"
)

# Plot ordination by site and water color
plot_ordination(new_DNA.relative, new_DNA.nmds.bray,
                color = "Site", shape = "Water_color", title = "Bray NMDS by sample")
plot_ordination(new_RNA.relative, new_RNA.nmds.bray,
                color = "Site", shape = "Water_color", title = "Weighted UniFrac NMDS by sample")
plot_ordination(my_phyloseq_filt_no_single.relative, my_phyloseq_filt_no_single.nmds.bray,
                color = "Site", shape = "Water_color", title = "Bray NMDS by sample")

# Plot ordination by water color
# The original calls referenced `DNA.relative` / `ordination_DNA.nmds.bray`,
# names this script never creates, and repeated the same call four times.
# They now use the objects computed above, one plot per dataset.
plot_ordination(new_DNA.relative, new_DNA.nmds.bray,
                color = "Water_color", title = "Bray NMDS by sample") +
  stat_ellipse()
plot_ordination(new_RNA.relative, new_RNA.nmds.bray,
                color = "Water_color", title = "Weighted UniFrac NMDS by sample") +
  stat_ellipse(level = 0.90)
plot_ordination(my_phyloseq_filt_no_single.relative, my_phyloseq_filt_no_single.nmds.bray,
                color = "Water_color", title = "Bray NMDS by sample") +
  stat_ellipse()

### Pairwise black versus white ----
new_DNA_blackwhite <- subset_samples(new_DNA, Water_color %in% c("Black_water", "White_water"))
new_RNA_blackwhite <- subset_samples(new_RNA, Water_color %in% c("Black_water", "White_water"))

# adonis2() replaces adonis(), which is deprecated and no longer available in
# recent vegan releases.
metadata <- as(sample_data(new_DNA_blackwhite), "data.frame")
adonis2(phyloseq::distance(new_DNA_blackwhite, method = "bray") ~ Water_color, data = metadata)

metadata <- as(sample_data(new_RNA_blackwhite), "data.frame")
adonis2(phyloseq::distance(new_RNA_blackwhite, method = "bray") ~ Water_color, data = metadata)

### Network ----
# Select the most abundant ASVs (to speed the correlation calculation)
Abund_DNA <- filter_taxa(new_DNA.relative, function(x) mean(x) > 0.0001, TRUE)

### Export bacterioplankton ASV/OTU tables and tax tables in .csv files
Export_ASV_table_DNA <- as.data.frame(otu_table(Abund_DNA))
write.csv(Export_ASV_table_DNA, file = "/Users/francois-etiennesylvain/Documents/Doctorat/new_ASV_table_abund_bacterioplankton_DNA.csv")
Export_tax_table_DNA <- as.data.frame(tax_table(Abund_DNA))
write.csv(Export_tax_table_DNA, file = "/Users/francois-etiennesylvain/Documents/Doctorat/new_Tax_table_abund_bacterioplankton_DNA.csv")
Export_metadata_table_DNA <- as.data.frame(sample_data(Abund_DNA))
write.csv(Export_metadata_table_DNA, file = "/Users/francois-etiennesylvain/Documents/Doctorat/new_Metadata_table_abund_bacterioplankton_DNA.csv")

### Compute Spearman correlations.
DNA.network <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/new_Combined_ASV_table_abund_bacterioplankton_DNA.txt",
  header = TRUE,
  row.names = 1
)

## Correct the dataframe format: coerce the first 1884 columns (the number of
## columns, i.e. ASVs, in the ASV table) to numeric. vapply() guarantees a
## numeric matrix whatever the input looks like.
DNA.network <- vapply(DNA.network[1:1884], as.numeric, numeric(nrow(DNA.network)))

# Calculate Spearman correlation values
corr_DNA <- rcorr(DNA.network, type = "spearman")

# Extract, correct and filter p.values + correlations.
# Only the lower triangle is kept so every pair is listed once.
gene <- colnames(corr_DNA$P)
Sample.comparison <- paste(gene[row(corr_DNA$P)], gene[col(corr_DNA$P)], sep = ".")
i <- lower.tri(corr_DNA$P)
Dat <- data.frame(Sample.comparison[i], p.value = corr_DNA$P[i])
Dat$Bonferroni <- p.adjust(Dat$p.value, method = "bonferroni")

gene <- colnames(corr_DNA$r)
Sample.comparison <- paste(gene[row(corr_DNA$r)], gene[col(corr_DNA$r)], sep = ".")
i <- lower.tri(corr_DNA$r)
# `Dat$Bonferroni` becomes the column `Dat.Bonferroni` used by the filter below.
Correlation <- data.frame(Sample.comparison[i], correlation.spearman = corr_DNA$r[i], Dat$Bonferroni)
Filt_Bonferroni <- subset(Correlation, Dat.Bonferroni < 0.05 & correlation.spearman > 0.6)
write.csv(Filt_Bonferroni, file = "/Users/francois-etiennesylvain/Documents/Doctorat/new_combined_p_values_Network_abund_DNA.csv")

### Make the list of functions found in Polynucleobacter sinensis from the
### total list of KEGG pathways possible + the KEGG pathways found in
### P. sinensis
KEGG_ALL <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/KEGG_ALL.txt", header = TRUE)
KEGG_POLY <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/KEGG_POLY.txt", header = TRUE)
# unique(df, by = ...) silently ignores `by` on a data frame (it is a
# data.table argument) and only drops fully identical rows. Deduplicate on the
# identifier column explicitly instead.
KEGG_ALL_unique <- KEGG_ALL[!duplicated(KEGG_ALL$KEGG_ID), , drop = FALSE]
KEGG_POLY_unique <- KEGG_POLY[!duplicated(KEGG_POLY$KEGG_POLY), , drop = FALSE]
POLY_list <- subset(KEGG_ALL_unique, KEGG_ID %in% KEGG_POLY_unique$KEGG_POLY)
write.csv(POLY_list, file = "/Users/francois-etiennesylvain/Documents/Doctorat/POLY_list.csv")

### Now check which of the lignin degradation pathways are found in the overall dataset
ALL_LIGNIN_DEGRAD <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/lignin_degradation_pathways.txt", header = TRUE)
ALL_MY_PATHWAYS <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/DNARNA_KOtable_GENUS_wabundance.tsv", header = TRUE)
MY_LIGNIN <- subset(ALL_MY_PATHWAYS, KO_ID %in% ALL_LIGNIN_DEGRAD$KEGG_ID)
write.csv(MY_LIGNIN, file = "/Users/francois-etiennesylvain/Documents/Doctorat/MY_LIGNIN_PATHWAYS.csv")

### If we assume that pathways actively playing a role in HUMIC degradation
### should be correlated to humic abundance, let's make a list of these
### pathways...
MY_LIGNIN_HUMIC_DNA <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/MY_LIGNIN_HUMIC_DNA.txt",
  header = TRUE,
  row.names = 1
)
MY_LIGNIN_HUMIC_RNA <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/MY_LIGNIN_HUMIC_RNA.txt",
  header = TRUE,
  row.names = 1
)

# We remove columns with a sum of zero (possible because DNA and RNA samples
# were computed together at first and then separated). drop = FALSE keeps a
# data frame even if a single column survives.
MY_LIGNIN_HUMIC_DNA <- MY_LIGNIN_HUMIC_DNA[, which(colSums(MY_LIGNIN_HUMIC_DNA) != 0), drop = FALSE]
MY_LIGNIN_HUMIC_RNA <- MY_LIGNIN_HUMIC_RNA[, which(colSums(MY_LIGNIN_HUMIC_RNA) != 0), drop = FALSE]

# Correlations DNA samples ----
# NOTE(review): 481 is the hard-coded number of columns kept for the DNA
# table; re-check it whenever the input file changes.
MY_LIGNIN_HUMIC_DNA <- vapply(MY_LIGNIN_HUMIC_DNA[1:481], as.numeric, numeric(nrow(MY_LIGNIN_HUMIC_DNA)))
corr_DNA <- rcorr(MY_LIGNIN_HUMIC_DNA, type = "spearman")

# Lower triangle only: each pair of columns is listed once.
gene <- colnames(corr_DNA$P)
Sample.comparison <- paste(gene[row(corr_DNA$P)], gene[col(corr_DNA$P)], sep = ".")
i <- lower.tri(corr_DNA$P)
Dat <- data.frame(Sample.comparison[i], p.value = corr_DNA$P[i])
Dat$Bonferroni <- p.adjust(Dat$p.value, method = "bonferroni")

gene <- colnames(corr_DNA$r)
Sample.comparison <- paste(gene[row(corr_DNA$r)], gene[col(corr_DNA$r)], sep = ".")
i <- lower.tri(corr_DNA$r)
Correlation <- data.frame(Sample.comparison[i], correlation.spearman = corr_DNA$r[i], Dat$Bonferroni)

# The DNA table is exported unfiltered (no p-value / correlation threshold),
# minus the perfect correlations. `%in% TRUE` treats an NA correlation as
# "not perfect": the original `df[!(r == 1), ]` indexed with NA and replaced
# those rows by all-NA rows, losing the pair label.
Filt_Bonferroni <- Correlation
is_perfect <- (Filt_Bonferroni$correlation.spearman == 1) %in% TRUE
Filt_Bonferroni <- Filt_Bonferroni[!is_perfect, ]
write.csv(Filt_Bonferroni, file = "/Users/francois-etiennesylvain/Documents/Doctorat/combined_p_values_HUMIC_PATHWAYS_DNA.csv")

# Correlations RNA samples ----
# NOTE(review): 580 is the hard-coded number of columns kept for the RNA
# table. The result is stored in `corr_DNA` again, as in the original script.
MY_LIGNIN_HUMIC_RNA <- vapply(MY_LIGNIN_HUMIC_RNA[1:580], as.numeric, numeric(nrow(MY_LIGNIN_HUMIC_RNA)))
corr_DNA <- rcorr(MY_LIGNIN_HUMIC_RNA, type = "spearman")

gene <- colnames(corr_DNA$P)
Sample.comparison <- paste(gene[row(corr_DNA$P)], gene[col(corr_DNA$P)], sep = ".")
i <- lower.tri(corr_DNA$P)
Dat <- data.frame(Sample.comparison[i], p.value = corr_DNA$P[i])
Dat$Bonferroni <- p.adjust(Dat$p.value, method = "bonferroni")

gene <- colnames(corr_DNA$r)
Sample.comparison <- paste(gene[row(corr_DNA$r)], gene[col(corr_DNA$r)], sep = ".")
i <- lower.tri(corr_DNA$r)
Correlation <- data.frame(Sample.comparison[i], correlation.spearman = corr_DNA$r[i], Dat$Bonferroni)
# RNA pairs are kept only when significant after Bonferroni correction and
# strongly positive.
Filt_Bonferroni <- subset(Correlation, Dat.Bonferroni < 0.05 & correlation.spearman > 0.6)
write.csv(Filt_Bonferroni, file = "/Users/francois-etiennesylvain/Documents/Doctorat/new_combined_p_values_HUMIC_PATHWAYS_RNA.csv")

### Now we will illustrate these results via a heatmap ----
# First we load the matrix
LIGNIN_TAX_FUNC_CORR_DNA <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/LIGNIN_TAX_FUNC_CORR_DNA_MATRIX.txt",
  header = TRUE,
  row.names = 1
)
LIGNIN_TAX_FUNC_CORR_DNA_mat <- as.matrix(as.data.frame(LIGNIN_TAX_FUNC_CORR_DNA))

# Then we build the heatmap
# NOTE(review): the transposed matrix is computed but the heatmap below is
# drawn from the untransposed one -- confirm which orientation is intended.
LIGNIN_TAX_FUNC_CORR_DNA_mat_t <- t(LIGNIN_TAX_FUNC_CORR_DNA_mat)
heatmap(LIGNIN_TAX_FUNC_CORR_DNA_mat, scale = "none", Colv = NA, Rowv = NA,
        cexCol = 1, cexRow = 1.5, margins = c(11, 4))

### We want to understand how different water types affect the clustering of
### the microbiome: structure, expression profile, and functional profile.
### To do so we compute and plot NMDS analyses.
# Pool the samples of each site, then convert pooled counts to proportions.
new_DNA_merged <- merge_samples(new_DNA, group = "Site")
new_RNA_merged <- merge_samples(new_RNA, group = "Site")
to_proportions <- function(otu) otu / sum(otu)
new_DNA_merged.relative <- transform_sample_counts(new_DNA_merged, to_proportions)
new_RNA_merged.relative <- transform_sample_counts(new_RNA_merged, to_proportions)

# If the NMDS does not converge, simply rerun the command until it does.
ordination_DNA.NMDS.unifrac <- ordinate(new_DNA_merged.relative, method = "NMDS", distance = "unifrac")
ordination_RNA.NMDS.unifrac <- ordinate(new_RNA_merged.relative, method = "NMDS", distance = "unifrac")

# Full set of environmental variables fitted onto both ordinations.
all_env_vars <- c(
  "DOC", "DOC_SAC340", "DOC_SUVA254", "DOC_abs254.365", "fulvic_like_DOC",
  "humic_like_DOC", "protein_like_DOC", "Na", "Mg", "K",
  "Ca", "Cl", "Nitrite", "Nitrate", "Silicate", "Chl_a",
  "Pheopigments", "Chla.DOC", "Temperature", "Conductivity",
  "pH", "Humic.tot", "fulvic.tot",
  "Al", "V", "Cr", "Mn", "Fe", "Co",
  "Ni", "Cu", "Zn", "As", "Cd", "Pb"
)
metadf_DNA_filt5 <- meta(new_DNA_merged)
metadata_fit_DNA <- envfit(ordination_DNA.NMDS.unifrac, metadf_DNA_filt5[all_env_vars])
metadf_RNA_filt5 <- meta(new_RNA_merged)
metadata_fit_RNA <- envfit(ordination_RNA.NMDS.unifrac, metadf_RNA_filt5[all_env_vars])

# Water colour of each merged site, written out by hand in the order of the
# merged samples (one entry per site).
site_water_color <- c("Black", "Clear", "Black", "White", "White", "Black", "White", "White",
                      "Black", "White", "White", "Clear", "Clear", "Black", "White")
# One colour per water type, in the alphabetical order used by factor().
water_palette <- c(Black = "gray25", Clear = "lightskyblue", White = "burlywood")

meta_new_DNA <- meta(new_DNA_merged)
meta_new_DNA$Water_color <- site_water_color

# NMDS DNA 10 MOST IMPACTFUL PARAM
plot(ordination_DNA.NMDS.unifrac, type = "n", main = "Unifrac NMDS DNA")
# rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "gray98")
# legend(-3,0, legend=c("Black_water","Clear_water","White_water"), col=c("gray25", "lightskyblue", "burlywood"), pch=20, yjust = 0, y.intersp = 0.15, x.intersp = 0.15, bty = "n")

# Ellipses: one standard-error ellipse (95 %) per water colour, drawn in the
# order Black, Clear, White.
for (water_type in names(water_palette)) {
  ordiellipse(ordination_DNA.NMDS.unifrac, groups = meta_new_DNA$Water_color,
              display = "sites", kind = "se", conf = 0.95, label = FALSE,
              col = water_palette[[water_type]], draw = "polygon", alpha = 150,
              show.groups = c(water_type), border = water_palette[[water_type]],
              lwd = 1)
}
points(ordination_DNA.NMDS.unifrac, pch = 21, lwd = 1, cex = 2, display = "sites",
       col = "black", bg = unname(water_palette)[factor(meta_new_DNA$Water_color)])
# ordiellipse(ordination_DNA.nmds.bray, groups=metadata_df_DNA$Water_color, display="sites", kind="se", conf=0.99, label=FALSE, col="gray", draw="polygon", alpha=100, show.groups = c("NA"), border=FALSE)

# Add fitted variables: refit with the ten variables retained for the DNA
# figure and overlay them on the plot.
dna_top_vars <- c("fulvic_like_DOC", "DOC", "DOC_SAC340", "humic_like_DOC", "DOC_SUVA254",
                  "Chl_a", "Chla.DOC", "pH", "Al", "Pb")
metadf_DNA_filt5 <- meta(new_DNA_merged)
metadata_fit_DNA <- envfit(ordination_DNA.NMDS.unifrac, metadf_DNA_filt5[dna_top_vars])
plot(metadata_fit_DNA, cex = 0.0001, col = "black")

meta_new_RNA <- meta(new_RNA_merged)
meta_new_RNA$Water_color <- site_water_color

# NMDS RNA 10 MOST IMPACTFUL PARAM
plot(ordination_RNA.NMDS.unifrac, type = "n", main = "Unifrac NMDS RNA")
# rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "gray98")
# legend(-3,0, legend=c("Black_water","Clear_water","White_water"), col=c("gray25", "lightskyblue", "burlywood"), pch=20, yjust = 0, y.intersp = 0.15, x.intersp = 0.15, bty = "n")
# ellipses
# Standard-error ellipses (95 %) for each water colour on the RNA ordination.
ordiellipse(ordination_RNA.NMDS.unifrac, groups = meta_new_RNA$Water_color,
            display = "sites", kind = "se", conf = 0.95, label = FALSE,
            col = "gray25", draw = "polygon", alpha = 150,
            show.groups = c("Black"), border = "gray25", lwd = 1)
ordiellipse(ordination_RNA.NMDS.unifrac, groups = meta_new_RNA$Water_color,
            display = "sites", kind = "se", conf = 0.95, label = FALSE,
            col = "lightskyblue", draw = "polygon", alpha = 150,
            border = "lightskyblue", show.groups = c("Clear"), lwd = 1)
ordiellipse(ordination_RNA.NMDS.unifrac, groups = meta_new_RNA$Water_color,
            display = "sites", kind = "se", conf = 0.95, label = FALSE,
            col = "burlywood", draw = "polygon", alpha = 150,
            border = "burlywood", show.groups = c("White"), lwd = 1)
# Fill colours follow the alphabetical factor levels Black, Clear, White.
points(ordination_RNA.NMDS.unifrac, pch = 21, lwd = 1, cex = 2, display = "sites",
       col = "black",
       bg = c("gray25", "lightskyblue", "burlywood")[factor(meta_new_RNA$Water_color)])
# ordiellipse(ordination_RNA.nmds.bray, groups=metadata_df_RNA$Water_color, display="sites", kind="se", conf=0.99, label=FALSE, col="gray", draw="polygon", alpha=100, show.groups = c("NA"), border=FALSE)

# Add fitted variables
metadf_RNA_filt5 <- meta(new_RNA_merged)
metadata_fit_RNA <- envfit(
  ordination_RNA.NMDS.unifrac,
  metadf_RNA_filt5[c("pH", "Cd", "DOC", "Ca", "Chla.DOC", "Conductivity", "Al", "Ni", "Pb", "K")]
)
plot(metadata_fit_RNA, cex = 0.0001, col = "black")

# Before computing NMDS for functions samples, we need to have a functions
# table that we can use for this type of analysis. So, we load a KEGG function
# per genus table to begin with.
KEGG_ALL <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/KEGG_ALL.txt", header = TRUE)
# unique(df, by = ...) silently ignores `by` on a data frame (it is a
# data.table argument) and only drops fully identical rows, so the table is
# deduplicated on KEGG_ID explicitly.
KEGG_ALL_unique <- KEGG_ALL[!duplicated(KEGG_ALL$KEGG_ID), , drop = FALSE]

# The same file is read twice: once with KEGG_ID as a regular column (used to
# select the matching annotations) and once with it as row names (the
# abundance table itself).
function_genus <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/DNARNA_KOtable_GENUS.txt",
  header = TRUE
)
function_genus_true <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/DNARNA_KOtable_GENUS.txt",
  header = TRUE,
  row.names = 1,
  check.names = FALSE
)
metadata_function <- meta(my_phyloseq_filt_no_single)

function_taxonomy <- subset(KEGG_ALL_unique, KEGG_ID %in% function_genus$KEGG_ID)
function_taxonomy <- function_taxonomy[!duplicated(function_taxonomy$KEGG_ID), , drop = FALSE]
# The first column supplies the row names and is then dropped.
rownames(function_taxonomy) <- make.names(function_taxonomy[, 1], unique = TRUE)
function_taxonomy[, 1] <- NULL

functions_phyloseq <- phyloseq(
  otu_table(function_genus_true, taxa_are_rows = TRUE),
  sample_data(metadata_function),
  tax_table(as.matrix(function_taxonomy))
)

## Separate DNA from RNA in the functions dataset
DNA_functions <- subset_samples(functions_phyloseq, TYPE %in% "DNA")
# RNA_functions = subset_samples(functions_phyloseq, TYPE %in% "RNA") # we will use the DNA dataset in this project
DNA_functions_merged <- merge_samples(DNA_functions, group = "Site")
# RNA_functions_merged = merge_samples(new_RNA, group = "Site") # we will use the DNA dataset in this project

# Solution reached! Redo if solution is not reached:
# ordination_functions.NMDS.unifrac <- ordinate(DNA_functions_merged, method="NMDS", distance="bray")
# NOTE: despite the ".unifrac" suffix this ordination uses Bray-Curtis; the
# name is kept because the rest of the script refers to it.
ordination_functions.NMDS.unifrac <- ordinate(DNA_functions_merged, method = "NMDS", distance = "bray")

metadf_functions <- meta(DNA_functions_merged)
metadata_fit_DNA <- envfit(
  ordination_functions.NMDS.unifrac,
  metadf_functions[c(
    "DOC", "DOC_SAC340", "DOC_SUVA254", "DOC_abs254.365", "fulvic_like_DOC",
    "humic_like_DOC", "protein_like_DOC", "Na", "Mg", "K",
    "Ca", "Cl", "Nitrite", "Nitrate", "Silicate", "Chl_a",
    "Pheopigments", "Chla.DOC", "Temperature", "Conductivity",
    "pH", "Humic.tot", "fulvic.tot",
    "Al", "V", "Cr", "Mn", "Fe", "Co",
    "Ni", "Cu", "Zn", "As", "Cd", "Pb"
  )]
)
# Water colour of each merged site, assigned by position.
metadf_functions$Water_color <- c("Black", "Clear", "Black", "White", "White", "Black", "White", "White",
                                  "Black", "White", "White", "Clear", "Clear", "Black", "White")

# NMDS FUNCTIONS 10 MOST IMPACTFUL PARAM
plot(ordination_functions.NMDS.unifrac$points, type = "n", main = "Bray NMDS functions",
     xlim = c(-0.5, 0.35), ylim = c(-0.20, 0.20))
# rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "gray98")
# legend(-3,0, legend=c("Black_water","Clear_water","White_water"), col=c("gray25", "lightskyblue", "burlywood"), pch=20, yjust = 0, y.intersp = 0.15, x.intersp = 0.15, bty = "n")

# Ellipses
ordiellipse(ordination_functions.NMDS.unifrac, groups = metadf_functions$Water_color,
            display = "sites", kind = "se", conf = 0.95, label = FALSE,
            col = "gray25", draw = "polygon", alpha = 150,
            show.groups = c("Black"), border = "gray25", lwd = 1)
ordiellipse(ordination_functions.NMDS.unifrac, groups = metadf_functions$Water_color,
            display = "sites", kind = "se", conf = 0.95, label = FALSE,
            col = "lightskyblue", draw = "polygon", alpha = 150,
            border = "lightskyblue", show.groups = c("Clear"), lwd = 1)
ordiellipse(ordination_functions.NMDS.unifrac, groups = metadf_functions$Water_color,
            display = "sites", kind = "se", conf = 0.95, label = FALSE,
            col = "burlywood", draw = "polygon", alpha = 150,
            border = "burlywood", show.groups = c("White"), lwd = 1)
# Fill colours follow the alphabetical factor levels Black, Clear, White.
points(ordination_functions.NMDS.unifrac, pch = 21, lwd = 1, cex = 2, display = "sites",
       col = "black",
       bg = c("gray25", "lightskyblue", "burlywood")[factor(metadf_functions$Water_color)])
# ordiellipse(ordination_DNA.nmds.bray, groups=metadata_df_DNA$Water_color, display="sites", kind="se", conf=0.99, label=FALSE, col="gray", draw="polygon", alpha=100, show.groups = c("NA"), border=FALSE)

# Add fitted variables
metadata_fit_DNA <- envfit(
  ordination_functions.NMDS.unifrac,
  metadf_functions[c("DOC_SAC340", "fulvic_like_DOC", "humic_like_DOC", "Mg", "K", "Ca",
                     "Silicate", "Conductivity", "Al", "Pb")]
)
plot(metadata_fit_DNA, cex = 0.0001, col = "black")

# TESTS ADONIS FOR NMDS PLOTS ----
# adonis2() replaces adonis(), which is deprecated and no longer available in
# recent vegan releases.
metadata <- as(sample_data(DNA_functions), "data.frame")
adonis2(phyloseq::distance(DNA_functions, method = "bray") ~ Water_color, data = metadata)

metadata <- as(sample_data(new_DNA), "data.frame")
adonis2(phyloseq::distance(new_DNA, method = "bray") ~ Water_color, data = metadata)

metadata <- as(sample_data(new_RNA), "data.frame")
adonis2(phyloseq::distance(new_RNA, method = "bray") ~ Water_color, data = metadata)

### RANDOM FOREST TO DETECT ASVS ASSOCIATED TO BLACK WATER ----
library(randomForest)

# Collapse water colour into "Black_water" versus everything else.
# The original built this with factor(levels = list(...)), which only matched
# "Black_water" and turned every other value into NA before relabelling it;
# the comparison below yields the same labels without that detour.
black_vs_other <- function(water_color) {
  ifelse(as.character(water_color) %in% "Black_water", "Black_water", "White_or_Clear")
}

# Format data for random forest: black versus others DNA1
new_DNA1 <- new_DNA
sample_data(new_DNA1)$Water_color <- black_vs_other(sample_data(new_DNA1)$Water_color)

## Implement Random forest algorithm DNA1
# Prepare data for RF: keep taxa whose mean count per sample exceeds
# prunescale * minlib.
ntaxa(new_DNA1)
prunescale <- 0.001
minlib <- 5000
tax.mean <- taxa_sums(new_DNA1) / nsamples(new_DNA1)
all_muc_prune <- prune_taxa(tax.mean > prunescale * minlib, new_DNA1)
ntaxa(all_muc_prune)
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
rf.data <- data.frame(response, predictors)

# Compute RF. Implements Breiman's random forest algorithm (based on Breiman
# and Cutler's original Fortran code) for classification and regression.
set.seed(2)
all_muc.classify <- randomForest(response ~ ., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)

# Isolate the most important variables (importance is measured by mean
# decrease in GINI coefficient, a measure of node purity, due to that
# variable) and make a phyloseq object with the 40 most important variables.
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# head() instead of [1:40, ]: no NA padding rows when fewer than 40 predictors
# survive the pruning.
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_DNA1.txt")
otunames <- imp.50$predictors
r <- rownames(tax_table(new_DNA1)) %in% otunames
purif_otus <- tax_table(new_DNA1)[r, ]
write.table(tax_table(new_DNA1)[r, ], file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_DNA1.txt")
RF_DNA1 <- subset_taxa(new_DNA1, row.names(tax_table(new_DNA1)) %in% row.names(purif_otus))

# Format data for random forest: black versus others RNA1
new_RNA1 <- new_RNA
sample_data(new_RNA1)$Water_color <- black_vs_other(sample_data(new_RNA1)$Water_color)

## Implement Random forest algorithm RNA1
# Prepare data for RF
ntaxa(new_RNA1)
prunescale <- 0.001
minlib <- 5000
tax.mean <- taxa_sums(new_RNA1) / nsamples(new_RNA1)
all_muc_prune <- prune_taxa(tax.mean > prunescale * minlib, new_RNA1)
ntaxa(all_muc_prune)
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
rf.data <- data.frame(response, predictors)

# Compute RF (same settings and seed as for DNA1).
set.seed(2)
all_muc.classify <- randomForest(response ~ ., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)

# Keep the 40 predictors with the largest mean decrease in Gini.
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_RNA1.txt")
otunames <- imp.50$predictors
r <- rownames(tax_table(new_RNA1)) %in% otunames
purif_otus <- tax_table(new_RNA1)[r, ]
write.table(tax_table(new_RNA1)[r, ], file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_RNA1.txt")
RF_RNA1 <- subset_taxa(new_RNA1, row.names(tax_table(new_RNA1)) %in% row.names(purif_otus))

# Format data for random forest: black versus others DETAILED FUNCTIONS
DNA_functions1 <- DNA_functions
sample_data(DNA_functions1)$Water_color <- black_vs_other(sample_data(DNA_functions1)$Water_color)

## Implement Random forest algorithm on the detailed functions table
# Prepare data for RF
ntaxa(DNA_functions1)
prunescale <- 0.001
minlib <- 5000
tax.mean <- taxa_sums(DNA_functions1) / nsamples(DNA_functions1)
all_muc_prune <- prune_taxa(tax.mean > prunescale * minlib, DNA_functions1)
ntaxa(all_muc_prune)
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
rf.data <- data.frame(response, predictors)

# Compute RF. Implements Breiman's random forest algorithm (based on Breiman
# and Cutler's original Fortran code) for classification and regression.
set.seed(2)
all_muc.classify <- randomForest(response ~ ., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)

# Isolate the most important variables (importance is measured by mean
# decrease in GINI coefficient, a measure of node purity, due to that
# variable) and make a phyloseq object with the 40 most important variables.
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# head() instead of [1:40, ]: no NA padding rows when fewer than 40 predictors
# survive the pruning.
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_DNA_functions.txt")
otunames <- imp.50$predictors
r <- rownames(tax_table(DNA_functions1)) %in% otunames
purif_otus <- tax_table(DNA_functions1)[r, ]
write.table(tax_table(DNA_functions1)[r, ], file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_DNA_functions.txt")
RF_functions1 <- subset_taxa(DNA_functions1, row.names(tax_table(DNA_functions1)) %in% row.names(purif_otus))

### Format data for random forest: black versus others LARGE SCALE FUNCTIONS ----
# The abundance table is read once; the original read the same file a second
# time a few statements later without anything having changed in between.
function_genus_true <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/large_scale_functions_DNA2.txt",
  header = TRUE,
  row.names = 1,
  check.names = FALSE
)

# The sample metadata and the function names are written to disk and read
# back as plain tables to build the sample_data / tax_table of a new phyloseq
# object.
Export_Nsum_OTU_table_all_mucus <- as.data.frame(sample_data(new_DNA))
write.table(Export_Nsum_OTU_table_all_mucus, file = "/Users/francois-etiennesylvain/Documents/Doctorat/meta_new_DNA.txt")
write.table(row.names(function_genus_true), file = "/Users/francois-etiennesylvain/Documents/Doctorat/taxa_new_DNA.txt")

metadata_function <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/meta_new_DNA.txt",
  header = TRUE,
  row.names = 1,
  check.names = FALSE
)
function_taxonomy <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/taxa_new_DNA.txt",
  header = TRUE,
  row.names = 1,
  check.names = FALSE
)
# Align the taxa identifiers of the abundance table with those of the
# taxonomy table read back from disk.
rownames(function_genus_true) <- rownames(function_taxonomy)

large_functions_phyloseq <- phyloseq(
  otu_table(as.matrix(function_genus_true), taxa_are_rows = TRUE),
  sample_data(metadata_function),
  tax_table(as.matrix(function_taxonomy))
)

# Collapse water colour into "Black_water" versus everything else. The
# original factor(levels = list(...)) call only matched "Black_water" and
# turned every other value into NA before relabelling it; the comparison
# below yields the same labels directly.
large_functions_phyloseq1 <- large_functions_phyloseq
sample_data(large_functions_phyloseq1)$Water_color <- ifelse(
  as.character(sample_data(large_functions_phyloseq1)$Water_color) %in% "Black_water",
  "Black_water",
  "White_or_Clear"
)

# Compute random forest analysis on large scale functions dataset
ntaxa(large_functions_phyloseq1)
prunescale <- 0.001
minlib <- 5000
tax.mean <- taxa_sums(large_functions_phyloseq1) / nsamples(large_functions_phyloseq1)
all_muc_prune <- prune_taxa(tax.mean > prunescale * minlib, large_functions_phyloseq1)
ntaxa(all_muc_prune)
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
rf.data <- data.frame(response, predictors)

# Compute RF. Implements Breiman's random forest algorithm (based on Breiman
# and Cutler's original Fortran code) for classification and regression.
# ---- Random forest on large-scale functions: fit and rank ------------------
set.seed(2)
all_muc.classify <- randomForest(response ~ ., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)

# Rank predictors by mean decrease in Gini (node purity) and keep the 40 most
# important ones (the object keeps its historical name `imp.50`).
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# head() rather than [1:40, ]: indexing past the end pads the table with
# all-NA rows when fewer than 40 predictors are available.
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_DNA_functions_large_scale.txt")
otunames <- imp.50$predictors
#r <- rownames(tax_table(large_functions_phyloseq1)) %in% otunames
#purif_otus = tax_table(large_functions_phyloseq1)[r, ]
#write.table(tax_table(large_functions_phyloseq1)[r, ], file="/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_DNA_functions_large_scale.txt")
#RF_DNA1 = subset_taxa(large_functions_phyloseq1, row.names(tax_table(large_functions_phyloseq1)) %in% row.names(purif_otus))
### Sort through these, because several functions belong to human; select
### them manually to build an ASV table.

# ---- Export OTU / taxonomy / metadata tables of the RF-selected objects ----
# RF_DNA1, RF_RNA1 and RF_functions1 are not created in this section; they
# must already exist in the session.
otu_table_RF_DNA1 <- as.data.frame(otu_table(RF_DNA1))
write.table(otu_table_RF_DNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_otu_DNA.txt")
tax_table_RF_DNA1 <- as.data.frame(tax_table(RF_DNA1))
write.table(tax_table_RF_DNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_DNA.txt")
meta_table_RF_DNA1 <- as.data.frame(sample_data(RF_DNA1))
write.table(meta_table_RF_DNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_meta_DNA.txt")

otu_table_RF_RNA1 <- as.data.frame(otu_table(RF_RNA1))
write.table(otu_table_RF_RNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_otu_RNA.txt")
tax_table_RF_RNA1 <- as.data.frame(tax_table(RF_RNA1))
write.table(tax_table_RF_RNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_RNA.txt")
meta_table_RF_RNA1 <- as.data.frame(sample_data(RF_RNA1))
write.table(meta_table_RF_RNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_meta_RNA.txt")

otu_table_RF_functions <- as.data.frame(otu_table(RF_functions1))
write.table(otu_table_RF_functions, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_otu_functions.txt")
tax_table_RF_functions <- as.data.frame(tax_table(RF_functions1))
write.table(tax_table_RF_functions, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_functions.txt")
meta_table_RF_functions <- as.data.frame(sample_data(RF_functions1))
write.table(meta_table_RF_functions, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_meta_functions.txt")

# I reformatted manually the OTU tables to include info about water type +
# ASV tax + to change site 2 from black to clear water.

# ---- Build heatmaps from the manually combined tables ----------------------
library(RColorBrewer)
coul <- colorRampPalette(brewer.pal(8, "Greys"))(25)

combined_RF_DNA <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_combined_otu_tax_DNA.txt",
  header = TRUE, row.names = 1, check.names = FALSE
)
combined_RF_DNA_mat <- as.matrix(t(combined_RF_DNA))
heatmap(combined_RF_DNA_mat, scale = "row", Colv = NA, cexCol = 0.5, cexRow = 0.5, margins = c(11, 11), col = coul)

combined_RF_RNA <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/combined_RF_otu_RNA.txt",
  header = TRUE, row.names = 1, check.names = FALSE
)
combined_RF_RNA_mat <- as.matrix(t(combined_RF_RNA))
heatmap(combined_RF_RNA_mat, scale = "row", Colv = NA, cexCol = 0.5, cexRow = 0.5, margins = c(11, 11), col = coul)

combined_RF_functions <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/combined_RF_otu_functions.txt",
  header = TRUE, row.names = 1, check.names = FALSE
)
combined_RF_functions_mat <- as.matrix(t(combined_RF_functions))
heatmap(combined_RF_functions_mat, scale = "row", Colv = NA, cexCol = 0.5, cexRow = 0.5, margins = c(11, 11), col = coul)

### Now we want to check if DOC is a major driver of community structure /
### transcription.
### Correlation to the functional profile is done in the heatmap / lignin
### cycle analysis.
### First we compute ordisurfs.

# ---- ordisurf, DNA (NMDS) --------------------------------------------------
meta_new_DNA <- meta(new_DNA_merged)
# Water colour is hard-coded per merged sample, in row order (15 values).
meta_new_DNA$Water_color <- c("Black", "Clear", "Black", "White", "White", "Black", "White", "White", "Black", "White", "White", "Clear", "Clear", "Black", "White")
groups <- meta_new_DNA$Water_color # grouping information from the metadata
df <- data.frame(
  x = ordination_DNA.NMDS.unifrac$point[, 1],
  y = ordination_DNA.NMDS.unifrac$point[, 2],
  Groups = groups
)
# Add a dummy variable corresponding to the selected variable.
meta_new_DNA$var <- meta_new_DNA$Humic.tot # FES: maybe his format is better
# Fit a surface for the selected variable onto the ordination.
ordi <- vegan::ordisurf(ordination_DNA.NMDS.unifrac, meta_new_DNA$var, plot = FALSE, bs = "ds")
ordi.grid <- ordi$grid # extracts the ordisurf grid
#str(ordi.grid) # it's a list though - cannot be plotted as is
ordi.mite <- expand.grid(x = ordi.grid$x, y = ordi.grid$y) # get x and ys
ordi.mite$z <- as.vector(ordi.grid$z) # unravel the matrix for the z scores
ordi.mite.na <- data.frame(na.omit(ordi.mite)) # gets rid of the NAs

# Make the plot. `position` was previously misspelled `positon`, so ggplot2
# ignored it as an unknown parameter.
p <- ggplot2::ggplot() +
  stat_contour(
    data = ordi.mite.na,
    aes(x = x, y = y, z = z, colour = ..level..),
    size = 1.8, position = "identity"
  ) # can change the binwidth depending on how many contours you want
p <- p + ggplot2::geom_point(data = df, aes(x, y, fill = Groups), pch = 21, size = 12)
p <- p + ggplot2::scale_fill_manual(values = c("gray25", "lightskyblue", "burlywood"))
p <- p + ggplot2::scale_colour_continuous(high = "red4", low = "yellow2") # here we set the high and low of the colour scale.
# (colour scale: delete it to go back to the standard blue, or specify other
# colours)
#p<-p+ ggplot2::labs(colour = paste(env.variable)) # another way to set the labels, in this case, for the colour legend
p <- p + ggplot2::theme_bw()
p

# ---- ordisurf, RNA (NMDS) --------------------------------------------------
meta_new_RNA <- meta(new_RNA_merged)
# Water colour is hard-coded per merged sample, in row order (15 values).
meta_new_RNA$Water_color <- c("Black", "Clear", "Black", "White", "White", "Black", "White", "White", "Black", "White", "White", "Clear", "Clear", "Black", "White")
groups <- meta_new_RNA$Water_color # grouping information from the metadata
df <- data.frame(
  x = ordination_RNA.NMDS.unifrac$point[, 1],
  y = ordination_RNA.NMDS.unifrac$point[, 2],
  Groups = groups
)
# Add a dummy variable corresponding to the selected variable.
meta_new_RNA$var <- meta_new_RNA$humic_like_DOC # FES: maybe his format is better
# Fit a surface for the selected variable onto the ordination.
ordi <- vegan::ordisurf(ordination_RNA.NMDS.unifrac, meta_new_RNA$var, plot = FALSE, bs = "ds")
ordi.grid <- ordi$grid # extracts the ordisurf grid
#str(ordi.grid) # it's a list though - cannot be plotted as is
ordi.mite <- expand.grid(x = ordi.grid$x, y = ordi.grid$y) # get x and ys
ordi.mite$z <- as.vector(ordi.grid$z) # unravel the matrix for the z scores
ordi.mite.na <- data.frame(na.omit(ordi.mite)) # gets rid of the NAs

# Make the plot. `position` was previously misspelled `positon`, so ggplot2
# ignored it as an unknown parameter.
p <- ggplot2::ggplot() +
  stat_contour(
    data = ordi.mite.na,
    aes(x = x, y = y, z = z, colour = ..level..),
    size = 1.8, position = "identity"
  ) # can change the binwidth depending on how many contours you want
p <- p + ggplot2::geom_point(data = df, aes(x, y, fill = Groups), pch = 21, size = 12)
p <- p + ggplot2::scale_fill_manual(values = c("gray25", "lightskyblue", "burlywood"))
p <- p + ggplot2::scale_colour_continuous(high = "red4", low = "yellow2") # here we set the high and low of the colour scale.
# (colour scale: delete it to go back to the standard blue, or specify other
# colours)
#p<-p+ ggplot2::labs(colour = paste(env.variable)) # another way to set the labels, in this case, for the colour legend
p <- p + ggplot2::theme_bw()
p

# ---- Relative abundances ---------------------------------------------------
# Each sample's counts are divided by that sample's total.
DNA.relative <- transform_sample_counts(new_DNA, function(x) x / sum(x))
RNA.relative <- transform_sample_counts(new_RNA, function(x) x / sum(x))

# ---- Keep the most abundant ASVs (speeds up the correlation step) ----------
# Thresholds are on the mean relative abundance across samples.
Abund_DNA <- filter_taxa(DNA.relative, function(x) mean(x) > 0.00075, TRUE)
Abund_RNA <- filter_taxa(RNA.relative, function(x) mean(x) > 0.00025, TRUE)

# ---- Export bacterioplankton ASV, taxonomy and metadata tables as CSV ------
# Files are written relative to the current working directory.
Export_ASV_table_DNA <- as.data.frame(otu_table(Abund_DNA))
write.csv(Export_ASV_table_DNA, file = "ASV_table_abund_bacterioplankton_DNA_MARCH2021.csv")
Export_tax_table_DNA <- as.data.frame(tax_table(Abund_DNA))
write.csv(Export_tax_table_DNA, file = "Tax_table_abund_bacterioplankton_DNA_MARCH2021.csv")
Export_metadata_table_DNA <- as.data.frame(sample_data(Abund_DNA))
write.csv(Export_metadata_table_DNA, file = "Metadata_table_abund_bacterioplankton_DNA_MARCH2021.csv")

Export_ASV_table_RNA <- as.data.frame(otu_table(Abund_RNA))
write.csv(Export_ASV_table_RNA, file = "ASV_table_abund_bacterioplankton_RNA_MARCH2021.csv")
Export_tax_table_RNA <- as.data.frame(tax_table(Abund_RNA))
write.csv(Export_tax_table_RNA, file = "Tax_table_abund_bacterioplankton_RNA_MARCH2021.csv")
Export_metadata_table_RNA <- as.data.frame(sample_data(Abund_RNA))
write.csv(Export_metadata_table_RNA, file = "Metadata_table_abund_bacterioplankton_RNA_MARCH2021.csv")

# ---- Spearman correlations: load the combined ASV tables -------------------
# NOTE(review): these "combined" .txt files are not written by this script;
# presumably they are assembled by hand from the CSV exports -- confirm.
DNA.network <- read.table(
  file = "Combined_ASV_table_abund_bacterioplankton_DNA_MARCH2021.txt",
  header = TRUE, row.names = 1
)
RNA.network <- read.table(
  file = "combined_ASV_table_abund_bacterioplankton_RNA_MARCH2021.txt",
  header = TRUE, row.names = 1
)
## Correct the dataframe format (the number in [] is the number of columns).
# The index ranges below are the number of ASV columns in each ASV table.
DNA.network <- sapply(DNA.network[1:133], as.numeric)
RNA.network <- sapply(RNA.network[1:646], as.numeric)

# Calculate Spearman correlation values.
corr_DNA <- rcorr(DNA.network, type = "spearman")
corr_RNA <- rcorr(RNA.network, type = "spearman")

# ---- DNA: extract, correct and filter p-values + correlations --------------
# Only the lower triangle is used so that each pair is counted once.
gene <- colnames(corr_DNA$P)
Sample.comparison <- paste(gene[row(corr_DNA$P)], gene[col(corr_DNA$P)], sep = ".")
i <- lower.tri(corr_DNA$P)
Dat <- data.frame(Sample.comparison[i], p.value = corr_DNA$P[i])
Dat$Bonferroni <- p.adjust(Dat$p.value, method = "bonferroni")
gene <- colnames(corr_DNA$r)
Sample.comparison <- paste(gene[row(corr_DNA$r)], gene[col(corr_DNA$r)], sep = ".")
i <- lower.tri(corr_DNA$r)
Correlation <- data.frame(Sample.comparison[i], correlation.spearman = corr_DNA$r[i], Dat$Bonferroni)
# Keep significant (Bonferroni < 0.05) positive correlations above 0.5.
Filt_Bonferroni <- subset(Correlation, Dat.Bonferroni < 0.05 & correlation.spearman > 0.5)
write.csv(Filt_Bonferroni, file = "combined_p_values_Network_abund_DNA_MARCH2021.csv")

# ---- RNA: same procedure ---------------------------------------------------
gene <- colnames(corr_RNA$P)
Sample.comparison <- paste(gene[row(corr_RNA$P)], gene[col(corr_RNA$P)], sep = ".")
i <- lower.tri(corr_RNA$P)
Dat <- data.frame(Sample.comparison[i], p.value = corr_RNA$P[i])
Dat$Bonferroni <- p.adjust(Dat$p.value, method = "bonferroni")
gene <- colnames(corr_RNA$r)
Sample.comparison <- paste(gene[row(corr_RNA$r)], gene[col(corr_RNA$r)], sep = ".")
i <- lower.tri(corr_RNA$r)
Correlation <- data.frame(Sample.comparison[i], correlation.spearman = corr_RNA$r[i], Dat$Bonferroni)
Filt_Bonferroni <- subset(Correlation, Dat.Bonferroni < 0.05 & correlation.spearman > 0.5)
write.csv(Filt_Bonferroni, file = "combined_p_values_Network_abund_RNA_MARCH2021.csv")

#### RAREFACTION ANALYSIS
### Modified script from https://github.com/joey711/phyloseq/issues/143
psdata <- new_RNA
#set.seed(42)

#' Alpha diversity of a phyloseq object rarefied at several depths
#'
#' @param psdata A phyloseq object.
#' @param measures Alpha-diversity measures, passed to `estimate_richness()`.
#' @param depths Numeric vector of rarefaction depths; repeat a depth to get
#'   replicate draws at that depth.
#' @return A data frame with columns `Depth` (numeric), `Sample`, `Measure`
#'   and `Alpha_diversity`. Depths larger than the largest library are skipped.
calculate_rarefaction_curves <- function(psdata, measures, depths) {
  # plyr and reshape2 are called through their namespaces instead of
  # require(), which only returns FALSE when a package is missing and lets
  # the failure surface later with a less helpful message.
  estimate_rarified_richness <- function(psdata, measures, depth) {
    if (max(sample_sums(psdata)) < depth) {
      return(NULL)
    }
    psdata <- prune_samples(sample_sums(psdata) >= depth, psdata)
    rarified_psdata <- rarefy_even_depth(psdata, depth, verbose = FALSE)
    alpha_diversity <- estimate_richness(rarified_psdata, measures = measures)
    # as.matrix forces the use of melt.array, which includes the sample names
    # (rownames).
    reshape2::melt(
      as.matrix(alpha_diversity),
      varnames = c("Sample", "Measure"),
      value.name = "Alpha_diversity"
    )
  }

  # Naming the depths makes ldply add the depth to the output as `.id`.
  names(depths) <- depths
  rarefaction_curve_data <- plyr::ldply(
    depths, estimate_rarified_richness,
    psdata = psdata, measures = measures, .id = "Depth",
    .progress = if (interactive()) "text" else "none"
  )
  # Convert Depth from factor to numeric.
  rarefaction_curve_data$Depth <- as.numeric(levels(rarefaction_curve_data$Depth))[rarefaction_curve_data$Depth]
  rarefaction_curve_data
}

#rarefaction_curve_data <- calculate_rarefaction_curves(psdata, c('Observed', 'Shannon'), rep(c(1, 10, 100, 1000, 1:100 * 10000), each = 10))
rarefaction_curve_data <- calculate_rarefaction_curves(psdata, "Shannon", rep(c(1, 10, 100, 1000, 1:100 * 10000), each = 10))
summary(rarefaction_curve_data)

# Mean and sd of the replicate draws per depth and sample.
rarefaction_curve_data_summary <- ddply(
  rarefaction_curve_data, c("Depth", "Sample"), summarise,
  Alpha_diversity_mean = mean(Alpha_diversity),
  Alpha_diversity_sd = sd(Alpha_diversity)
)
# Strip only a leading "X"; gsub("X", "") also removed any "X" inside a
# sample name, which would break the merge on sample names below.
# NOTE(review): presumably the "X" is the prefix added when sample names are
# turned into syntactic names -- confirm.
rarefaction_curve_data_summary$Sample <- sub("^X", "", rarefaction_curve_data_summary$Sample)
data1 <- data.frame(sample_data(psdata))
# NOTE(review): this substitution replaces "-" with "-" and so changes
# nothing; kept as written until the intended replacement is confirmed.
row.names(data1) <- gsub("-", "-", row.names(data1))
data1
rarefaction_curve_data_summary_verbose <- merge(rarefaction_curve_data_summary, data1, by.x = "Sample", by.y = "row.names")

g <- ggplot(
  data = rarefaction_curve_data_summary_verbose,
  mapping = aes(
    x = Depth,
    y = Alpha_diversity_mean,
    ymin = Alpha_diversity_mean - Alpha_diversity_sd,
    ymax = Alpha_diversity_mean + Alpha_diversity_sd,
    colour = Site,
    group = Sample
  )
) +
  geom_line() +
  geom_pointrange() +
  facet_wrap(facets = ~Site, scales = "free_y")
ggsave("/Users/francois-etiennesylvain/Documents/Doctorat/rarefaction_plot_bacterioplankton_RNA.jpg", plot = g, width = 24, height = 24, units = "in")

#### Check whether we can find lignin degradation pathways in the fungi of
#### the metagenomic database
ALL_LIGNIN_DEGRAD <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/lignin_degradation_pathways.txt", header = TRUE)
FUNGI_PATHWAYS <- read.table(file = "/Users/francois-etiennesylvain/Documents/Doctorat/AMZN_TAX_GO_KO_FUNGI.txt", header = TRUE)
MY_LIGNIN_FUNGI <- subset(FUNGI_PATHWAYS, KO_ID %in% ALL_LIGNIN_DEGRAD$KEGG_ID)
write.csv(MY_LIGNIN_FUNGI, file = "/Users/francois-etiennesylvain/Documents/Doctorat/FUNGI_LIGNIN_PATHWAYS.csv")

#### Compute RDA instead of NMDS = more accurate + we can control for VIF
#### (variance inflation factors)
# Source = http://www.hiercourse.com/docs/microbial/04_betaDiversity_multTables.html

# ---- RDA with the DNA data -------------------------------------------------
# Extract the OTU table.
Y <- veganotu(new_DNA.relative)
# Extract the sample data from the phyloseq object.
dat <- data.frame(sample_data(new_DNA.relative))
#dat <- select(dat, -SampleID)
# Select the continuous variables to use as constraints, then standardise.
X <- select_if(dat, is.numeric)
X <- decostand(X, method = "standardize")
# RDA
#res <- rda(Y ~ ., data=X)
#rda_plot = plot(res, xlab=ord_labels(res)[1], ylab=ord_labels(res)[2])
# Also test dbRDA. The result is the same as RDA when using euclidean
# distances, so Bray-Curtis is used here.
res2 <- capscale(Y ~ ., data = X, distance = "bray")
# Plot and test the dbRDA fit `res2`. These two calls used `res`, which is
# only created further down (the rda() call above is commented out).
dbrda_plot <- plot(res2, xlab = ord_labels(res2)[1], ylab = ord_labels(res2)[2])
# Permutation test of statistical significance.
anova(res2)
# Variance inflation factors: variables with scores > 10 are redundant.
sort(vif.cca(res2))
# Calculate fit.
# NOTE(review): envfit() is given the community matrix Y rather than an
# ordination object -- confirm this is intended.
envfit(Y ~ ., data = X)
# Set up full and null models for 'ordistep': full model.
rda1 <- rda(Y ~ ., data = X)
# intercept-only (null) model
rda0 <- rda(Y ~ 1, data = X)
# Forward and backward selection of explanatory variables (output not shown).
step.env <- ordistep(rda0, scope = formula(rda1), direction = "both") ### REALLY LONG TO COMPUTE ####
# Look at the significant variables.
step.env$anova
# Get variable names from the 'ordistep' result and from the VIF screen.
vars.ordistep <- gsub("^. ", "", rownames(step.env$anova))
# VIFs come from the full dbRDA fit `res2`; `res` is only created below, so
# calling vif.cca(res) here used an undefined (or stale) object.
vars.envfit <- names(which(vif.cca(res2) <= 10))
vars <- unique(c(vars.ordistep, vars.envfit))
# Select the variables to keep from the constraint table X; drop = FALSE
# keeps a data frame even when a single variable is retained.
X1 <- X[, vars, drop = FALSE]
str(X1)
# RDA with fewer variables (selected variables).
res <- rda(Y ~ ., data = X1)
# Summary of the results.
summary(res, display = NULL)
# Permutation test of statistical significance.
anova(res)
# Load library.
library(gridExtra)
# Set up data frames for plotting the results.
sit <- cbind(dat, scores(res, display = "sites"))
spp <- cbind(data.frame(tax_table(new_DNA.relative)), scores(res, display = "species"))
vec <- data.frame(scores(res, display = "bp"))
# Use these to adjust length of arrows and position of arrow labels.
adj.vec <- 0.30 # 0.4
adj.txt <- 0.33 # 0.45
# NOTE(review): the arrow labels are hard-coded and must match the rows of
# `vec` in number and order -- confirm after any change in variable selection.
p1 <- ggplot(sit, aes(x = RDA1, y = RDA2, color = Water_color, shape = Water_color)) +
  scale_colour_manual(values = c("Black_water" = "gray25", "Clear_water" = "lightskyblue", "White_water" = "burlywood")) +
  geom_point(size = 3) +
  geom_segment(
    data = vec, inherit.aes = FALSE, alpha = 0.5,
    mapping = aes(x = 0, y = 0, xend = adj.vec * RDA1, yend = adj.vec * RDA2),
    arrow = arrow(length = unit(0.2, "cm"))
  ) +
  geom_text(
    data = vec, inherit.aes = FALSE, cex = 0.001,
    mapping = aes(
      x = adj.txt * RDA1, y = adj.txt * RDA2,
      label = c('Humic_DOC', 'DOC_Abs254/365', 'pH', 'Co', 'Zn', 'Cd', 'Mg', 'Total_DOC', 'Fulvic_DOC', 'DOC_SAC340', 'Conductivity', 'NO3', 'Na')
    )
  ) +
  theme_bw()
p1$labels$x <- ord_labels(res)[1]
p1$labels$y <- ord_labels(res)[2]
p1

###### RDA with RNA data
# Extract the OTU table.
Y2 <- veganotu(new_RNA.relative)
# Extract the sample data from the phyloseq object.
dat2 <- data.frame(sample_data(new_RNA.relative))
#dat <- select(dat, -SampleID)
# Select the continuous variables to use as constraints, then standardise.
X2 <- select_if(dat2, is.numeric)
X2 <- decostand(X2, method = "standardize")
# RDA
#res2 <- rda(Y2 ~ ., data=X2)
#rda_plot2 = plot(res2, xlab=ord_labels(res2)[1], ylab=ord_labels(res2)[2])
# Also test dbRDA. The result is the same as RDA when using euclidean
# distances, so Bray-Curtis is used here.
res3 <- capscale(Y2 ~ ., data = X2, distance = "bray")
dbrda_plot <- plot(res3, xlab = ord_labels(res3)[1], ylab = ord_labels(res3)[2])
# Permutation test of statistical significance.
anova(res3)
# Variance inflation factors: variables with scores > 10 are redundant.
sort(vif.cca(res3))
# Calculate fit.
envfit(Y2 ~ ., data = X2)
# Set up full and null models for 'ordistep': full model.
rda12 <- rda(Y2 ~ ., data = X2)
# Intercept-only (null) model.
rda02 <- rda(Y2 ~ 1, data = X2)
# Forward and backward selection of explanatory variables. The call had been
# commented out (it is REALLY LONG TO COMPUTE) while `step.env2` is still
# read just below, so it now runs only when no `step.env2` exists yet.
if (!exists("step.env2")) {
  step.env2 <- ordistep(rda02, scope = formula(rda12), direction = "both")
}
# Look at the significant variables.
step.env2$anova
# Get variable names from the 'ordistep' result and from the VIF screen.
vars.ordistep2 <- gsub("^. ", "", rownames(step.env2$anova))
vars.envfit2 <- names(which(vif.cca(res3) <= 10))
vars2 <- unique(c(vars.ordistep2, vars.envfit2))
# Select the variables to keep from the constraint table X2.
X12 <- X2[, vars2, drop = FALSE]
str(X12)
# dbRDA with fewer variables (selected variables).
res3 <- capscale(Y2 ~ ., data = X12, distance = "bray")
# Summary of the results.
summary(res3, display = NULL)
# Permutation test of statistical significance.
anova(res3)
# Set up data frames for plotting the results.
sit2 <- cbind(dat2, scores(res3, display = "sites"))
spp2 <- cbind(data.frame(tax_table(new_RNA.relative)), scores(res3, display = "species"))
vec2 <- data.frame(scores(res3, display = "bp"))
# Use these to adjust length of arrows and position of arrow labels.
adj.vec <- 3.0
adj.txt <- 3.3
# NOTE(review): hard-coded arrow labels must match the rows of `vec2`.
p2 <- ggplot(sit2, aes(x = CAP1, y = CAP2, color = Water_color, shape = Water_color)) +
  scale_colour_manual(values = c("Black_water" = "gray25", "Clear_water" = "lightskyblue", "White_water" = "burlywood")) +
  geom_point(size = 3) +
  geom_segment(
    data = vec2, inherit.aes = FALSE, alpha = 0.5,
    mapping = aes(x = 0, y = 0, xend = adj.vec * CAP1, yend = adj.vec * CAP2),
    arrow = arrow(length = unit(0.2, "cm"))
  ) +
  geom_text(
    data = vec2, inherit.aes = FALSE, cex = 0.00001,
    mapping = aes(
      x = adj.txt * CAP1, y = adj.txt * CAP2,
      label = c('Ca', 'DOC_Abs254/365', 'K', 'Fulvic_DOC', 'Mg', 'Protein_DOC', 'Co', 'Cd', 'Pb')
    )
  ) +
  theme_bw()
p2$labels$x <- ord_labels(res3)[1]
p2$labels$y <- ord_labels(res3)[2]
p2

##### RDA with FUNCTIONS data
DNA_functions.relative <- transform_sample_counts(DNA_functions, function(otu) otu / sum(otu))
# Extract the OTU table.
Y4 <- veganotu(DNA_functions.relative)
# Extract the sample data from the phyloseq object.
dat4 <- data.frame(sample_data(DNA_functions.relative))
#dat <- select(dat, -SampleID)
# Select the continuous variables to use as constraints, then standardise.
X4 <- select_if(dat4, is.numeric)
X4 <- decostand(X4, method = "standardize")
# RDA
#res4 <- rda(Y4 ~ ., data=X4)
#rda_plot4 = plot(res4, xlab=ord_labels(res4)[1],
#   ylab=ord_labels(res4)[2])
# Also test dbRDA. The result is the same as RDA when using euclidean
# distances, so Bray-Curtis is used here.
res4 <- capscale(Y4 ~ ., data = X4, distance = "bray")
dbrda_plot <- plot(
  res4,
  xlab = ord_labels(res4)[1],
  ylab = ord_labels(res4)[2]
)
# Permutation test of statistical significance.
anova(res4)
# Variance inflation factors: variables with scores > 10 are redundant.
sort(vif.cca(res4))
# Calculate fit.
envfit(Y4 ~ ., data = X4)

# Full and intercept-only (null) models used as bounds for 'ordistep'.
rda14 <- rda(Y4 ~ ., data = X4)
rda04 <- rda(Y4 ~ 1, data = X4)
# Forward and backward selection of explanatory variables (output not shown).
step.env4 <- ordistep(rda04, scope = formula(rda14), direction = "both") ### REALLY LONG TO COMPUTE ####
# Look at the significant variables.
step.env4$anova

# Variable names retained by 'ordistep' plus those passing the VIF screen.
vars.ordistep4 <- gsub("^. ", "", rownames(step.env4$anova))
vars.envfit4 <- names(which(vif.cca(res4) <= 10))
vars4 <- unique(c(vars.ordistep4, vars.envfit4))
# Restrict the constraint table X4 to the selected variables.
X14 <- X4[, vars4]
str(X14)

# dbRDA refitted on the selected variables only.
res4 <- capscale(Y4 ~ ., data = X14, distance = "bray")
summary(res4, display = NULL)
# Permutation test of statistical significance.
anova(res4)

# Data frames for plotting: site scores, species scores, biplot arrows.
sit4 <- cbind(dat4, scores(res4, display = "sites"))
spp4 <- cbind(data.frame(tax_table(DNA_functions.relative)), scores(res4, display = "species"))
vec4 <- data.frame(scores(res4, display = "bp"))
# Scaling of arrow length and of arrow-label position.
adj.vec <- 2.0
adj.txt <- 2.2
p2 <- ggplot(sit4, aes(x = CAP1, y = CAP2, color = Water_color, shape = Water_color)) +
  scale_colour_manual(values = c("Black_water" = "gray25", "Clear_water" = "lightskyblue", "White_water" = "burlywood")) +
  geom_point(size = 3) +
  geom_segment(
    data = vec4, inherit.aes = FALSE, alpha = 0.5,
    mapping = aes(x = 0, y = 0, xend = adj.vec * CAP1, yend = adj.vec * CAP2),
    arrow = arrow(length = unit(0.2, "cm"))
  ) +
  geom_text(
    data = vec4, inherit.aes = FALSE, cex = 0.0001,
    mapping = aes(
      x = adj.txt * CAP1, y = adj.txt * CAP2,
      label = c('Ca', 'pH', 'Fe', 'DOC_Abs254/365', 'K', 'Cd', 'Silicate', 'Mn')
    )
  ) +
  theme_bw()
p2$labels$x <- ord_labels(res4)[1]
p2$labels$y <- ord_labels(res4)[2]
p2

### RANDOM FOREST TO DETECT ASVS ASSOCIATED TO WATER COLORS (WITHOUT
### COMBINING WHITE/CLEAR)
library(randomForest)

# ---- DNA1: prepare data for the random forest ------------------------------
new_DNA1 <- new_DNA
ntaxa(new_DNA1)
prunescale <- 0.001
minlib <- 5000
# Keep only taxa whose mean count per sample exceeds prunescale * minlib.
tax.mean <- taxa_sums(new_DNA1) / nsamples(new_DNA1)
all_muc_prune <- prune_taxa(tax.mean > prunescale * minlib, new_DNA1)
ntaxa(all_muc_prune)
# Samples in rows, taxa in columns.
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
rf.data <- data.frame(response, predictors)

# Fit Breiman's random forest (classification of water colour); the seed is
# fixed for reproducibility.
set.seed(2)
all_muc.classify <- randomForest(response ~ ., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)
# Next: isolate the most important variables (importance = mean decrease in
# Gini coefficient, a measure of node purity) and build a phyloseq object
# from the top-ranked ones.
# ---- DNA1: rank predictors and export the selected ASVs --------------------
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# Keep the 40 most important predictors (historical name `imp.50`). head()
# rather than [1:40, ]: indexing past the end pads the table with all-NA rows
# when fewer than 40 predictors are available.
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_DNA1_NOT_COMBINED.txt")
otunames <- imp.50$predictors
# NOTE(review): predictor names went through data.frame(), which may rewrite
# non-syntactic names; the match against taxa names assumes they were left
# unchanged -- confirm.
r <- rownames(tax_table(new_DNA1)) %in% otunames
purif_otus <- tax_table(new_DNA1)[r, ]
write.table(tax_table(new_DNA1)[r, ], file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_DNA1_NOT_COMBINED.txt")
RF_DNA1 <- subset_taxa(new_DNA1, row.names(tax_table(new_DNA1)) %in% row.names(purif_otus))

# ---- RNA1: prepare data for the random forest ------------------------------
new_RNA1 <- new_RNA
ntaxa(new_RNA1)
prunescale <- 0.001
minlib <- 5000
# Keep only taxa whose mean count per sample exceeds prunescale * minlib.
tax.mean <- taxa_sums(new_RNA1) / nsamples(new_RNA1)
all_muc_prune <- prune_taxa(tax.mean > prunescale * minlib, new_RNA1)
ntaxa(all_muc_prune)
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
rf.data <- data.frame(response, predictors)

# Fit Breiman's random forest (classification of water colour); the seed is
# fixed for reproducibility.
set.seed(2)
all_muc.classify <- randomForest(response ~ ., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)
# Next: isolate the most important variables (importance = mean decrease in
# Gini coefficient, a measure of node purity) and build a phyloseq object
# from the 40 top-ranked ones.
# ---- RNA1: rank predictors and export the selected ASVs --------------------
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# Keep the 40 most important predictors (historical name `imp.50`). head()
# rather than [1:40, ]: indexing past the end pads the table with all-NA rows
# when fewer than 40 predictors are available.
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_RNA1_NOT_COMBINED.txt")
otunames <- imp.50$predictors
# NOTE(review): predictor names went through data.frame(), which may rewrite
# non-syntactic names; the match against taxa names assumes they were left
# unchanged -- confirm.
r <- rownames(tax_table(new_RNA1)) %in% otunames
purif_otus <- tax_table(new_RNA1)[r, ]
write.table(tax_table(new_RNA1)[r, ], file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_RNA1_NOT_COMBINED.txt")
RF_RNA1 <- subset_taxa(new_RNA1, row.names(tax_table(new_RNA1)) %in% row.names(purif_otus))

# ---- Detailed functions: prepare data for the random forest ----------------
DNA_functions1 <- DNA_functions
ntaxa(DNA_functions1)
prunescale <- 0.001
minlib <- 5000
# Keep only features whose mean count per sample exceeds prunescale * minlib.
tax.mean <- taxa_sums(DNA_functions1) / nsamples(DNA_functions1)
all_muc_prune <- prune_taxa(tax.mean > prunescale * minlib, DNA_functions1)
ntaxa(all_muc_prune)
predictors <- t(otu_table(all_muc_prune))
dim(predictors)
response <- as.factor(sample_data(all_muc_prune)$Water_color)
rf.data <- data.frame(response, predictors)

# Fit Breiman's random forest (classification of water colour); the seed is
# fixed for reproducibility.
set.seed(2)
all_muc.classify <- randomForest(response ~ ., data = rf.data, ntree = 500)
print(all_muc.classify)
names(all_muc.classify)
# Next: isolate the most important variables (importance = mean decrease in
# Gini coefficient, a measure of node purity) and build a phyloseq object
# from the 40 top-ranked ones.
# ---- Detailed functions: rank predictors and export ------------------------
imp <- importance(all_muc.classify)
imp <- data.frame(predictors = rownames(imp), imp)
imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# Keep the 40 most important predictors (historical name `imp.50`). head()
# rather than [1:40, ]: indexing past the end pads the table with all-NA rows
# when fewer than 40 predictors are available.
imp.50 <- head(imp.sort, 40)
write.table(imp.50, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_scores_DNA_functions_NOT_COMBINED.txt")
otunames <- imp.50$predictors
r <- rownames(tax_table(DNA_functions1)) %in% otunames
purif_otus <- tax_table(DNA_functions1)[r, ]
write.table(tax_table(DNA_functions1)[r, ], file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_names_DNA_functions_NOT_COMBINED.txt")
RF_functions1 <- subset_taxa(DNA_functions1, row.names(tax_table(DNA_functions1)) %in% row.names(purif_otus))

# ---- Export OTU / taxonomy / metadata tables of the RF-selected objects ----
otu_table_RF_DNA1 <- as.data.frame(otu_table(RF_DNA1))
write.table(otu_table_RF_DNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_otu_DNA_NOT_COMBINED.txt")
tax_table_RF_DNA1 <- as.data.frame(tax_table(RF_DNA1))
write.table(tax_table_RF_DNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_DNA_NOT_COMBINED.txt")
meta_table_RF_DNA1 <- as.data.frame(sample_data(RF_DNA1))
write.table(meta_table_RF_DNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_meta_DNA_NOT_COMBINED.txt")

otu_table_RF_RNA1 <- as.data.frame(otu_table(RF_RNA1))
write.table(otu_table_RF_RNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_otu_RNA_NOT_COMBINED.txt")
tax_table_RF_RNA1 <- as.data.frame(tax_table(RF_RNA1))
write.table(tax_table_RF_RNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_RNA_NOT_COMBINED.txt")
meta_table_RF_RNA1 <- as.data.frame(sample_data(RF_RNA1))
write.table(meta_table_RF_RNA1, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_meta_RNA_NOT_COMBINED.txt")

otu_table_RF_functions <- as.data.frame(otu_table(RF_functions1))
write.table(otu_table_RF_functions, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_otu_functions_NOT_COMBINED.txt")
tax_table_RF_functions <- as.data.frame(tax_table(RF_functions1))
write.table(tax_table_RF_functions, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_tax_functions_NOT_COMBINED.txt")
meta_table_RF_functions <- as.data.frame(sample_data(RF_functions1))
write.table(meta_table_RF_functions, file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_meta_functions_NOT_COMBINED.txt")

# I reformatted manually the OTU tables to include info about water type +
# ASV taxonomy to the ASV abundance table.

# ---- Build heatmaps from the manually combined tables ----------------------
library(RColorBrewer)
coul <- colorRampPalette(brewer.pal(8, "Greys"))(25)

combined_RF_DNA <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/RF_combined_otu_tax_DNA_NOT_COMBINED.txt",
  header = TRUE, row.names = 1, check.names = FALSE
)
combined_RF_DNA_mat <- as.matrix(combined_RF_DNA)
heatmap(combined_RF_DNA_mat, scale = "row", Rowv = NA, Colv = NA, cexCol = 0.5, cexRow = 0.75, margins = c(11, 11), col = coul)

combined_RF_RNA <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/combined_RF_otu_RNA_NOT_COMBINED.txt",
  header = TRUE, row.names = 1, check.names = FALSE
)
combined_RF_RNA_mat <- as.matrix(combined_RF_RNA)
heatmap(combined_RF_RNA_mat, scale = "row", Rowv = NA, Colv = NA, cexCol = 0.5, cexRow = 0.75, margins = c(11, 11), col = coul)

combined_RF_functions <- read.table(
  file = "/Users/francois-etiennesylvain/Documents/Doctorat/combined_RF_otu_functions_NOT_COMBINED.txt",
  header = TRUE, row.names = 1, check.names = FALSE
)
combined_RF_functions_mat <- as.matrix(combined_RF_functions)
heatmap(combined_RF_functions_mat, scale = "row", Rowv = NA, Colv = NA, cexCol = 0.5, cexRow = 0.75, margins = c(11, 11), col = coul)

#### Diversity of DNA and RNA
DNA_div <- plot_richness(new_DNA, x = "Water_color", color = "Site", measures = c("Chao1", "Shannon"))
DNA_div + geom_point(size = 3, alpha = 0.7)
RNA_div <- plot_richness(new_RNA, x = "Water_color", color = "Site", measures = c("Chao1", "Shannon"))
RNA_div + geom_point(size = 3, alpha = 0.7)

#### Barplot of relative abundance (position = "fill" rescales each bar to 1)
DNA.aglo <- tax_glom(new_DNA, taxrank = "Phylum")
DNA.dataframe <- psmelt(DNA.aglo)
ggplot(DNA.dataframe, aes(x = Water_color, y = Abundance, fill = Phylum)) +
  geom_bar(stat = "identity", position = "fill") +
  ggtitle("Taxonomic structure, Phylum Level")
RNA.aglo <- tax_glom(new_RNA, taxrank = "Phylum")
RNA.dataframe <- psmelt(RNA.aglo)
ggplot(RNA.dataframe, aes(x = Water_color, y = Abundance, fill = Phylum)) +
  geom_bar(stat = "identity", position = "fill") +
  ggtitle("Transcriptional activity, Phylum Level") #+ facet_grid(~Water_color, scale="free")

### Recompute ordisurf with RDA instead of NMDS
# ---- ordisurf, DNA (dbRDA fit `res2`) --------------------------------------
meta_new_DNA <- meta(new_DNA)
groups <- meta_new_DNA$Water_color # grouping information from the metadata
sit1 <- scores(res2, display = "sites")
df <- data.frame(x = sit1[, 1], y = sit1[, 2], Groups = groups)
# Add a dummy variable corresponding to the selected variable.
meta_new_DNA$var <- meta_new_DNA$Humic.tot # FES: maybe his format is better
# Fit a surface for the selected variable onto the ordination.
ordi <- vegan::ordisurf(res2, meta_new_DNA$var, plot = FALSE, bs = "ds")
ordi.grid <- ordi$grid # extracts the ordisurf grid
#str(ordi.grid) # it's a list though - cannot be plotted as is
ordi.mite <- expand.grid(x = ordi.grid$x, y = ordi.grid$y) # get x and ys
ordi.mite$z <- as.vector(ordi.grid$z) # unravel the matrix for the z scores
ordi.mite.na <- data.frame(na.omit(ordi.mite)) # gets rid of the NAs

# Make the plot. `position` was previously misspelled `positon`, so ggplot2
# ignored it as an unknown parameter.
p_DNA <- ggplot2::ggplot() +
  stat_contour(
    data = ordi.mite.na,
    aes(x = x, y = y, z = z, colour = ..level..),
    size = 1.8, position = "identity"
  ) # can change the binwidth depending on how many contours you want
p_DNA <- p_DNA + ggplot2::geom_point(data = df, aes(x, y, fill = Groups), pch = 21, size = 4)
p_DNA <- p_DNA + ggplot2::scale_fill_manual(values = c("gray25", "lightskyblue", "burlywood"))
p_DNA <- p_DNA + ggplot2::scale_colour_continuous(high = "red4", low = "yellow2") # here we set the high and low of the colour scale.
Can delete to go back to the standard blue, or specify others #p<-p+ ggplot2::labs(colour = paste(env.variable)) #another way to set the labels, in this case, for the colour legend p_DNA<-p_DNA+ ggplot2::theme_bw() p_DNA #ordisurf RNA meta_new_RNA = meta(new_RNA) groups <- meta_new_RNA$Water_color #get grouping information from meta data sit2 <- scores(res3, display='sites') df=data.frame(x=sit2[,1],y=sit2[,2],Groups=groups) #Add a dummy variable corrresponding to the selected variable meta_new_RNA$var <- meta_new_RNA$humic_like_DOC # FES: p-e son format a lui est meilleur #fit a surface for a selected variable onto ordination stats ordi<- vegan::ordisurf(res3,meta_new_RNA$var ,plot = FALSE, bs="ds") ordi.grid <- ordi$grid #extracts the ordisurf object #str(ordi.grid) #it's a list though - cannot be plotted as is ordi.mite <- expand.grid(x = ordi.grid$x, y = ordi.grid$y) #get x and ys ordi.mite$z <- as.vector(ordi.grid$z) #unravel the matrix for the z scores ordi.mite.na <- data.frame(na.omit(ordi.mite)) #gets rid of the nas #make the plot p_RNA<-ggplot2::ggplot()+stat_contour(data = ordi.mite.na, aes(x = x, y = y, z = z, colour = ..level..), size = 1.8, positon="identity") #can change the binwidth depending on how many contours you want p_RNA<-p_RNA+ ggplot2::geom_point(data=df,aes(x,y,fill=Groups),pch=21,size=4) p_RNA<-p_RNA+ ggplot2::scale_fill_manual(values = c("gray25", "lightskyblue", "burlywood")) p_RNA<-p_RNA+ ggplot2::scale_colour_continuous(high = "red4", low = "yellow2") #here we set the high and low of the colour scale. 
Can delete to go back to the standard blue, or specify others #p<-p+ ggplot2::labs(colour = paste(env.variable)) #another way to set the labels, in this case, for the colour legend p_RNA<-p_RNA+ ggplot2::theme_bw() p_RNA #### Make betadisper test to check which water color has the most dispersion Distances_DNA = phyloseq::distance(new_DNA, method = "bray") metadf_DNA = data.frame(sample_data(new_DNA)) disp.bray.watercolor.DNA = betadisper(Distances_DNA, metadf_DNA$Water_color) permutest(disp.bray.watercolor.DNA, pairwise=TRUE, permutations=1000) Distances_RNA = phyloseq::distance(new_RNA, method = "bray") metadf_RNA = data.frame(sample_data(new_RNA)) disp.bray.watercolor.RNA = betadisper(Distances_RNA, metadf_RNA$Water_color) permutest(disp.bray.watercolor.RNA, pairwise=TRUE, permutations=1000) Distances_functions = phyloseq::distance(functions_phyloseq, method = "bray") metadf_functions = data.frame(sample_data(functions_phyloseq)) disp.bray.watercolor.functions = betadisper(Distances_functions, metadf_functions$Water_color) permutest(disp.bray.watercolor.functions, pairwise=TRUE, permutations=1000)