---
  title: "Nucleotide_analysis"
  author: "Chao Jiang"
output:
  html_document: default
pdf_document: default
---
  
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, cache = TRUE, autodep = TRUE)
knitr::opts_knit$set(root.dir = "~/Documents/Bioinfo/DNAformal")
#asinh
```

extract taxon information for each sample and generate a table
standard format for all analyese except for heatmap is to list rows as samples, and columns as taxons
Load packages.
#Packages loading=============
```{r warning=FALSE, message=FALSE}
source("http://bioconductor.org/biocLite.R")
library(ggplot2)
library(reshape2)
library(edgeR)
require(NMF)
require(RColorBrewer)
library(plot3D)
library(scales)
library(Hmisc)
library(ggrepel)
library(vegan)
library(plotly)
library(cluster)
library(factoextra)
library(NbClust)
library(plyr)
library(dplyr)
library(psych)
library(glmnet)
library(devtools)
library(ggpubr)
library(tidyverse)
library(ade4)
library(caret)
library(e1071)
library(pROC)
library(gridExtra)
library(ggsci)
library(ggbeeswarm)
library(ggpmisc)
library(ggmap)
library(colorspace)
library(corrplot)
```


Input data, should be noted that the $taxonlevel$ variable here serves as the master control of entire analysis. etc. the level of analysis.
##dataloading=================
```{r dataloading}
options(stringsAsFactors = FALSE)
#====================================================
#this is the master control for individual analysis runs at different taxonomy level
mole.type = "DNA"
taxonlevel = "GENUS"

setwd(paste0("~/Documents/Bioinfo/", mole.type, "formal"))
file.dir = paste0(getwd(),"/tax/") #last slash is crucial
#====================================================

all.found.taxons = c()
sample.ids = c()
#getting rownames and colnames from the datasets

#taxconomy parsing function
taxonomy_parse = function(taxonlevel)
  {
if (taxonlevel == "SUPERKINGDOM"){#
  csv.files = dir(path = file.dir, pattern = "*.SUPERKINGDOM.csv")
  for(f in csv.files){
    table = read.csv(paste0(file.dir, f),stringsAsFactors=F,fill=TRUE)
    all.found.taxons = unique(c(all.found.taxons,table$taxon))
    sample.ids = unique(c(sample.ids,strsplit(f, "[.]")[[1]][1]))
  }
  
  csv.files = dir(path = file.dir, pattern = "*.KINGDOM.csv")
  for(f in csv.files){
    table = read.csv(paste0(file.dir, f),stringsAsFactors=F, fill=TRUE)
    all.found.taxons = unique(c(all.found.taxons,table$taxon))
    sample.ids = unique(c(sample.ids,strsplit(f, "[.]")[[1]][1]))
  }
  
  #======================initiate matrix
  master.m = matrix(0,nrow=length(all.found.taxons),ncol=length(sample.ids))
  frag.m = matrix(0, nrow=length(all.found.taxons), ncol=length(sample.ids))
  rownames(master.m) = all.found.taxons
  colnames(master.m) = sample.ids
  rownames(frag.m) = all.found.taxons
  colnames(frag.m) = sample.ids
  
  
  csv.files = dir(path = file.dir, pattern = "*.SUPERKINGDOM.csv")
  for(f in csv.files){
    in.df = read.csv(paste0(file.dir, f),stringsAsFactors=F)
    sampleID = strsplit(f, "[.]")[[1]][1]
    master.m[in.df$taxon,sampleID] = in.df$base_num
    frag.m[in.df$taxon, sampleID] = in.df$frag_num
  }
  
  csv.files = dir(path = file.dir, pattern = "*.KINGDOM.csv")
  for(f in csv.files){
    in.df = read.csv(paste0(file.dir, f),stringsAsFactors=F)
    sampleID = strsplit(f, "[.]")[[1]][1]
    master.m[in.df$taxon,sampleID] = in.df$base_num
    frag.m[in.df$taxon, sampleID] = in.df$frag_num
  }
}#


if (taxonlevel != "SUPERKINGDOM"){#
  csv.files = dir(path = file.dir, pattern = paste0("*.",taxonlevel,".csv"))
  for(f in csv.files){
    table = read.csv(paste0(file.dir, f),stringsAsFactors=F)
    all.found.taxons = unique(c(all.found.taxons,table$taxon))
    sample.ids = unique(c(sample.ids,strsplit(f, "[.]")[[1]][1]))
  }
  #===========initiate matrix
  master.m = matrix(0,nrow=length(all.found.taxons),ncol=length(sample.ids))
  frag.m = matrix(0, nrow=length(all.found.taxons), ncol=length(sample.ids))
  rownames(master.m) = all.found.taxons
  colnames(master.m) = sample.ids
  rownames(frag.m) = all.found.taxons
  colnames(frag.m) = sample.ids
  #Getting actual base_num from the datasets
  #Superkingdom first, kingdom second, because kingdom is a subset of superkingdom
  csv.files = dir(path = file.dir, pattern = paste0("*.",taxonlevel,".csv"))
  for(f in csv.files){
    in.df = read.csv(paste0(file.dir, f),stringsAsFactors=F)
    sampleID = strsplit(f, "[.]")[[1]][1]
    master.m[in.df$taxon,sampleID] = in.df$base_num
    frag.m[in.df$taxon, sampleID] = in.df$frag_num
  }
}#
  return(dfs = list("master.m" =master.m, "frag.m" = frag.m))
}

dfs = taxonomy_parse(taxonlevel)
master.m = dfs[["master.m"]]
frag.m = dfs[["frag.m"]]


#sort(colSums(master.m)/median(colSums(master.m))) #detecting samples with really low reads number
```


##normalization================
```{r normalization}
#clean super rare tags, for kingdom/super kingdom only. 
master.m = dfs[["master.m"]]
frag.m = dfs[["frag.m"]]
master.ori = master.m
if (FALSE){
  master.m = master.ori #reset master.m
}
if (taxonlevel == "SUPERKINGDOM"){
master.m = master.m[c('unclassified', 'Bacteria', 'Viruses', 'Archaea', 'Fungi', 'Viridiplantae', 'Metazoa'),]
}

master.m =master.m/150 #average read length, divided by total number of bases, this is only symbolic
master.m =master.m[order(rowSums(master.m), decreasing=TRUE),]  #Reorder master.m for better data parsing
frag.m = frag.m[order(rowSums(frag.m), decreasing=TRUE),]

#sum(frag.m["Annelida", ])/sum(frag.m["Annelida",] >= 1)# testing average contigs number for samples with >= 1 contig 

#filters that demands species has to have at least one sample with more than 3 fragments,  total > 100 reads, for at least one sample with more than 10 reads
#these filters are really only applicable for genus and species level data
if (taxonlevel != "VIRUSES"){
master.m = master.m[rowSums(master.m)>=100, ]
master.m = master.m[rowSums(frag.m[rownames(master.m),] >= 3) >= 1, ]
}
head(rowSums(master.m), 50)

if (taxonlevel == "VIRUSES" & mole.type == "DNA"){
master.m = master.m[rowSums(master.m)>=100, ]
DNA.virus.names = row.names(master.m)
} else if (taxonlevel =="VIRUSES" & mole.type == "RNA"){
  master.m = master.m[rowSums(master.m)>=100, ]
  RNA.virus.names = row.names(master.m)
}
#virus.names = unique(c(DNA.virus.names, RNA.virus.names))
#master.m = master.m[rowSums(master.m >= 10) >= 1, ]


#master.m.dgelist$counts is master.m being inputted
master.m.dgelist = DGEList(master.m)
#head(master.m.dgelist$samples)
#head(master.m.dgelist$counts)
#The difference between master.tmm and master.m.dgelist is that master.tmm has actual normalization factors
master.tmm = calcNormFactors(master.m.dgelist, method = "TMM") 
#At this point = master.tmm$samples$lib.size == colSums(master.m)
#At this point = master.tmm$counts = master.m

##I am not using normalized.lib.sizes here because TMM does not work well for our datasets , assumptions do not apply    
#master.cpm = cpm(master.tmm, normalized.lib.sizes = T, log=FALSE, prior.count=1)
#I am using prior.count=1 because this gets rid of the negative numbers after normalization, which is a pain for barplots
master.cpm = cpm(master.tmm, normalized.lib.sizes= FALSE, log=FALSE, prior.count=1)
master.cpm =master.cpm[order(rowSums(master.cpm), decreasing=TRUE),]
#master.cpm.log = cpm(master.tmm, normalized.lib.sizes = T, log=TRUE, prior.count=1)
master.cpm.log = cpm(master.tmm, normalized.lib.sizes = FALSE, log=TRUE, prior.count=1)

master.cpm.ori = master.cpm
master.cpm.log.ori = master.cpm.log

#asinh transformation is advantageious to log transformation and easier than chi-square transformation.
master.asinh.cpm = asinh(master.cpm)

master.chisquare.cpm = decostand(t(master.cpm), "chi.square") #at this point, the dataframe includes all controls and werid samples
master.chisquare.cpm = master.chisquare.cpm[,-which(rownames(master.cpm)=='unclassified')]  #further normalization for tSNE


#remove unclassified AFTER normalization， this step is definitely subjected to debate, however here unclassified cant be treated as a taxon rank
master.cpm = master.cpm[-which(rownames(master.cpm)=='unclassified'),] #at this point, the dataframe includes all controls and werid samples
master.cpm.log = master.cpm.log[-which(rownames(master.cpm.log)=='unclassified'),]
#master.cpm.log[master.cpm.log <=0] = 0  #this is to remove negative log number (artifacts of taking log)

#species abundance in each sample
speciesabundances = apply(master.asinh.cpm, 2, function(x){sum(x>0)})
#hist(speciesabundances)
min(speciesabundances)
max(speciesabundances)
speciesabundances[which.min(speciesabundances)]
speciesabundances[which.max(speciesabundances)]
if(mole.type == "DNA"){
  speciesabun.DNA = speciesabundances
  taxonabundance.DNA = rowSums(master.cpm)
  zerofraction.DNA = apply(master.cpm, 1, function(x) sum(x == 0))/ncol(master.cpm)
  mean.DNA = apply(master.cpm, 1, mean)
  var.DNA = apply(master.cpm, 1, var)
}
if (mole.type == "RNA"){
  speciesabun.RNA = speciesabundances
  taxonabundance.RNA = rowSums(master.cpm)
  zerofraction.RNA = apply(master.cpm, 1, function(x) sum(x == 0))/ncol(master.cpm)
  mean.RNA = apply(master.cpm, 1, mean)
  var.RNA = apply(master.cpm, 1, var)
}

#how many 0s for each taxon
hist(master.m[3,])
zerofraction = apply(master.m, 1, function(x) sum(x == 0))/ncol(master.m)
hist(zerofraction, xlab="proportion of 0s per taxon", main="zero-inflated taxon abundance data")

ggplot(data.frame(zerofraction=zerofraction), aes(x = zerofraction)) + geom_histogram() + xlab("proportions of 0s per taxon") + ggtitle(paste0(mole.type, "-", taxonlevel)) + theme_pubr()


if(FALSE){
combined.species.df = data.frame(Richness = c(speciesabun.DNA, speciesabun.RNA), Type = c(rep("DNA", length(speciesabun.DNA)), rep("RNA", length(speciesabun.RNA))))

ggdensity(combined.species.df, x = "Richness",
   #add = "median", 
   #add.params=list(color="red"),
   rug = TRUE,
   color = "Type", 
   fill = "Type",
   palette = c("#00AFBB", "#E7B800"),
   ) + labs(x=paste0(tolower(taxonlevel), " richness")) + theme_pubr(base_size = 16)

combined.abundance.df = data.frame(Abundance = c(log(taxonabundance.DNA), log(taxonabundance.RNA)), Type = c(rep("DNA", length(taxonabundance.DNA)), rep("RNA", length(taxonabundance.RNA))))

ggdensity(combined.abundance.df, 
          x = "Abundance",
          add = "median", 
          #add.params=list(color="red"),
          rug = TRUE,
          color = "Type", 
          fill = "Type",
          palette = c("#00AFBB", "#E7B800"),
          ) + labs(x=paste0("log(Total ", tolower(taxonlevel), " abundance in CPM)")) + theme_pubr(base_size = 16)

combined.zerofraction.df = data.frame(Zerofraction = c(zerofraction.DNA, zerofraction.RNA), Type = c(rep("DNA", length(zerofraction.DNA)), rep("RNA", length(zerofraction.RNA))))

ggdensity(combined.zerofraction.df, 
          x = "Zerofraction",
          add = "median", 
          #add.params=list(color="red"),
          rug = TRUE,
          color = "Type", 
          fill = "Type",
          palette = c("#00AFBB", "#E7B800")
          ) + labs(x=paste0("Zero fractions of ", tolower(taxonlevel), " in all samples")) + theme_pubr(base_size = 16)

#RNA_mean = mean.RNA, RNA_var = var.RNA, 
#mean variance relationship
ggplot(data.frame( DNA_mean = mean.DNA, DNA_var = var.DNA), aes(x = log(DNA_mean), y = log(DNA_var))) + 
  geom_point(shape = 21, color = "#00AFBB", fill = "#00AFBB", alpha = 0.3) + 
  geom_point(data = data.frame(RNA_mean = mean.RNA, RNA_var = var.RNA), aes(x = log(RNA_mean), y=log(RNA_var)), shape = 21, color = "#E7B800", fill = "#E7B800", alpha = 0.3) + 
  stat_smooth(method = "lm", formula = y ~ x, color = "#00AFBB") + 
  stat_smooth(data = data.frame(RNA_mean = mean.RNA, RNA_var = var.RNA), aes(x = log(RNA_mean), y=log(RNA_var)), method = "lm", formula = y~x, color = "#E7B800") + 
  labs( x = "log(mean of genus abundance (CPM) across samples)", y = "log(variance of genus abundance (CPM) acorss samples)") + theme_pubr(base_size = 11) 
}


ggplot(data.frame(speciesabundances = speciesabundances), aes(x=speciesabundances)) + geom_histogram() + xlab("species richness") + theme_pubr()


ggdensity(data = data.frame(abundance = log(rowSums(master.cpm), base=10)), x = "abundance",
   add = "median", 
   color = "#00AFBB",
   fill = "#00AFBB",
   rug = TRUE,
   ) + labs(x=paste0("Total ", tolower(taxonlevel), " abundance in log(CPM)")) + theme_pubr(base_size = 16) 


```


##METAdata parsing=========================================
THERE IS NO NEED TO RERUN THIS PART FOR EVERY ANALYSIS UNLESS ITS SWITCHING BETWEEN DNA and RNA
THIS PART IS VERY COMPLICATED DUE TO HISTORICAL UPDATES
```{r metadata parsing}
# Opens multiple CSV files for metadata extraction
# this uses local files from the laptop
sampletofilter = read.csv("/Users/jiangch/Documents/Bioinfo/places/lib-filter.csv", stringsAsFactors=F, fill = TRUE)
#the index of colnames(master.m) in sampletofilter$sample.old
#this is basically the names of samples translated into filter number
if (mole.type == "DNA"){
filter_list = sampletofilter$filter[match(colnames(master.m),sampletofilter$sample.old)]
batch = sampletofilter$batch[match(colnames(master.m), sampletofilter$sample.old)]
formalname = sampletofilter$sample[match(colnames(master.m), sampletofilter$sample.old)]
oldname = sampletofilter$sample.old[match(colnames(master.m), sampletofilter$sample.old)]
}
if (mole.type == "RNA"){
filter_list = sampletofilter$filter[match(colnames(master.m),sampletofilter$sample.xin.old)]
batch = sampletofilter$batch[match(colnames(master.m), sampletofilter$sample.xin.old)]
formalname = sampletofilter$sample.xin[match(colnames(master.m), sampletofilter$sample.xin.old)]
oldname = sampletofilter$sample.xin.old[match(colnames(master.m), sampletofilter$sample.xin.old)]
}
#this is get everything

filterinfo = read.csv("/Users/jiangch/Documents/Bioinfo/places/filter-informationnew_xin.csv", stringsAsFactors=F)
devicemeta = read.csv("/Users/jiangch/Documents/Bioinfo/places/device_metadata.new.csv", stringsAsFactors=F)

#importing RNA amplification results
RNAamp = read.csv("~/Documents/Bioinfo/places/RNAamp_result.csv", header = TRUE)

#importing from Lora's datasets
weather = readRDS("/Users/jiangch/Documents/Bioinfo/places/weather_events.rds")

#newest environ
environ.recent = readRDS("~/Documents/Bioinfo/places/newest_environ.rds")


#checking dates extracted from device compared to filter-information.csv
#looks like filter-information is more reliable

weather_matching = weather$X_Events[match(filter_list, weather$Filter_No)]
sick_matching = sickfilter$remarks[match(filter_list, sickfilter$Filter_No)]


#relevant filters in this dataset
filterinfo.match = join(data.frame(Filter.No.=filter_list),  filterinfo, by = "Filter.No.")
#all fake entries, caused by blank filters
#filterinfo.match[rowSums(is.na(filterinfo.match)) == 9, ] 
old.location =  filterinfo.match$comments
location = filterinfo.match$location
geo = filterinfo.match$geo
geo2 = filterinfo.match$geo2
#the owner of each filter
ownership = filterinfo.match$Patient.ID

#date related parsing
date.start = filterinfo.match$filter.start
date.start = as.Date(date.start, "%m/%d/%y")
date.end = filterinfo.match$filter.collect
date.end = as.Date(date.end, "%m/%d/%y")
#Duration of each filter, this information is not as accurate as device measured duration.
#duration = date.end - date.start
#collected duration, this is probably the most accurate estimates
devicemeta.match = join(data.frame(filter=filter_list), devicemeta)
duration = devicemeta.match$duration

#this information is used to date individual sample as of now. Unless better ways are found
date.mid = date.start + (date.end - date.start)/2

#getting particle information from the device
#There are a few filters with average particle concentration less than 0, a simple abs() is used here. potentially need more refining
particle = abs(devicemeta.match$average_particle)
median.particle = devicemeta.match$median_particle
sd.particle = devicemeta.match$sd_particle
total.particle = particle * duration

#getting temperature information for each filter from the device metadata
temperature = devicemeta.match$average_temp
median.temperature = devicemeta.match$median_temp
sd.temperature = devicemeta.match$std_temp

#getting information about humidity for each filter from the device metadata
humid = devicemeta.match$average_humid
median.humid = devicemeta.match$median_humid
sd.humid = devicemeta.match$std_humid

date.month = format(date.mid, "%m")
date.month = as.numeric(date.month)
date.day = as.numeric(format(date.mid, "%d"))

library(plyr)
#Mapping to weekdays and weekends
weekdays1 = c('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')

#assigning weekend index the difficult and precise way
#
weekend = c()
for (i in seq(1:length(filter_list))){
  if (weekdays(date.start[i]) %in% c('Thursday', 'Friday') & weekdays(date.end[i]) %in% c('Monday', "Tuesday") & (date.end[i] - date.start[i])<=5 )
  { 
    weekend = c(weekend, "weekend")
    print("ok")
    print(c(date.start[i], weekdays(date.start[i])))
    print(c(date.end[i], weekdays(date.end[i])))
    print(date.end[i] - date.start[i])
  }
  else if (is.na(date.start[i]))
  {
    weekend = c(weekend, NA)
  }
  else
  {
    weekend = c(weekend, "weekday")
    #print("nah")
  }
}

#this is the conventional definition of seasons in northern hemisphere

season = mapvalues(as.numeric(date.month),from=as.numeric(c(seq(3,12),1,2)),to=c(rep("spring",3),rep("summer",3),rep("fall",3),rep("winter",3)))

#dataframe from device
meta_data = data.frame(batch,location,geo, geo2, ownership, duration, date.month, date.start, date.end, season, date.mid, particle, sd.particle, median.particle, total.particle, temperature, median.temperature, sd.temperature, humid, sd.humid, median.humid, Filter_No=filter_list, weekend)


environ = join(meta_data, comc, by="Filter_No")
environ = join(environ, RNAamp[,1:3], by="Filter_No")

environ = environ[!duplicated(environ),] #same filter information for replicates of the same filter
environ = environ[order(environ$date.mid, environ$date.start),]
environ.mike = subset(environ, ownership=="Mike")


#detailed information of sickfilters
sickfilter = data.frame(sickfilter, date.mid = environ$date.mid[match(sickfilter$Filter_No, environ$Filter_No)], date.start = environ$STARTING_DATE[match(sickfilter$Filter_No, environ$Filter_No)])


#updating environ
environ.input = readRDS("~/Documents/Bioinfo/places/new_environ.rds")
environ.input = environ.input[order(as.Date(environ.input$date.mid, "%m/%d/%Y"), as.Date(environ.input$date.start, "%m/%d/%Y")),] #unify ordering
environ[ ,c(27:44, 49, 50)] =environ.input[ ,c(27:44, 49, 50)]
environ$X_Mean_Humidity = as.numeric(environ$X_Mean_Humidity)

#hiking days
environ$hiking = environ$Filter_No
hikingfilters = c("2","3","5","9","13","43","58","123","124","131","146","192","201","212","326","327","332","341")
environ$hiking = mapvalues(environ$hiking,from=hikingfilters,to=rep("hiking", length(hikingfilters)))
environ$hiking = mapvalues(environ$hiking, from=environ$hiking[-grep("hiking|B", environ$hiking)], to=rep("non-hiking", length(environ$hiking[-grep("hiking|B", environ$hiking)])))

#coastal-inland
input2 = read.csv("~/Documents/Bioinfo/places/environ-new-xin.csv", header = TRUE, stringsAsFactors = FALSE)
input2 = input2[order(as.Date(input2$date.mid, "%m/%d/%Y"), as.Date(input2$date.start, "%m/%d/%Y")),]#unify ordering
environ$geo3 = input2$geo3 

#annonymous ownership
environ$aownership = environ$ownership
environ$aownership = mapvalues(environ$aownership, from = c(unique(environ$ownership)), to = c("P1", "P2", "Others", "Others", "Others", "P8", "Others", "P3", "P7", "Others", "Others", "Others", "Others","P5", "P4", "P6", "Others", NA))
                                                                          

#recent updates
population = read.csv("~/Documents/Bioinfo/places/combined_data_New20171010.csv", stringsAsFactors = FALSE)
#population$population2 = as.numeric(population$population2)
#population$Population_density_people_per_sqmi = as.numeric(population$Population_density_people_per_sqmi)
air = read.csv("~/Documents/Bioinfo/places/comb.csv", stringsAsFactors = FALSE)
environ = join(environ, population, by="Filter_No")
environ = join(environ, air, by="Filter_No")

#fixes
environ$is_there_rain = as.numeric(environ$is_there_rain)

if (FALSE){
#total correlation
environ.numeric = Filter(is.numeric, environ)
environ.numeric = as.matrix(environ.numeric)

res2 = rcorr(environ.numeric, type = "pearson")

#takes output from rcorr function, adjusting p value matrix accroding to fdr principles
matrixPadjust = function(res, select = c(1:nrow(res$P))){
  print(select)
  temp.p = c(res$P)
  temp.p[is.na(temp.p)] = 0
  temp.p.adj = p.adjust(temp.p, method = "fdr")
  p.adj.matrix = matrix(temp.p.adj, nrow = ncol(res$P), ncol=ncol(res$P), dimnames = list(rownames = colnames(res$P), colnames = colnames(res$P)))
  return(p.adj.matrix)
}
adjusted.res2.p = matrixPadjust(res2)
#pls package also has a function called corrplot
corrplot::corrplot(res2$r, order = "hclust", p.mat = adjusted.res2.p, sig.level = 0.01, insig = "blank", diag = FALSE, tl.cex=0.5, addrect = 17, method="color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black")
}
```


##PCA for all===================
```{r pca analysis}
#==============PCA analysis
#log, no scale
pca = dudi.pca(t(master.cpm), scale=FALSE, scannf=FALSE, nf=3)
#pca = dudi.pca(t(Mike.master.cpm), scale=FALSE, scannf=FALSE, nf=3)
fviz_screeplot(pca, addlabels = TRUE )
summary(pca)

var.pca <- get_pca_var(pca)
head(var.pca$contrib,30) #contributions of variables to individual PC. each column sum to 100

#this method seems to be better for presentation purpose than picking from contrib=18
pca.names = c(head(rownames(var.pca$contrib)[order(var.pca$contrib[,1], decreasing = TRUE)],6), head(rownames(var.pca$contrib)[order(var.pca$contrib[,2], decreasing = TRUE)],6),head(rownames(var.pca$contrib)[order(var.pca$contrib[,3], decreasing = TRUE)],6))
pca.names =unique(pca.names)

test.names = c(head(rownames(pca$c1)[order(pca$c1$CS1)]),tail(rownames(pca$c1)[order(pca$c1$CS1)]),
               head(rownames(pca$c1)[order(pca$c1$CS2)]),tail(rownames(pca$c1)[order(pca$c1$CS2)]),
               head(rownames(pca$c1)[order(pca$c1$CS3)]),tail(rownames(pca$c1)[order(pca$c1$CS3)]))

fviz_pca_var(pca, col.var="contrib", geom=c("point","text"), select.var = list(name=test.names),
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE) +coord_fixed(sqrt(pca$eig[2] / pca$eig[1]))#plot features(loadings)
fviz_pca_var(pca, col.var="contrib", geom=c("point","text"), select.var = list(name=test.names),
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE, axes = c(1,3)) +coord_fixed(sqrt(pca$eig[3] / pca$eig[1]))#plot features(loadings)

fviz_pca_ind(pca, label = "none", pointsize=2.0, alpha.ind = 0.5,  col.ind = "contrib") +coord_fixed(sqrt(pca$eig[2] / pca$eig[1]))
```

This part is aimed at filtering data into many data frames, could consider split this part into individual analysis part

##data_filtering=================
```{r data_filtering}
scores.temp = data.frame(pca$li[,1:3],Filter_No=filter_list, samplenames=rownames(pca$li))
scores = join(scores.temp, environ, by = "Filter_No")
scores = scores[!duplicated(scores),]  #remove duplicated rows
#sort by dates
scores = scores[order(scores$date.mid, scores$date.start),] #making the ordering unique
#remove all filters without starting dates, so blanks and filter 140
scores_blank = scores[is.na(scores$date.mid),]
#this keeps all blanks
scores_blank = scores_blank[!scores_blank$Filter_No == "140",]

#scores_complete now refer to all sample filters
scores_complete = scores[!is.na(scores$date.start),]
scores_complete$season = factor(scores_complete$season, levels=c("spring", "summer", "fall","winter"))


if(mole.type == "DNA"){
scores_single_complete = scores_complete[-grep("double|I15_2|I15_3", scores_complete$samplenames, ignore.case=TRUE), ] 
} else {
scores_single_complete = scores_complete
}

scores_single_complete$campus_ornot = ifelse(scores_single_complete$location %in% c("Campus-home", "Campus", "Mike_office-background", "Mike_home-background", "Mike_background" ), "Campus", "Non-campus")

#Mike = scores[grep("Mike", scores$ownership, ignore.case = TRUE),]
Mike_single = scores_single_complete[grep("Mike", scores_single_complete$ownership, ignore.case = TRUE),]

Patient1 = scores_single_complete[grep("No1_1636-69-063", scores_single_complete$ownership,ignore.case=TRUE),]

Patient3 = scores_single_complete[grep("No3_San Mateo", scores_single_complete$ownership, ignore.case=TRUE),]

campushome = Mike_single[grep("campus|home|Mike|office|weekend|desk|outdoor", Mike_single$location, ignore.case = TRUE),]
noncampushome = Mike_single[-grep("campus|home|Mike|office|weekend|desk|outdoor", Mike_single$location, ignore.case = TRUE),]

#non-mike filters
others = scores_single_complete[-grep("Mike", scores_single_complete$ownership, ignore.case=TRUE), ]

#four people studies
fourpeople = others[others$ownership %in% c("Gw", "No3_San Mateo", "zhixin_fremont", "Guan_SF"), ]
fourpeople = fourpeople[fourpeople$date.start > as.Date("2016-11-20"), ]
mike.fourpeople = Mike_single[Mike_single$date.start > as.Date("2016-11-20") & Mike_single$date.start<as.Date("2016-12-10"),]
fourpeople= rbind(fourpeople, mike.fourpeople)
fourpeople.tibble = as_tibble(fourpeople)
fourpeople.select = fourpeople[fourpeople$ownership!="zhixin_fremont",]


#Mike north america
Mike_NAmerica = Mike_single[Mike_single$geo == "uswest" | Mike_single$geo == "useast",]

#Mike north america but not in bay area, this is intended for external validation set
Mike_NAnonbay = Mike_NAmerica[!(Mike_NAmerica$Filter_No %in% campushome$Filter_No),]

#Mike california samples but not in bay area, this is intended for external validation set
Mike_CAnonbay = Mike_NAmerica[grep("-CA", Mike_NAmerica$location),]

#master.cpms
master.cpm.single = master.cpm[, scores_single_complete$samplenames]
master.cpm.single.log = master.cpm.log[, scores_single_complete$samplenames]
master.asinh.cpm.single = master.asinh.cpm[, scores_single_complete$samplenames]

master.asinh.cpm.single.fifty = master.asinh.cpm.single[rowSums(master.asinh.cpm.single > 0) >= 50,]
master.asinh.cpm.single.hundred = master.asinh.cpm.single[rowSums(master.asinh.cpm.single > 0) >= 100,]
#saving for DNA-RNA correlations
if (mole.type == "DNA"){ 
  master.DNA.asinh.cpm.single = master.asinh.cpm.single
  master.DNA.cpm.single = master.cpm.single
  scores.DNA.single = scores_single_complete
}
if (mole.type == "RNA"){
  master.RNA.asinh.cpm.single = master.asinh.cpm.single
  master.RNA.cpm.single = master.cpm.single
  scores.RNA.single = scores_single_complete
}


#new data frame to store Mike's samples taxonomy information in timely order
Mike.master.cpm = master.cpm.single[, Mike_single$samplenames]
Mike.master.cpm.log = master.cpm.single.log[, Mike_single$samplenames]
Mike.master.asinh.cpm = master.asinh.cpm.single[, Mike_single$samplenames]

#Mike NAmerica dataset
Mike_NAmerica.master.cpm = master.cpm.single[, Mike_NAmerica$samplenames]
Mike_NAmerica.master.cpm.log = master.cpm.single.log[, Mike_NAmerica$samplenames]
Mike_NAmerica.master.asinh.cpm = master.asinh.cpm.single[, Mike_NAmerica$samplenames]

#Mike NAnonbay dataset
Mike_NAnonbay.master.cpm = master.cpm.single[, Mike_NAnonbay$samplenames]
Mike_NAnonbay.master.cpm.log = master.cpm.single.log[, Mike_NAnonbay$samplenames]
Mike_NAnonbay.master.asinh.cpm = master.asinh.cpm.single[, Mike_NAnonbay$samplenames]

#Mike CAnonbay dataset
Mike_CAnonbay.master.cpm = master.cpm.single[, Mike_CAnonbay$samplenames]
Mike_CAnonbay.master.cpm.log = master.cpm.single.log[, Mike_CAnonbay$samplenames]
Mike_CAnonbay.master.asinh.cpm = master.asinh.cpm.single[, Mike_CAnonbay$samplenames]

#new data frame to store Mike's campushome samples taxonomy information in timely order
campushome.master.cpm = master.cpm.single[, campushome$samplenames]
campushome.master.cpm.log = master.cpm.single.log[, campushome$samplenames]
campushome.master.asinh.cpm = master.asinh.cpm.single[, campushome$samplenames]

#new data frame to store Mike's non-campushome data
noncampushome.master.cpm = master.cpm.single[, noncampushome$samplenames]
noncampushome.master.cpm.log = master.cpm.single.log[, noncampushome$samplenames]
noncampushome.master.asinh.cpm = master.asinh.cpm.single[, noncampushome$samplenames]

#patient1
patient1.master.cpm = master.cpm.single[, Patient1$samplenames]
patient1.master.cpm.log = master.cpm.single.log[, Patient1$samplenames]
patient1.master.asinh.cpm = master.asinh.cpm.single[, Patient1$samplenames]

#patient3
patient3.master.cpm = master.cpm.single[, Patient3$samplenames]
patient3.master.cpm.log = master.cpm.single.log[, Patient3$samplenames]
patient3.master.asinh.cpm = master.asinh.cpm.single[, Patient3$samplenames]


#others
others.master.cpm = master.cpm.single[, others$samplenames]
others.master.cpm.log = master.cpm.single.log[, others$samplenames]
others.master.asinh.cpm = master.asinh.cpm.single[, others$samplenames]

#fourpeople studies
four.master.cpm = master.cpm.single[, fourpeople$samplenames]
four.master.cpm.log = master.cpm.single.log[, fourpeople$samplenames]
four.master.asinh.cpm = master.asinh.cpm.single[, fourpeople$samplenames]

#fourpeople select 
fourpeople.select.master.cpm = master.cpm.single[, fourpeople.select$samplenames]
fourpeople.select.master.cpm.log = master.cpm.single.log[, fourpeople.select$samplenames]
fourpeople.select.master.asinh.cpm = master.asinh.cpm.single[, fourpeople.select$samplenames]

#combing data with contextual variables(metadata), 
#should be noted that while master.cpm list samples as columns, scores_complete list samples as rows!!!!
master_combined_complete = data.frame(scores_single_complete, t(master.cpm.single))
P1_combined_complete = data.frame(Mike_single, t(Mike.master.cpm))
#chisquare transformations
master.chisquare.cpm.single = master.chisquare.cpm[scores_single_complete$samplenames,]
patient1.chisquare.cpm = master.chisquare.cpm[Patient1$samplenames,]
Mike.chisquare.cpm = master.chisquare.cpm[Mike_single$samplenames,]
#reoder season levels, this is intended for ggplot legend ordering of seasons
master_combined_complete$season = factor(master_combined_complete$season, levels=c("spring", "summer", "fall","winter"))

#for estimation of dog. cat, guinea pig abundance
sum(master.asinh.cpm.single["Canis",])/sum(master.asinh.cpm.single) *100
sum(master.asinh.cpm.single["Felis",])/sum(master.asinh.cpm.single) *100
sum(master.asinh.cpm.single["Cavia",])/sum(master.asinh.cpm.single) *100

#================================this section is for anosim analysis
if(FALSE){
test.ano = rbind(Mike.chisquare.cpm, patient1.chisquare.cpm)
test.group = c(scores_single_complete[row.names(Mike_single), "ownership"], scores_single_complete[row.names(Patient1), "ownership"])
test.result = anosim(test.ano, test.group)
test.result
}


nulc.phyla.abundance = rowSums(master.asinh.cpm.single)[-1]/sum(rowSums(master.asinh.cpm.single)[-1])
for (i in 1:20){
  print(i)
  print(sum(head(nulc.phyla.abundance, i)))}

```

#Generating timeline plot==========
```{r timeline plot}
#four owners
#creating sub-dataframe for Mike and P1
environ$season = factor(environ$season, levels=c("spring", "summer", "fall", "winter"))
#P1
environ.mike = subset(environ, ownership=="Mike")
environ.p1 = subset(environ, ownership=="No1_1636-69-063")
environ.gw = subset(environ, ownership == "Gw")
environ.zhixin = subset(environ, ownership == "zhixin_fremont")
environ.no3 = subset(environ, ownership == "No3_San Mateo")
environ.ting = subset(environ, ownership == "Wang Ting")
environ.xin = subset(environ, ownership == "Xin")
environ.sf = subset(environ, ownership == "Guan_SF")
environ.relevant = subset(environ, ownership %in% c("Mike","No1_1636-69-063","Gw","zhixin_fremont" ,"No3_San Mateo", "Wang Ting","Xin","Guan_SF"))
environ.others = subset(environ, !(ownership %in% c("Mike","No1_1636-69-063","Gw","zhixin_fremont" ,"No3_San Mateo", "Wang Ting","Xin","Guan_SF")) & !is.na(ownership))


#multiplot of timeline
pointsize = 5 #control size of points
color.choice = "season"
start.date = as.Date("2014-06-10")


#timeline
timeline = ggplot(environ.mike, aes(x=date.mid, y=0, color=get(color.choice)))
timeline + geom_point(data = environ.mike, shape=20, size=pointsize, alpha=0.5) + 
  annotate("text", x = start.date, y = 0, label = "P1 n=201", color= "#00A087FF", fontface=2) +
  
  geom_point(data= environ.p1, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.1, color=get(color.choice))) +
  annotate("text", x = start.date, y = 0.1, label = paste0("P2 n=", dim(environ.p1)[1]), color= "#3C5488FF", fontface=2) +
  
    geom_point(data= environ.no3, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.2, color=get(color.choice))) +
  annotate("text", x = start.date, y = 0.2, label = paste0("P3 n=", dim(environ.no3)[1]), color ="#E64B35FF",fontface=2) +
  
  
  geom_point(data= environ.zhixin, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.3, color=get(color.choice))) +
    annotate("text", x = start.date, y = 0.3, label = paste0("P4 n=", dim(environ.zhixin)[1])) +
  
  geom_point(data= environ.gw, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.4, color=get(color.choice))) +
  annotate("text", x = start.date, y = 0.4, label = paste0("P5 n=", dim(environ.gw)[1])) +
  
  geom_point(data= environ.sf, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.5, color=get(color.choice))) +
  annotate("text", x = start.date, y = 0.5, label = paste0("P6 n=", dim(environ.sf)[1])) +  
  geom_point(data= environ.ting, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.6, color=get(color.choice))) +
  annotate("text", x = start.date, y = 0.6, label = paste0("P7 n=", dim(environ.ting)[1])) + 
  geom_point(data= environ.xin, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.7, color=get(color.choice))) +
  annotate("text", x = start.date, y = 0.7, label = paste0("P8 n=", dim(environ.xin)[1])) + 
  geom_point(data= environ.others, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.8, color=get(color.choice))) +
  annotate("text", x = as.Date("2014-06-25"), y = 0.8, label = paste0("Others n=", dim(environ.others)[1])) + 
  
  annotate("text", x = as.Date("2014-06-20"), y = -0.1, label = paste0("Total N=", nrow(environ.others)+nrow(environ.relevant)))+
  scale_x_date(limits = c(as.Date("2014-05-20"), as.Date("2017-01-01")), labels=date_format("%Y-%m"), date_breaks = "3 months") +
 labs(color="Season", x="Date", y = "Individual")+
  ylim(c(-0.2,0.9))+
scale_color_manual(values=c("spring" = "#00A087FF", "summer" = "#4DBBD5FF", "fall" = "#E64B35FF", "winter" = "#3C5488FF")) + 
  theme_pubr(base_size = 16, x.text.angle = 30, margin = FALSE) +
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank(), 
        axis.text.x = element_text(size = 12))


#geom_hline(yintercept=-0.00) +
```


#overall PCA plots, non-normalized
```{r overal PCA plots, non-normalized}
#Mike.master.asinh.cpm
pca = dudi.pca(t(master.cpm.single), scale=FALSE, scannf=FALSE, nf=3)
fviz_screeplot(pca, addlabels = TRUE )
summary(pca)

var.pca <- get_pca_var(pca)
head(var.pca$contrib,30) #contributions of variables to individual PC. each column sum to 100

#this method seems to be better for presentation purpose than picking from contrib=18

test.names = c(head(rownames(pca$c1)[order(pca$c1$CS1)]),tail(rownames(pca$c1)[order(pca$c1$CS1)]),
               head(rownames(pca$c1)[order(pca$c1$CS2)]),tail(rownames(pca$c1)[order(pca$c1$CS2)]),
               head(rownames(pca$c1)[order(pca$c1$CS3)]),tail(rownames(pca$c1)[order(pca$c1$CS3)]))
test.names =unique(test.names)

fviz_pca_var(pca, col.var="contrib", geom=c("point","text"), select.var = list(name=test.names),
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE) +coord_fixed(sqrt(pca$eig[2] / pca$eig[1]))#plot features(loadings)
fviz_pca_var(pca, col.var="contrib", geom=c("point","text"), select.var = list(name=test.names),
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE, axes = c(1,3)) +coord_fixed(sqrt(pca$eig[3] / pca$eig[1]))#plot features(loadings)

fviz_pca_ind(pca, label = "none", pointsize=2.0, alpha.ind = 0.5,  col.ind = "contrib") +coord_fixed(sqrt(pca$eig[2] / pca$eig[1]))

fviz_pca_ind(pca, label = "none", axes = c(1,3), pointsize=2.0, alpha.ind = 0.5,  col.ind = "contrib") +coord_fixed(sqrt(pca$eig[3] / pca$eig[1]))

bi.plot = fviz_pca_biplot(pca, 
                # Individuals
                geom.ind = c("point"),
                geom.var = c("arrow", "text"),
                #fill.ind = ok$geo2, 
                #col.ind = fourpeople.select$ownership,
                select.var= list(name = test.names),
                pointshape = 21, pointsize = 5,
                palette = "npg",
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.3 ,
                alpha.ind = 0.7,
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"),
                label.size = 5,
                #addEllipses = TRUE,
                #ellipse.type = "euclid",
                #ellipse.level = 0.60,
                invisible = "quali",
                #mean.point = FALSE
                )+
  labs(fill = "Geo", color = "Contrib", alpha = "Contrib")  +
    coord_fixed(sqrt(pca$eig[2] / pca$eig[1])) +
  theme_pubr(base_size = 14) 
print(bi.plot)
```


##overall DNA-RNA correlation
```{r overall DNA-RNA correlation}
if (mole.type == "DNA"){ 
  master.DNA.asinh.cpm.single = master.asinh.cpm.single
  master.DNA.cpm.single = master.cpm.single
  scores.DNA.single = scores_single_complete
  master.DNA.asinh.cpm.single.copy = master.asinh.cpm.single
}
if (mole.type == "RNA"){
  master.RNA.asinh.cpm.single = master.asinh.cpm.single
  master.RNA.cpm.single = master.cpm.single
  scores.RNA.single = scores_single_complete
  master.RNA.asinh.cpm.single.copy = master.asinh.cpm.single
}
#save copies for furth uses

#resetting
master.DNA.asinh.cpm.single = master.DNA.asinh.cpm.single.copy
master.RNA.asinh.cpm.single = master.RNA.asinh.cpm.single.copy

if ( taxonlevel == "PHYLUM"){
master.DNA.asinh.cpm.select = master.DNA.asinh.cpm.single[2:21,]
master.RNA.asinh.cpm.select = master.RNA.asinh.cpm.single[rownames(master.DNA.asinh.cpm.single)[2:21],]
newdnanames = paste0("DNA_",rownames(master.DNA.asinh.cpm.select))
newrnanames = paste0("RNA_", rownames(master.RNA.asinh.cpm.select))
rownames(master.DNA.asinh.cpm.select) = newdnanames
rownames(master.RNA.asinh.cpm.select) = newrnanames

overallres = rcorr(t(master.DNA.asinh.cpm.select[, ]), t(master.RNA.asinh.cpm.select[, ]), type="spearman")

m = corrplot::corrplot(overallres$r[1:20,21:40],  p.mat = overallres$P[1:20,21:40], sig.level = 0.05, insig = "blank", tl.cex=0.8, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1))

m = corrplot::corrplot(overallres$r,  p.mat = overallres$P, sig.level = 0.05, insig = "blank", tl.cex=0.6, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1))

}

if (taxonlevel == "SUPERKINGDOM"){
master.DNA.asinh.cpm.single = master.DNA.asinh.cpm.single[order(rownames(master.DNA.asinh.cpm.single)), ]
master.RNA.asinh.cpm.single = master.RNA.asinh.cpm.single[order(rownames(master.RNA.asinh.cpm.single)), ]
master.DNA.asinh.cpm.single = master.DNA.asinh.cpm.single[-5,]
master.RNA.asinh.cpm.single = master.RNA.asinh.cpm.single[-5,]
newdnanames = paste0("DNA_",rownames(master.DNA.asinh.cpm.single))
newrnanames = paste0("RNA_", rownames(master.RNA.asinh.cpm.single))
rownames(master.DNA.asinh.cpm.single) = newdnanames
rownames(master.RNA.asinh.cpm.single) = newrnanames


overallres = rcorr(t(master.DNA.asinh.cpm.single[, ]), t(master.RNA.asinh.cpm.single[, ]), type="spearman")
#overallres = rcorr(t(scale(master.DNA.asinh.cpm.single[, ])), scale(t(master.RNA.asinh.cpm.single[, ])), type="spearman")

m = corrplot::corrplot(overallres$r[1:6,7:12],  p.mat = overallres$P[1:6,7:12], sig.level = 0.05, insig = "blank", tl.cex=1, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1))

m = corrplot::corrplot(overallres$r,  p.mat = overallres$P, sig.level = 0.05, insig = "blank", tl.cex=0.6, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1))
}


#different way of visulization

test.vector = c(overallres$r[1,8], overallres$r[2,9], overallres$r[3,10], overallres$r[4,11], overallres$r[6, 13], overallres$r[7,14])

p.vector = c(overallres$P[1,8], 1e-20, overallres$P[3,10], overallres$P[4,11], overallres$P[6, 13], overallres$P[7,14])
ggplot(data.frame(name = sort(rownames(master.asinh.cpm.single))[-5], cor = test.vector, p = p.vector), aes(x = name, y=cor, fill = name,  size=-log(p, base =10))) + geom_point(shape = 21) + theme_pubr(base_size = 18)+ rremove("x.text") + rremove("xlab") + scale_fill_d3() + labs(fill = "Domain", size = "-log(p)")

```

##correlation matrix======================correlation matrix + p values for taxon labels
```{r correlation matrix of taxons}
#at least 5 samples have more than 100 CPM
#for genus
keep=seq(1:nrow(master.asinh.cpm.single))
if (taxonlevel == "GENUS" | taxonlevel == "SPECIES" | taxonlevel == "FAMILY" | taxonlevel == "ORDER"){
  correlation.df = master.asinh.cpm.single[rowSums(master.asinh.cpm.single >0) >= 283*0.1,]
  correlation.df = correlation.df[apply(correlation.df, 1, var)>2,]
  keep = rownames(correlation.df)
} else if (taxonlevel == "PHYLUM"){
  correlation.df = master.asinh.cpm.single[rowSums(master.asinh.cpm.single >0) > 0,]
  correlation.df = correlation.df[apply(correlation.df, 1, var)>1,]
  keep = rownames(correlation.df)
}


#master.temp = master.cpm.single[keep,]
master.temp = master.asinh.cpm.single[keep,]
master.temp = master.temp[order(rowSums(master.temp), decreasing = TRUE),]
res2 = rcorr(t(master.temp), type="pearson")
p.adj.matrix2 = matrixPadjust(res2)

#correlations threshold set at 0.7
#res2$r = res2$r[order(rowSums(res2$r>0.7, na.rm=TRUE), decreasing = TRUE), ]
#select = rowSums(res2$r>0.7, na.rm=TRUE) > 2

#m = corrplot::corrplot(res2$r[select, select], order = "hclust", p.mat = p.adj.matrix2[select, select], sig.level = 0.05,  insig = "blank", tl.cex=0.5, diag = FALSE, addrect = 5, method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1))

#clustered
m = corrplot::corrplot(res2$r[keep,keep], order = "hclust", p.mat = res2$P[keep,keep], sig.level = 0.01,  insig = "blank", tl.cex=0.6, diag = FALSE, addrect = 8, method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black")

#not clustered
m = corrplot::corrplot(res2$r[keep,keep], p.mat = res2$P[keep,keep], sig.level = 0.01,  insig = "blank", tl.cex=0.5, diag = FALSE, method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black")

#pdf(paste0("./pdfs/",taxonlevel,"_correlationtable.pdf"))
#orrplot(res2$r, order = "hclust", p.mat = res2$P, sig.level = 0.05, insig = "blank", tl.cex=0.8)
#dev.off()
```

##formal plotting of environmental variables
```{r env plotting}

ggplot(scores_single_complete, aes(x=date.month, y=scale(Mean_TemperatureC), color=date.month)) + stat_smooth(aes(group=1, color = "red")) + 
  stat_smooth(aes(y=scale(as.numeric(X_Mean_Humidity)), color = "blue")) +
  stat_smooth(aes(y=scale(particle), color = "green")) +
  stat_smooth(aes(y=scale(duration), color = "yellow")) +
  theme_pubr(base_size = 16) + labs(x = "Month", y = "Scaled Values", color = "Category") + 
  scale_x_discrete(limits=c(1:12), expand=c(0,0)) + 
  scale_color_d3(labels = c("Humidity", "Particle", "Temperature", "Duration"))

```


#PCA plot by seasons for small number of samples
```{r PCA by seasons etc}
#define function for plotting
pcasnapshot = function(ok, text.size=4, basesize =14){
  colfunc<-colorRampPalette(c("red","yellow","springgreen","royalblue"))
  #print(str(x))
  #print(ok$location)
 # print(ok[, "geo2"])
  #print(class(ok))
  #this is only for testing
  ok = scores.snap
  #text.size = 2
  #ok = scores.snap
  
  ok = data.frame(ok)
  master.cpm.snap = master.cpm.single[, ok$samplenames]
  pcares = dudi.pca(t(asinh(master.cpm.snap)), scale=FALSE,scannf = FALSE, nf = 3)
  .e <- environment()
  
  scree = fviz_screeplot(pcares, addlabels = TRUE )
  print(scree)
   ind.plot.clear = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$location,  pointsize=4, pointshape=21, palette="npg", alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",)  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1]))  + theme_pubr(margin = FALSE, base_size = basesize) +scale_fill_d3(palette = "category20") +  labs(fill = "Location", color = "Location")
  ind.plot.clear$plot_env = .e
  print(ind.plot.clear)
  
     ind.plot.clear.add = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$location,  pointsize=5, pointshape=21, alpha.ind = 0.8, addEllipses =TRUE, ellipse.level = 0.68,invisible = "quali",)  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1]))  + theme_pubr(margin = FALSE, base_size = basesize) +        scale_fill_d3(palette = "category20")+ 
       scale_color_d3(palette = "category20")+
       labs(fill = "Location", color = "Location")
  ind.plot.clear.add$plot_env = .e
  print(ind.plot.clear.add)
  
  #special plot for paper
  special.color = c("#4DBBD5FF", "#E64B35FF", "#3C5488FF", "#4DBBD5FF","#3C5488FF", "#3C5488FF", "#E64B35FF", 
                    "#4DBBD5FF",  "#E64B35FF","#4DBBD5FF", "#3C5488FF", "#3C5488FF", "#E64B35FF", "#4DBBD5FF",
                    "#E64B35FF", "#3C5488FF", "#3C5488FF" )
  in.plot.clear.path = c()
  if(FALSE){
  basesize=14
  text.size=5
   ind.plot.clear.path = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$location,  pointsize=5, pointshape=21, alpha.ind = 0.8, addEllipses =TRUE, ellipse.level = 0.75,invisible = "quali", ellipse.alpha=0.05)  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1]))  + theme_pubr(margin = FALSE, base_size = basesize) +        scale_fill_d3(palette = "category20")+ 
       scale_color_d3(palette = "category20")+
       labs(fill = "Location", color = "Location") +
        geom_segment(aes(xend=c(tail(pcares$li$Axis1, n=-1), NA), 
                  yend=c(tail(pcares$li$Axis2, n=-1), NA)),
                  arrow=arrow(angle = 15,length=unit(0.5,"cm"), type="closed"),
                  linetype="longdash", size =1.2 ,alpha=0.7, color = special.color)
  ind.plot.clear.path$plot_env = .e
  print(ind.plot.clear.path)
  }
  
  ind.plot = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$geo2,  pointsize=4, pointshape=21, palette="npg", alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",)  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1]))  +  geom_text_repel(aes(label = ok$location), size=text.size) + theme_pubr(margin = FALSE, base_size = basesize) +scale_fill_d3(palette = "category20")+ labs(fill = "Location")
  ind.plot$plot_env = .e
  print(ind.plot)

ind.plot.2 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$geo2,  pointsize=4, pointshape=21, palette="npg", alpha.ind = 0.6, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",)  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +  geom_text(aes(label = ok$season), size=text.size) + theme_pubr(margin = FALSE, base_size = basesize) +scale_fill_d3(palette = "category20")+ labs(fill = "Geo")
  ind.plot.2$plot_env = .e
  print(ind.plot.2)
  
ind.plot.3 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$geo2,  pointsize=4, pointshape=21, palette="npg", alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",)  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +  geom_text_repel(aes(label = ok$samplenames), size=text.size) + theme_pubr(margin = FALSE, base_size = basesize) +scale_fill_d3(palette = "category20")+ labs(fill = "Geo")
 ind.plot.3$plot_env = .e
  print(ind.plot.3)

ind.plot.4 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$location,  pointsize=4, pointshape=21, palette="npg", alpha.ind = 0.6, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",)  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +  geom_text_repel(aes(label = as.character(ok$date.start)), size=text.size) + theme_pubr(margin = FALSE, base_size = basesize) +scale_fill_d3(palette = "category20")+ labs(fill = "Geo")
 ind.plot.4$plot_env = .e
  print(ind.plot.4)  

ind.plot.5 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$season,  pointsize=4, pointshape=21, palette="npg", alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",)  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +  geom_text(aes(label = as.character(ok$geo2)), size=text.size) + theme_pubr(margin = FALSE, base_size = basesize) + labs(fill = "Season")
 ind.plot.5$plot_env = .e
  print(ind.plot.5) 
  
ind.plot.6 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = as.character(ok$batch),  pointsize=4, pointshape=21,  alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",)  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +  geom_text(aes(label = ok$batch), size=text.size) + 
  scale_fill_d3(palette = "category20")+
  theme_pubr(margin = FALSE, base_size = basesize) 
 ind.plot.6$plot_env = .e
  print(ind.plot.6)  

ind.plot.7 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = as.character(ok$location),  pointsize=4, pointshape=21,  alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",)  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +   
  scale_fill_d3(palette = "category20")+ 
  theme_pubr(margin = FALSE, base_size = basesize) +
  labs(fill = "Location")
 ind.plot.7$plot_env = .e
  print(ind.plot.7)    

ind.plot.8 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = as.character(ok$location),  pointsize=4, pointshape=21,  alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",)  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +   
  scale_fill_d3(palette = "category20")+  geom_text_repel(aes(label = ok$location), size=text.size) +
  theme_pubr(margin = FALSE, base_size = basesize) +
  labs(fill = "Location")
 ind.plot.8$plot_env = .e
  print(ind.plot.8)      
    
chem.special = c("Tricholoma", "Trametes", "Serpula","Psychrobacter", "Fomitopsis", "Pseudogymnoascus")
  names.features = c(head(rownames(pcares$c1)[order(pcares$c1$CS1)],10),tail(rownames(pcares$c1)[order(pcares$c1$CS1)],10),
               head(rownames(pcares$c1)[order(pcares$c1$CS2)],7),tail(rownames(pcares$c1)[order(pcares$c1$CS2)],7)
               )
names.features = unique(names.features)
gradient.color = c("#3C5488FF","#4DBBD5FF","#E64B35FF")
#varplots
  var = get_pca_var(pcares)
  #another way of picking names, based on contributions to axes
  var$contrib = var$contrib[order(var$contrib[,1]^2,var$contrib[,2]^2, var$contrib[,3]^2, decreasing = TRUE),]
  #names.features = row.names(var$contrib[1:50,])
  mm = corrplot::corrplot(var$contrib[1:50,], is.corr=FALSE, cl.align.text = "l")
  contrib = fviz_contrib(pcares, choice = "var", axes = 1:3, top = 100,ggtheme = theme_pubr(base_size = 8))
  var.plot = fviz_pca_var(pcares,  
             geom=c("text", "point"),
             select.var= list(name=c(names.features)), 
             col.var="contrib",
             labelsize = text.size+1,
             #gradient.cols = "npg",
             gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"),
             repel = TRUE) + theme_pubr(margin = FALSE, base_size = basesize+4)#plot features(loadings)  
   var.plot$plot_env = .e
  print(var.plot)
  
names.biplot = c(head(rownames(pcares$c1)[order(pcares$c1$CS1)],4),tail(rownames(pcares$c1)[order(pcares$c1$CS1)],8),
               head(rownames(pcares$c1)[order(pcares$c1$CS2)],3),tail(rownames(pcares$c1)[order(pcares$c1$CS2)],6)
               )
#biplot
  bi.plot = fviz_pca_biplot(pcares, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = ok$geo2, 
                #col.ind = fourpeople.select$ownership,
                select.var= list(name = c(names.biplot,chem.special)),
                pointshape = 21, pointsize = 5,
                palette = "npg",
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.3 ,
                alpha.ind = 0.7,
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = gradient.color,
                label.size = 5,
                #addEllipses = TRUE,
                #ellipse.type = "euclid",
                #ellipse.level = 0.60,
                invisible = "quali",
                #mean.point = FALSE
                )+
  labs(fill = "Geo", color = "Contrib", alpha = "Contrib")  + 
    scale_fill_d3(palette = "category20") +
    coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +  
 #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4)  + 
    #geom_text_repel(aes(label = as.character(ok$location)), size=4)  + 
 geom_segment(aes(xend=c(tail(pcares$li$Axis1, n=-1), NA), 
                  yend=c(tail(pcares$li$Axis2, n=-1), NA),
                  ),
                  arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"),
                  linetype=1, size=1, alpha=0.7, color = "steelblue")+
  theme_pubr(margin = FALSE, base_size = basesize) 
   bi.plot$plot_env = .e
  print(bi.plot)

  #biplot
  bi.plot2 = fviz_pca_biplot(pcares, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = ok$geo2, 
                #col.ind = ok$season,
                select.var= list(name = c(names.biplot,chem.special)),
                pointshape = 21, pointsize = 5,
                palette = "npg",
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.2 ,
                alpha.ind = 0.7,
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = gradient.color,
                label.size = 5,
                #addEllipses = TRUE,
                #ellipse.type = "euclid",
                #ellipse.level = 0.60,
                invisible = "quali",
                #mean.point = FALSE
                )+
  labs(fill = "Geo", color = "Contrib", alpha = "Contrib")  + 
    scale_fill_d3(palette = "category20") +
    coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +  
 #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4)  + 
  theme_pubr(margin = FALSE, base_size = basesize) 
   bi.plot2$plot_env = .e
  print(bi.plot2)
  
  bi.plot3 = fviz_pca_biplot(pcares, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = ok$location, 
                #col.ind = ok$season,
                select.var= list(name = c(names.biplot,chem.special)),
                pointshape = 21, pointsize = 5,
                palette = "npg",
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.2 , 
                alpha.ind = 0.7,
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = gradient.color,
                label.size = 5,
                addEllipses = TRUE,
                #ellipse.type = "euclid",
                ellipse.level = 0.8,
                invisible = "quali",
                #mean.point = FALSE
                )+
  labs(fill = "Location", color = "Contrib", alpha = "Contrib")  + coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +  scale_fill_d3(palette = "category20")+
 #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4)  + 
  theme_pubr(margin = FALSE, base_size = basesize) 
   bi.plot3$plot_env = .e
  print(bi.plot3)
  
#full spectrum arrows
colfunc<-colorRampPalette(c("red","yellow","green","royalblue"))
#sublt spectrum
#colfunc = colorRampPalette(rainbow_hcl(4))
#simplified arrows
#colfunc<-colorRampPalette(c("#E64B35FF","#00A087FF", "#3C5488FF"))
    
    bi.plot4 = fviz_pca_biplot(pcares, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = ok$location, 
                #col.ind = fourpeople.select$ownership,
                select.var= list(name = c(names.biplot,chem.special)),
                pointshape = 21, pointsize = 5,
                palette = "npg",
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.2 ,
                alpha.ind = 0.7,
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = gradient.color,
                label.size = 5,
                #addEllipses = TRUE,
                #ellipse.type = "euclid",
                #ellipse.level = 0.60,
                invisible = "quali",
                #mean.point = FALSE
                )+
  labs(fill = "Geo", color = "Contrib", alpha = "Contrib")  + coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +  scale_fill_d3(palette = "category20")+  
 #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4)  + 
    #geom_text_repel(aes(label = as.character(ok$location)), size=4)  + 
 geom_segment(aes(xend=c(tail(pcares$li$Axis1, n=-1), NA), 
                  yend=c(tail(pcares$li$Axis2, n=-1), NA),
                  ),
                  arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"),
                  linetype=1, size=1, alpha=0.5, color = colfunc(length(pcares$li$Axis1)))+
  theme_pubr(margin = FALSE, base_size = basesize) 
   bi.plot4$plot_env = .e
  print(bi.plot4)
  
  
plots = list(scree=scree, ind.plot.clear=ind.plot.clear, ind.plot.clear.add = ind.plot.clear.add, ind.plot.clear.path  = ind.plot.clear.path, ind.plot=ind.plot, ind.plot.2=ind.plot.2, ind.plot.3=ind.plot.3, ind.plot.4=ind.plot.4, ind.plot.5=ind.plot.5, ind.plot.6=ind.plot.6, ind.plot.7=ind.plot.7, ind.plot.8 = ind.plot.8, var.plot=var.plot, bi.plot=bi.plot, bi.plot2=bi.plot2, bi.plot3=bi.plot3, bi.plot4=bi.plot4, contrib=contrib)

l = mget(names(plots))
#ggsave("~/Documents/Bioinfo/DNAformal/arrange2x2.pdf", marrangeGrob(grobs = l, nrow=1, ncol=1), height = 8, width = 8)  
return(plots)
}

#this function is not good with too many locations
scores.snap = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2016-06-01", "%Y-%m-%d" ) & date.start < as.Date("2016-08-01", "%Y-%m-%d" ))

scores.snap = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2016-06-14", "%Y-%m-%d" ) & date.start < as.Date("2016-08-14", "%Y-%m-%d" ))

scores.testing.snap = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2016-05-01", "%Y-%m-%d" ) & date.start < as.Date("2016-09-01", "%Y-%m-%d" ))

#purely testing
scores.snap = subset(scores_single_complete, geo2 %in% c("Asia", "Europe", "Northeast", "oceania", "Southeast", "Southwest"))
                     #== "Mike" & date.start >= as.Date("2015-12-01", "%Y-%m-%d" ) & date.start < as.Date("2016-05-01", "%Y-%m-%d" ))
#for mike
#2015-07-01 to 2015-08-01
#2016-02-01 to 2016-04-01
#2016-01-01 to 2017-01-01 !
#2016-06-01 to 2016-08-01 !
#2015-12-01 to 2016-05-01 !

plots = pcasnapshot(scores.snap, text.size=5)

plots.p1 = pcasnapshot(Patient1, text.size=5)

plots.chemi = pcasnapshot(scores_chemi, text.size=5)
```

 ##PCA for all samples2, plotting function for more samples
```{r PCA for all samples2}
pcaall = function(ok, text.size=5, base.size=14){
  #ok = scores.snap
  #text.size = 5
  #ok = scores_single_complete
  
  ok = data.frame(ok)
  master.cpm.snap = master.cpm.single[, ok$samplenames]
  pcares = dudi.pca(t(asinh(master.cpm.snap)), scale=FALSE,scannf = FALSE, nf = 3)
  .e <- environment()
  
  scree = fviz_screeplot(pcares, addlabels = TRUE )
  print(scree)
   ind.plot.geo = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$geo2,  pointsize=3, pointshape=21, palette="lancet", alpha.ind = 0.6, addEllipses =FALSE, ellipse.level = 0.68 ,invisible = "quali")  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1]))  + scale_fill_d3(palette = c("category20")) + theme_pubr(margin = FALSE, base_size = base.size+2)+
  labs(fill = "Geo", color = "Geo")
  ind.plot.geo$plot_env = .e
  print(ind.plot.geo)
  
#this part has bug when there is only one geographic location  
  ind.plot.geo.add = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$geo2,  pointsize=3, pointshape=21, palette="lancet", alpha.ind = 0.6, addEllipses =FALSE, ellipse.level = 0.68 ,invisible = "quali")  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1]))  + 
    scale_color_d3(palette = c("category20")) + 
    scale_fill_d3(palette = c("category20")) + 
    theme_pubr(margin = FALSE, base_size = base.size+2) +
    labs(fill = "Geo", color = "Geo")
  ind.plot.geo.add$plot_env = .e
  print(ind.plot.geo.add)
  
  
  ind.plot.season = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$season,  pointsize=3, pointshape=21, palette=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), alpha.ind = 0.6, addEllipses = FALSE, ellipse.level = 0.68,invisible = "quali")  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + theme_pubr(margin = FALSE, base_size = base.size+2) + scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))+
    labs(fill = "Season", color = "Season")
  ind.plot.season$plot_env = .e
  print(ind.plot.season)
  
    ind.plot.season.add = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$season,  pointsize=3, pointshape=21, palette=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), alpha.ind = 0.6, addEllipses = TRUE, ellipse.level = 0.68,invisible = "quali")  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + theme_pubr(margin = FALSE, base_size = base.size) + scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) +
    labs(fill = "Season", color = "Season")
  ind.plot.season.add$plot_env = .e
  print(ind.plot.season.add)
  
ind.plot.2 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = as.character(ok$batch),  pointsize=3, pointshape=21,  alpha.ind = 0.6, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",)  +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +  geom_text(aes(label = ok$batch), size=text.size) + 
  scale_fill_d3(palette = "category20")+
  theme_pubr(margin = FALSE, base_size = base.size+2) +
    labs(fill = "Batch", color = "Batch")
 ind.plot.2$plot_env = .e
  print(ind.plot.2)  
    
chem.special = c("Tricholoma", "Trametes", "Serpula","Psychrobacter", "Fomitopsis", "Pseudogymnoascus")
  names.features = c(head(rownames(pcares$c1)[order(pcares$c1$CS1)],20),tail(rownames(pcares$c1)[order(pcares$c1$CS1)],20),
    head(rownames(pcares$c1)[order(pcares$c1$CS2)],7),tail(rownames(pcares$c1)[order(pcares$c1$CS2)],7)
               )
names.features = unique(names.features)

gradient.color = c("#3C5488FF","#4DBBD5FF","#E64B35FF")
#varplots
  var = get_pca_var(pcares)
  #another way of picking names, based on contributions to axes
  var$contrib = var$contrib[order(var$contrib[,1]^2,var$contrib[,2]^2, var$contrib[,3]^2, decreasing = TRUE),]
  #names.features = row.names(var$contrib[1:50,])
  mm = corrplot::corrplot(var$contrib[1:50,], is.corr=FALSE, cl.align.text = "l")
  contrib = fviz_contrib(pcares, choice = "var", axes = 1:3, top = 100,ggtheme = theme_pubr(base_size = 6))
  print(contrib)
  var.plot = fviz_pca_var(pcares,  
             geom=c("text","arrow"),
             select.var= list(name=c(names.features)), 
             col.var="contrib",
             labelsize = text.size,
             #gradient.cols = "npg",
             gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"),
             repel = TRUE)  + theme_pubr(margin = FALSE, base_size = base.size-4)#plot features(loadings)  
   var.plot$plot_env = .e
  print(var.plot)
   var.plot2 = fviz_pca_var(pcares,  
             geom=c("text"),
             select.var= list(name=c(names.features)), 
             col.var="contrib",
             labelsize = text.size,
             #gradient.cols = "npg",
             gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"),
             repel = TRUE)  + theme_pubr(margin = FALSE, base_size = base.size-4)#plot features(loadings)  
   var.plot2$plot_env = .e
  print(var.plot2)
  
  
names.biplot = c(head(rownames(pcares$c1)[order(pcares$c1$CS1)],4),tail(rownames(pcares$c1)[order(pcares$c1$CS1)],8),
               head(rownames(pcares$c1)[order(pcares$c1$CS2)],3),tail(rownames(pcares$c1)[order(pcares$c1$CS2)],6)
               )
#biplot
  bi.plot = fviz_pca_biplot(pcares, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = ok$geo2, 
                #col.ind = fourpeople.select$ownership,
                select.var= list(name = c(names.biplot,chem.special)),
                pointshape = 21, pointsize = 5,
                palette = "npg",
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.3 ,
                alpha.ind = 0.7,
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = gradient.color,
                label.size = 5,
                #addEllipses = TRUE,
                #ellipse.type = "euclid",
                #ellipse.level = 0.60,
                invisible = "quali",
                #mean.point = FALSE
                )+
    scale_fill_d3(palette = c("category20")) + 
  labs(fill = "Geo", color = "Contrib", alpha = "Contrib")  + coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +  
  theme_pubr(base_size = base.size-2) 
   bi.plot$plot_env = .e
  print(bi.plot)

  #biplot
  bi.plot2 = fviz_pca_biplot(pcares, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = ok$season, 
                #col.ind = ok$season,
                select.var= list(name = c(names.biplot,chem.special)),
                pointshape = 21, pointsize = 5,
                palette = c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"),
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.3 ,
                alpha.ind = 0.7,
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = gradient.color,
                label.size = 5,
                #addEllipses = TRUE,
                #ellipse.type = "euclid",
                #ellipse.level = 0.60,
                invisible = "quali",
                #mean.point = FALSE
                )+
  labs(fill = "Season", color = "Contrib", alpha = "Contrib")  + coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) +  scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) +
 #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4)  + 
  theme_pubr(base_size = base.size) 
   bi.plot2$plot_env = .e
  print(bi.plot2)
  
  
plots = list(scree=scree, 
             ind.plot.geo=ind.plot.geo, 
             ind.plot.geo.add=ind.plot.geo.add,
             ind.plot.season = ind.plot.season,
             ind.plot.season.add = ind.plot.season.add,
             ind.plot.2=ind.plot.2,
             var.plot=var.plot, 
             var.plot2=var.plot2,
             contrib = contrib,
             bi.plot=bi.plot, 
             bi.plot2=bi.plot2)

l = mget(names(plots))
#ggsave("~/Documents/Bioinfo/DNAformal/arrange2x2.pdf", marrangeGrob(grobs = l, nrow=1, ncol=1), height = 8, width = 8)  
return(plots)
}


plots.all = pcaall(scores_single_complete, text.size=5, base.size = 14)

plots.mike = pcaall(Mike_single, text.size=4, base.size = 14)

#disable 2nd plot add.ellipse
plots.campushome = pcaall(campushome, text.size=4, base.size =14)

plots.noncampushome = pcaall(noncampushome, text.size=4, base.size =14)

#special interest
P2.select = subset(Patient1, location == "P2_work_home" | location == "Campus" | location == "Monterey-CA")
plots.p1 = pcaall(P2.select, text.size=5, base.size =14)

plots.p3 = pcaall(Patient3, text.size=5, base.size=14)

plots.scores.snap = pcaall(scores.snap, text.size=5, base.size =14)

plots.chemi2 = pcaall(scores_chemi, text.size=5, base.size=14)
```


##near - not near location profile comparison
```{r near-not near}

distsnapshot = function(profile, text.size=4){
  profile = data.frame(profile)
  master.cpm.snap = master.cpm.single[, profile$samplenames]
  bray.dist = vegdist(t(asinh(master.cpm.snap)), na.rm = TRUE, method= "bray")
  bray.dist.matrix = as.matrix(bray.dist)
  rownames(bray.dist.matrix) = profile$samplenames
  colnames(bray.dist.matrix) = profile$samplenames
  similar.dist = c()
  dissimilar.dist = c()
  #this is only for mike
  simterms = c("Campus-home", "Campus", "Hiking_weekend","Hiking _weekend","Mike_office-background", "Mike_home-background", "Mike_background" )
  for (i in 1:(nrow(profile)-1)){
    if (profile$location[i] %in% simterms & profile$location[i+1] %in% simterms | profile$location[i] == profile$location[i+1]){
    #if (profile$location[i] == profile$location[i+1]){
      similar.dist = c(similar.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[i+1]])
    } else if (profile$location[i] != profile$location[i+1]){
      dissimilar.dist = c(dissimilar.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[i+1]])
    } else { print(paste0(profile$location[i]," ", profile$location[i+1]))}
  }
  
  
  same.season.dist = c()
  different.season.dist = c()
  control = 0
  for (i in 1:(nrow(profile)-1)){
    for (j in (i+1):nrow(profile)){
      control = control + 1
      if (profile$season[i] == profile$season[j] ){
        same.season.dist = c(same.season.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
      } else if (profile$season[i] != profile$season[j] & abs(profile$date.start[i] - profile$date.start[j]) >= 14){
        different.season.dist = c(different.season.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
      }
    }
  }
  print(control)
  return(list(similar.dist = similar.dist, 
              dissimilar.dist = dissimilar.dist,     location.wilcox=wilcox.test(similar.dist, dissimilar.dist),
              same.season.dist = same.season.dist,
              different.season.dist = different.season.dist,
              season.wilcox = wilcox.test(same.season.dist, different.season.dist)))
}

scores.snap = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2017-06-01", "%Y-%m-%d" ))
dist.result = distsnapshot(profile = scores.snap)    

#less time to get better results for seasonal?
#lets do it year by year 
scores.snap.one = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2014-08-01", "%Y-%m-%d" ) & date.start < as.Date("2015-08-01", "%Y-%m-%d" ))
dist.result = distsnapshot(profile = scores.snap.one)   

scores.snap.one = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2015-08-01", "%Y-%m-%d" ) & date.start < as.Date("2016-08-01", "%Y-%m-%d" ))
dist.result = distsnapshot(profile = scores.snap.one)   

#Patient 1
scores.snap.one = subset(scores_single_complete, ownership == "No1_1636-69-063")
dist.result = distsnapshot(profile = scores.snap.one)   


dist.location.df = data.frame(distance = as.numeric(c(dist.result$similar.dist, dist.result$dissimilar.dist)), group = c(rep("Same location", length(dist.result$similar.dist)), rep("Different locations", length(dist.result$dissimilar.dist))))

#this is statistically significant, but not very cool to look at
dist.season.df = data.frame(distance = as.numeric(c(dist.result$same.season.dist, dist.result$different.season.dist)), group = c(rep("Same season", length(dist.result$same.season.dist)), rep("Different season", length(dist.result$different.season.dist))))


ggdensity(dist.location.df, x = "distance",
   add = "median", 
   #add.params=list(color="red"),
   rug = TRUE,
   color = "group", 
   fill = "group",
   palette = c("#00AFBB", "#E7B800"),
   ) + labs(x="Bray-Curtis Distance") + theme_pubr(base_size = 16)
ggsave("~/Documents/Bioinfo/DNAformal/plots/location.pdf", width=6, height=4)                                                                                          

ggdensity(dist.season.df, x = "distance",
   add = "median", 
   #add.params=list(color="red"),
   rug = TRUE,
   color = "group", 
   fill = "group",
   palette = c("#00AFBB", "#E7B800"),
   ) + labs(x="Bray-Curtis Distance") + theme_pubr(base_size = 16)
ggsave("~/Documents/Bioinfo/DNAformal/plots/season.pdf", width=6, height=4)      


#ggplot(data.frame(distance = as.numeric(c(dist.result$similar.dist, dist.result$dissimilar.dist)), group = c(rep("Nearby locations", length(dist.result$similar.dist)), rep("Distant locations", length(dist.result$dissimilar.dist)))), aes(x=distance, color=group, fill=group)) + geom_density(alpha=0.4) + scale_color_npg()+ theme_pubr() 
 
    
```
##campushome vs noncampushome
```{r campushome vs non-campushome}
campushomesnapshot = function(profile, text.size=5, pick=1){
  #profile = scores.snap.mike
  profile = data.frame(profile)
  master.cpm.snap = master.cpm.single[, profile$samplenames]
  bray.dist = vegdist(t(asinh(master.cpm.snap)), na.rm = TRUE, method= "bray")
  bray.dist.matrix = as.matrix(bray.dist)
  rownames(bray.dist.matrix) = profile$samplenames
  colnames(bray.dist.matrix) = profile$samplenames
  campushome.dist = c()
  dissimilar.dist = c()
  noncampushome.dist = c()
  #this is only for mike
  simterms = c("Campus-home", "Campus", "Hiking_weekend","Hiking _weekend","Mike_office-background", "Mike_home-background", "Mike_background" )
  for (i in 1:(nrow(profile)-1)){
        for (j in (i+1):nrow(profile)){
          if ((profile$location[i] %in% simterms & profile$location[j] %in% simterms) | profile$location[i] == profile$location[j]){
    #if (profile$location[i] == profile$location[i+1]){
          campushome.dist = c(campushome.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
        } else if (profile$location[i] != profile$location[j] & (profile$location[i] %in% simterms | profile$location[j] %in% simterms)){
          dissimilar.dist = c(dissimilar.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
        } else {}
          if (!(profile$location[i] %in% simterms) & !(profile$location[j] %in% simterms) & profile$location[i] != profile$location[j]){
            noncampushome.dist = c(noncampushome.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
          }
        }
  }
  
  
  seasons = c("spring","summer", "fall", "winter")
  seasonpick = seasons[pick]
  same.season.dist = c()
  different.season.dist = c()
  control = 0
  for (i in 1:(nrow(profile)-1)){
    for (j in (i+1):nrow(profile)){
      control = control + 1
      #skipping fall because it sucks
      if (profile$season[i] == profile$season[j] & profile$season[i] == seasonpick ){
        same.season.dist = c(same.season.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
      } else if (profile$season[i] != profile$season[j] & abs(profile$date.start[i] - profile$date.start[j]) >= 0 & (profile$season[i] == seasonpick | profile$season[j] == seasonpick)){
        #print(paste0(profile$season[i],"_", profile$season[j]))
        different.season.dist = c(different.season.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
      }
    }
  }
  
  
  same.season.all.dist = c()
  different.season.all.dist = c()
  for (i in 1:(nrow(profile)-1)){
    for (j in (i+1):nrow(profile)){
      control = control + 1
      #skipping fall because it sucks
      if (profile$season[i] == profile$season[j]){
        same.season.all.dist = c(same.season.all.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
      } else if (profile$season[i] != profile$season[j] & abs(profile$date.start[i] - profile$date.start[j]) >= 14 ){
        #print(paste0(profile$season[i],"_", profile$season[j]))
        different.season.all.dist = c(different.season.all.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
      }
    }
  }
  
  
  #print(control)                                                                                                                                                                                                                                                                             
  return(list(campushome.dist = campushome.dist,
              noncampushome.dist = noncampushome.dist,
              dissimilar.dist = dissimilar.dist,     
              location.wilcox=wilcox.test(campushome.dist, dissimilar.dist),
              location2.wilcox = wilcox.test(campushome.dist, noncampushome.dist),
              location3.wilcox = wilcox.test(noncampushome.dist, dissimilar.dist),
              same.season.dist = same.season.dist,
              different.season.dist = different.season.dist,
              same.season.all.dist = same.season.all.dist,
              different.season.all.dist = different.season.all.dist,
              season.wilcox = wilcox.test(same.season.dist, different.season.dist),
              season.all.wilcox = wilcox.test(same.season.all.dist, different.season.all.dist)
              ))
}

#removes season functions, for smallter time-frames
campushomesimp = function(profile, text.size=5, pick=1){
  #profile = scores.snap.mike
  profile = data.frame(profile)
  master.cpm.snap = master.cpm.single[, profile$samplenames]
  bray.dist = vegdist(t(asinh(master.cpm.snap)), na.rm = TRUE, method= "bray")
  bray.dist.matrix = as.matrix(bray.dist)
  rownames(bray.dist.matrix) = profile$samplenames
  colnames(bray.dist.matrix) = profile$samplenames
  campushome.dist = c()
  dissimilar.dist = c()
  noncampushome.dist = c()
  #this is only for mike
  simterms = c("Campus-home", "Campus", "Hiking_weekend","Hiking _weekend","Mike_office-background", "Mike_home-background", "Mike_background" )
  for (i in 1:(nrow(profile)-1)){
        for (j in (i+1):nrow(profile)){
          if ((profile$location[i] %in% simterms & profile$location[j] %in% simterms) | profile$location[i] == profile$location[j]){
    #if (profile$location[i] == profile$location[i+1]){
          campushome.dist = c(campushome.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
        } else if (profile$location[i] != profile$location[j] & (profile$location[i] %in% simterms | profile$location[j] %in% simterms)){
          dissimilar.dist = c(dissimilar.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
        } else {}
          if (!(profile$location[i] %in% simterms) & !(profile$location[j] %in% simterms) & profile$location[i] != profile$location[j]){
            noncampushome.dist = c(noncampushome.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
          }
        }
  }
  
  return(list(campushome.dist = campushome.dist,
              noncampushome.dist = noncampushome.dist,
              dissimilar.dist = dissimilar.dist,     
              location.wilcox=wilcox.test(campushome.dist, dissimilar.dist),
              location2.wilcox = wilcox.test(campushome.dist, noncampushome.dist),
              location3.wilcox = wilcox.test(noncampushome.dist, dissimilar.dist)
              ))
}

#all time
scores.snap.mike = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2017-06-01", "%Y-%m-%d" ))
dist.result = campushomesnapshot(profile = scores.snap.mike, pick = 3)    

#less time to get better results for seasonal?
#lets do it year by year, it seems 2nd year's data is better
scores.snap.one = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2014-08-01", "%Y-%m-%d" ) & date.start < as.Date("2015-08-01", "%Y-%m-%d" ))
dist.result = campushomesnapshot(profile = scores.snap.one) 

scores.snap.one = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2015-08-01", "%Y-%m-%d" ) & date.start < as.Date("2016-08-01", "%Y-%m-%d" ))
dist.result = campushomesnapshot(profile = scores.snap.one, pick=4)   
####

#cherry picking time.frame
scores.snap.one = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2016-06-01", "%Y-%m-%d" ) & date.start < as.Date("2016-08-01", "%Y-%m-%d" ))
dist.result = campushomesimp(profile = scores.snap.one)   

#Patient 1
scores.snap.one = subset(scores_single_complete, ownership == "No1_1636-69-063")
dist.result = campushomesnapshot(profile = scores.snap.one[,], pick=4)

#purely testing, only campushome, trying to show seasonal effects
scores.snap.campushome = subset(campushome, ownership == "Mike" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2017-02-01", "%Y-%m-%d" ))

#distances based on four seasons
fourseasons = function(profile){
  #profile = scores.snap.campushome
  profile = scores_chemi
  seasons = factor(unique(profile$season), levels = c("spring", "summer", "fall", "winter"))
dist.objects = list()
  if ("spring" %in% profile$season){
  dist.result.spring = campushomesnapshot(profile = profile,pick=1)
  dist.objects[["spring"]] = dist.result.spring
  }
  if ("summer" %in% profile$season){  
  dist.result.summer = campushomesnapshot(profile = profile,pick=2)
  dist.objects[["summer"]] = dist.result.summer       
  }
  if ("fall" %in% profile$season){
  dist.result.fall = campushomesnapshot(profile = profile,pick=3)
  dist.objects[["fall"]] = dist.result.fall
  }
  if ("winter" %in% profile$season){
    dist.result.winter = campushomesnapshot(profile = profile,pick=4)
    dist.objects[["winter"]] = dist.result.winter
  }
#dist.objects = list(spring = dist.result.spring, summer = dist.result.summer, fall = dist.result.fall, winter = dist.result.winter)
#dist.objects = list(spring = dist.result.spring, fall = dist.result.fall, winter = dist.result.winter) #for Patient1


#seasons = c("spring",  "fall", "winter") #for Patient1
distance = c()
group = c()
season = c()
dist.season.df = data.frame(distance = as.numeric(), group = as.character(), season = as.character())
for ( i in 1:length(seasons)){
  #print(length(object$same.season.dist))
  #print(length(object$different.season.dist))
  print(seasons[i])
  print(dist.objects[[seasons[i]]]$season.wilcox)
  dist.season.df = rbind.data.frame(dist.season.df, 
                                    data.frame(distance = as.numeric(c(dist.objects[[get(seasons[i])]]$same.season.dist, dist.objects[[get(seasons[i])]]$different.season.dist)), 
                                    group = c(rep("Same season", length(dist.objects[[get(seasons[i])]]$same.season.dist)), rep("Different season", length(dist.objects[[get(seasons[i])]]$different.season.dist))), 
                                    season = rep(seasons[i], length(dist.objects[[get(seasons[i])]]$same.season.dist)+length(dist.objects[[get(seasons[i])]]$different.season.dist))))
}
dist.season.df$group = factor(dist.season.df$group, levels = c("Same season", "Different season"))
dist.season.df$season = factor(dist.season.df$season, levels = c("spring","summer", "fall", "winter"))
return(dist.season.df)
}
dist.fourseasons.df = fourseasons(scores.snap.campushome)
dist.fourseasons.df = fourseasons(Patient1[-30,])
dist.fourseasons.df = fourseasons(scores.snap.mike)
dist.fourseasons.df = fourseasons(scores_single_complete)
dist.fourseasons.df = fourseasons(scores_chemi)

ggplot(dist.fourseasons.df, aes(x=season, y=distance)) + geom_boxplot(aes(fill=group), position=position_dodge(0.6), width=0.5) +
  #geom_beeswarm(aes(group=group), dodge.width=0.5,  size=0.1, alpha=0.5)+
  scale_fill_manual(values=c("#00AFBB", "#E7B800", "#3C5488FF")) +
  #scale_color_manual(values=c("#00AFBB", "#E7B800", "#3C5488FF")) +
  labs(y="Bray-Curtis distance") +
  theme_pubr(base_size = 16, legend = "bottom") 


dist.location.df = data.frame(distance = as.numeric(c(dist.result$campushome.dist, dist.result$dissimilar.dist)), group = c(rep("Within Campus", length(dist.result$campushome.dist)), rep("Campus vs Others", length(dist.result$dissimilar.dist))))

dist.location.df = data.frame(distance = as.numeric(c(dist.result$campushome.dist, dist.result$dissimilar.dist, dist.result$noncampushome.dist)), group = c(rep("Within Campus", length(dist.result$campushome.dist)), rep("Campus vs others", length(dist.result$dissimilar.dist)), rep("Within others", length(dist.result$noncampushome.dist))))

#this is statistically significant, but not very cool to look at as density plot, boxplot + beeswarm is probably a better choice
dist.season.df = data.frame(distance = as.numeric(c(dist.result$same.season.dist, dist.result$different.season.dist)), group = c(rep("Same season", length(dist.result$same.season.dist)), rep("Different season", length(dist.result$different.season.dist))))
dist.season.df$group = factor(dist.season.df$group, levels=c("Same season", "Different season"))


ggdensity(dist.location.df, x = "distance",
   add = "median", 
   #add.params=list(color="red"),
   rug = TRUE,
   color = "group", 
   fill = "group",
   palette = c("#00AFBB", "#E7B800", "#3C5488FF"),
   ) + labs(x="Bray-Curtis Distance") + theme_pubr(base_size = 16) 
ggsave("~/Documents/Bioinfo/DNAformal/plots/location.pdf", width=7, height=5)                                                                                          

ggdensity(dist.season.df, x = "distance",
   add = "median", 
   #add.params=list(color="red"),
   rug = TRUE,
   color = "group", 
   fill = "group",
   palette = c("#00AFBB", "#E7B800"),
   ) + labs(x="Bray-Curtis Distance") + theme_pubr(base_size = 16)
ggsave("~/Documents/Bioinfo/DNAformal/plots/season.pdf", width=6, height=4)      

#testing

ggplot(dist.season.df, aes(x=group, y=distance, color=group, fill=group)) + geom_boxplot(alpha=0.4) + 
  #geom_beeswarm(alpha=0.3, size=0.02) + 
  theme_pubr() + scale_color_npg()

```
##P1 per genus campus vs non-campus comparisons
```{r genus campus vs non-campus}
master.asinh.cpm.single.fifty = master.asinh.cpm.single[rowSums(master.asinh.cpm.single > 0) >= 50,]
#limited to p1
genus.campusornot = cbind.data.frame(t(master.asinh.cpm.single.fifty[, Mike_single$samplenames]), campus_ornot = Mike_single$campus_ornot)

genus.campusornot.melt = melt(genus.campusornot, id.vars = "campus_ornot")
genus.campusornot.result = compare_means( formula = value ~ campus_ornot, data = genus.campusornot.melt,  group.by = "variable", method = "wilcox.test", p.adjust.method = "fdr")


genus.campusornot.sig = genus.campusornot[, c(genus.campusornot.result$p.adj < 0.05, TRUE)]
genus.campusornot.result.sig = genus.campusornot.result[genus.campusornot.result$p.adj < 0.05,]
genus.campusornot.result.sig$variable = as.character(genus.campusornot.result.sig$variable)
genus.campusornot.result.sig = genus.campusornot.result.sig[order(genus.campusornot.result.sig$p.adj),]
#rank names by inverse order of p.adjust, first one is the smallest
genus.select = as.character(genus.campusornot.result.sig$variable)
genus.special.interests = c("Ophiognomonia","Cladosporium", "Preussia", "Penicillium", "Aspergillus", "Gymnopus", "Betula", "Triticum", "Canis", "Felis", "Streptococcus", "Corynebacterium", "Staphylococcus", "Geobacillus", "Segetibacter")

genus.campusornot.sig.melt = melt(genus.campusornot.sig[, as.character(c(genus.select, "campus_ornot"))], id.vars = "campus_ornot")

#pick and plot
genus.campusornot.sig.melt = melt(genus.campusornot.sig[,as.character(c(genus.special.interests, "campus_ornot"))], id.vars = "campus_ornot")


#formal plots
ggplot(genus.campusornot.sig.melt, aes(x = campus_ornot , y = value, color=campus_ornot, fill=campus_ornot)) + 
  geom_boxplot(size=0.6, alpha=0.5)+ 
  geom_quasirandom(size=0.6, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ 
  #geom_beeswarm(size=0.6, alpha=0.3, color = "black", fill="black", cex=2, width=0.1)+ #beeswarm has issues with 0s
  stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.4, show.legend = FALSE) +
  theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ scale_color_manual(values = rev(c("#00AFBB", "#E7B800"))) +
  scale_fill_manual(values = rev(c("#00AFBB", "#E7B800")))+
  theme(strip.background = element_blank()) +
  rremove("x.text") +
  labs(color = "Location", fill="Location") +
  facet_wrap(~variable, scales="free_y", ncol=5)
   #expa

chisq.test(x=rbind(c(73/2,73/2),c(57,16)))  #fungi phyla distribution test


#plotting all p values
genus.campusornot.df = genus.campusornot.sig[, as.character(c(genus.select, "campus_ornot"))]    
mediandiff= apply(genus.campusornot.df[, 1:100], 2, function(x){mean(x[genus.campusornot.df$campus_ornot == "Campus"]) - mean(x[genus.campusornot.df$campus_ornot != "Campus"])}) #mean difference, to know which one is higher
genus.campusornot.result.sig$variable = factor(genus.campusornot.result.sig$variable, levels = c(genus.campusornot.result.sig$variable))

ggplot(cbind.data.frame(genus.campusornot.result.sig, mdiff = mediandiff), aes(x = variable, y=p.adj, color=mdiff)) + geom_point() + theme_pubr(base_size=16, x.text.angle = 60) + theme(axis.text.x = element_text(size = 6)) + scale_color_gradient2(low = muted("red"), mid = "white",
  high = muted("blue"), midpoint = 0, space = "Lab",
  na.value = "grey50", guide = "colourbar")
                                
```

##P2 and P3 per genus home vs non-home comparisons
```{r genus home vs non-home}
master.asinh.cpm.single.fifty = master.asinh.cpm.single[rowSums(master.asinh.cpm.single > 0) >= 50,]
#limited to p2, no significant stuff were detected
Patient1.meta = Patient1
Patient1.meta$home = ifelse(Patient1$location == "P2_work_home", "home", "non-home")
genus.homeornot = cbind.data.frame(t(master.asinh.cpm.single.fifty[, Patient1.meta$samplenames]), home_ornot = Patient1.meta$home)

genus.homeornot.melt = melt(genus.campusornot, id.vars = "home_ornot")
genus.homeornot.result = compare_means( formula = value ~ home_ornot, data = genus.homeornot.melt,  group.by = "variable", method = "wilcox.test", p.adjust.method = "fdr")


genus.homeornot.sig = genus.homeornot[, c(genus.homeornot.result$p.adj < 0.05, TRUE)]
genus.homeornot.result.sig = genus.homeornot.result[genus.homeornot.result$p.adj < 0.05,]
genus.homeornot.result.sig$variable = as.character(genus.homeornot.result.sig$variable)
genus.homeornot.result.sig = genus.homeornot.result.sig[order(genus.homeornot.result.sig$p.adj),]
#rank names by inverse order of p.adjust, first one is the smallest
genus.select = as.character(genus.homeornot.result.sig$variable)
genus.special.interests = c("Ophiognomonia","Cladosporium", "Preussia", "Penicillium", "Aspergillus", "Gymnopus", "Betula", "Triticum", "Canis", "Felis", "Streptococcus", "Corynebacterium", "Staphylococcus", "Geobacillus", "Segetibacter")

genus.homeornot.sig.melt = melt(genus.homeornot.sig[, as.character(c(genus.select, "home_ornot"))], id.vars = "home_ornot")

#pick and plot
genus.homeornot.sig.melt = melt(genus.homeornot.sig[,as.character(c(genus.special.interests, "home_ornot"))], id.vars = "home_ornot")


#formal plots
ggplot(genus.homeornot.melt, aes(x = home_ornot , y = value, color=home_ornot, fill=home_ornot)) + 
  geom_boxplot(size=0.6, alpha=0.5)+ 
  geom_quasirandom(size=0.6, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ 
  #geom_beeswarm(size=0.6, alpha=0.3, color = "black", fill="black", cex=2, width=0.1)+ #beeswarm has issues with 0s
  stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.4, show.legend = FALSE) +
  theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ scale_color_manual(values = rev(c("#00AFBB", "#E7B800"))) +
  scale_fill_manual(values = rev(c("#00AFBB", "#E7B800")))+
  theme(strip.background = element_blank()) +
  rremove("x.text") +
  labs(color = "Location", fill="Location") +
  facet_wrap(~variable, scales="free_y", ncol=5)
   #expa

ggplot(genus.homeornot, aes(x = home_ornot , y = Raoultella, color=home_ornot, fill=home_ornot, label = Patient1.meta$location)) + 
  geom_boxplot(size=0.6, alpha=0.5)+ 
  geom_quasirandom(size=0.6, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ 
  geom_text_repel()+
  #geom_beeswarm(size=0.6, alpha=0.3, color = "black", fill="black", cex=2, width=0.1)+ #beeswarm has issues with 0s
  stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.4, show.legend = FALSE) +
  theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ scale_color_manual(values = rev(c("#00AFBB", "#E7B800"))) +
  scale_fill_manual(values = rev(c("#00AFBB", "#E7B800")))+
  theme(strip.background = element_blank()) +
  rremove("x.text") +
  labs(color = "Location", fill="Location")


#plotting all p values
genus.campusornot.df = genus.campusornot.sig[, as.character(c(genus.select, "campus_ornot"))]    
mediandiff= apply(genus.campusornot.df[, 1:100], 2, function(x){mean(x[genus.campusornot.df$campus_ornot == "Campus"]) - mean(x[genus.campusornot.df$campus_ornot != "Campus"])}) #mean difference, to know which one is higher
genus.campusornot.result.sig$variable = factor(genus.campusornot.result.sig$variable, levels = c(genus.campusornot.result.sig$variable))

ggplot(cbind.data.frame(genus.campusornot.result.sig, mdiff = mediandiff), aes(x = variable, y=p.adj, color=mdiff)) + geom_point() + theme_pubr(base_size=16, x.text.angle = 60) + theme(axis.text.x = element_text(size = 6)) + scale_color_gradient2(low = muted("red"), mid = "white",
  high = muted("blue"), midpoint = 0, space = "Lab",
  na.value = "grey50", guide = "colourbar")
                                
```

#Four people focus========================
```{r four people PCA plotting}
re = dudi.pca(t(four.master.asinh.cpm[-1, fourpeople.select$samplenames]), scale=FALSE, scannf = FALSE, nf=3)
fviz_screeplot(re, addlabels = TRUE )
get_eig(re) #eigenvalues 
#flipping y if necessary
re$li$Axis2 = -re$li$Axis2
re$c1$CS2 = -re$c1$CS2
re$co$Comp2 = -re$co$Comp2
re$l1$RS2 = -re$l1$RS2

gravity.df = cbind(re$li, fourpeople.select$ownership)
fourpeople.select$ownership = mapvalues(fourpeople.select$ownership, c("Gw" , "No3_San Mateo", "Guan_SF", "Mike"), c("P5", "P3", "P6", "P1"))
fourpeople.select$ownership = factor(fourpeople.select$ownership, levels = c("P1", "P3", "P5", "P6"))


fviz_pca_ind(re, label="none", fill.ind = fourpeople.select$ownership,  pointsize=3, pointshape=21, palette=c("#00A087FF","#E64B35FF","#4DBBD5FF","#3C5488FF"), alpha.ind = 0.8, addEllipses =TRUE, ellipse.level = 0.58) + theme(legend.position = "bottom")+ coord_fixed(sqrt(re$eig[2] / re$eig[1])) + theme_pubr(margin = FALSE, base_size = 16)  + geom_text_repel(aes(label = fourpeople.select$batch))

fviz_pca_ind(re, label="none", axes = c(1,3), fill.ind = fourpeople.select$ownership,  pointsize=3, pointshape=21, palette=c("#00A087FF","#E64B35FF","#4DBBD5FF","#3C5488FF"), alpha.ind = 0.8, addEllipses =TRUE, ellipse.level = 0.58) + theme(legend.position = "bottom")+ coord_fixed(sqrt(re$eig[2] / re$eig[1])) + theme_pubr(margin = FALSE, base_size = 16) #+ geom_text_repel(aes(label = fourpeople.select$location))


fviz_pca_ind(re, label="none", fill.ind = fourpeople.select$ownership,  pointsize=3, pointshape=21, palette=c("#00A087FF","#E64B35FF","#4DBBD5FF","#3C5488FF"), alpha.ind = 0.8, addEllipses =TRUE, ellipse.level = 0.58) + theme(legend.position = "bottom")+ coord_fixed(sqrt(re$eig[2] / re$eig[1])) + theme_pubr(margin = FALSE, base_size = 16) + geom_text_repel(aes(label = fourpeople.select$location), size =4)

fviz_pca_ind(re, label="none", fill.ind = fourpeople.select$ownership,  pointsize=3, pointshape=21, palette=c("#00A087FF","#E64B35FF","#4DBBD5FF","#3C5488FF"), alpha.ind = 0.8, addEllipses =TRUE, ellipse.level = 0.58) + theme(legend.position = "bottom")+ coord_fixed(sqrt(re$eig[2] / re$eig[1])) + labs(fill = "Person") + theme_pubr(margin = FALSE, base_size = 16) #+ geom_text_repel(aes(label = fourpeople.select$location))


#loadings plot
var <- get_pca_var(re)
ind = get_pca_ind(re)
head(var$contrib,15) #contributions
#select top 15 contributions from axis1 and axis2
names = c(head(rownames(var$contrib)[order(var$contrib[,1], decreasing = TRUE)],15), head(rownames(var$contrib)[order(var$contrib[,2], decreasing = TRUE)],7),head(rownames(var$contrib)[order(var$contrib[,3], decreasing = TRUE)],7))

names =unique(names)
#equivalent to contribution of variables
names = c(head(rownames(re$c1)[order(re$c1$CS1^2, decreasing = TRUE)],10), head(rownames(re$c1)[order(re$c1$CS2^2, decreasing = TRUE)],10), head(rownames(re$c1)[order(re$c1$CS3^2, decreasing = TRUE)],10))
names = unique(names)

#this shows the variable plot better than based on other metrics
names = c(head(rownames(re$c1)[order(re$c1$CS1)],8),tail(rownames(re$c1)[order(re$c1$CS1)],8),
          head(rownames(re$c1)[order(re$c1$CS2)],4),tail(rownames(re$c1)[order(re$c1$CS2)],4))
names = unique(names)
#select.var = list(contrib=30) will choose the top 30 contributing varibles automatically, it is however unclear how it was calcuated
fviz_pca_var(re, select.var= list(name=names), geom=c("point", "text"),col.var="contrib",
             gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"),
             repel = TRUE, labelsize = 5) + 
            #coord_fixed(sqrt(re$eig[2] / re$eig[1])) + 
            theme_pubr(base_size = 16, legend = "top") #plot features(loadings)


fviz_pca_var(re, select.var= list(name=names), geom=c("point","arrow", "text"), col.var="contrib",
             gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"),
             repel = TRUE, axes=c(1,3)) #plot features(loadings)

#biplot prep
#biplot.names = c(head(rownames(var$contrib)[order(var$contrib[,1], decreasing = TRUE)],10), #head(rownames(var$contrib)[order(var$contrib[,2], decreasing = TRUE)],10))
#biplot.names = unique(biplot.names)

#same as above
biplot.names = c(head(rownames(re$c1)[order(re$c1$CS1)],5),tail(rownames(re$c1)[order(re$c1$CS1)],5),
               head(rownames(re$c1)[order(re$c1$CS2)],3),tail(rownames(re$c1)[order(re$c1$CS2)],3),
               head(rownames(re$c1)[order(re$c1$CS3)],2),tail(rownames(re$c1)[order(re$c1$CS3)],2)
               )


###special plot for locations
fviz_pca_biplot(re, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = fourpeople.select$location, 
                #col.ind = fourpeople.select$ownership,
                select.var= list(name = names),
                pointshape = 21, pointsize = 3,
                palette = "npg",
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.2 , 
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"),
                label.size = 4,
                addEllipses = TRUE,
                ellipse.type = "euclid",
                ellipse.level = 0.60,
                invisible = "quali",
                )+
  labs(fill = "Locations", color = "Contribution", alpha = "Contribution")  + theme(legend.position = "bottom")# Change legend title

#+ scale_shape_manual(values=c(18,17,16,15)) 


#in development
fviz_pca_biplot(re, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = fourpeople.select$ownership, 
                #col.ind = fourpeople.select$ownership,
                select.var= list(name = names),
                pointshape = 21, pointsize = 3,
                palette = c("#00A087FF","#E64B35FF","#4DBBD5FF","#3C5488FF"),
                axes = c(1,3),
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.2 , 
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"),
                label.size = 4,
                #addEllipses = TRUE,
                ellipse.type = "euclid",
                ellipse.level = 0.60
                #invisible = "quali"
                )+
  labs(fill = "Person", color = "Contribution", alpha = "Contribution")  + theme_pubr(base_size = 14, margin = FALSE) # Change legend title


###other plots concerning these individuals, using the habillage arugment would cause issues for gradient colors
fviz_pca_biplot(re, geom.ind =c("point"), geom.var=c("point", "text"), select.var= list(name = names), alpha.var ="contrib", col.var = "contrib", fill.ind = fourpeople.select$ownership, repel=TRUE, palette="npg",pointsize=2, pointshape=21, gradient.cols = "RdBu") + theme(legend.position = "bottom") #+ scale_shape_manual(values=c(18,17,16,15)) 

fviz_pca_biplot(re, label = "var", select.var= list(name = names),alpha.var ="contrib", habillage = fourpeople.select$ownership, repel=TRUE, palette="npg", addEllipses =TRUE, ellipse.level = 0.68) + scale_shape_manual(values=c(18,17,16,15)) + theme(legend.position = "bottom") + theme(legend.position = "bottom")

fviz_pca_biplot(re, axes=c(1,3),label = "var", select.var= list(name = names), alpha.var ="contrib", habillage = fourpeople.select$ownership, repel=TRUE, palette="npg", pointsize = 2) + scale_shape_manual(values=c(18,17,16,15)) + theme(legend.position = "bottom")

fviz_pca_biplot(re, axes=c(1,3),label = "var", select.var= list(name = names), alpha.var ="contrib", habillage = fourpeople.select$ownership, repel=TRUE, palette="npg", addEllipses =TRUE, ellipse.level = 0.68) + scale_shape_manual(values=c(18,17,16,15)) + theme(legend.position = "bottom")


#fviz_pca_ind(re, label="none", pointsize=2.5, palette="npg", alpha.ind = 0.5, ) +  scale_shape_manual(values=c(rep(19,5)))

fviz_contrib(re, choice = "var", axes=1,top=20)
fviz_contrib(re, choice = "var", axes=2, top=20)
fviz_contrib(re, choice = "var", axes=3, top=20)
#top contributing variables for each PC
head(rownames(re$c1)[order(re$c1$CS1^2, decreasing = TRUE)],20)
head(rownames(re$c1)[order(re$c1$CS2^2, decreasing = TRUE)],20)
head(rownames(re$c1)[order(re$c1$CS3^2, decreasing = TRUE)],20)


 #this is for four people heatmap or barplot
names2 = c(head(rownames(re$c1)[order(re$c1$CS1^2, decreasing = TRUE)],25), head(rownames(re$c1)[order(re$c1$CS2^2, decreasing = TRUE)],10), head(rownames(re$c1)[order(re$c1$CS3^2, decreasing = TRUE)],5))
names = unique(names)


```
##Four people c-means
```{r fourpeople c-means clustering}
#excluding Mike as he is too divergent
three.select.master.asinh.cpm = master.asinh.cpm.single[, fourpeople.select$samplenames[fourpeople.select$aownership!="P1"]]
threepeople.select = fourpeople.select[fourpeople.select$aownership!="P1",]                                                             

threepeople_complete = cbind.data.frame(t(three.select.master.asinh.cpm[rowSums(three.select.master.asinh.cpm)>0,]), owner = threepeople.select$aownership)
threepeople.owner = aggregate(. ~ owner, threepeople_complete, median)
order= threepeople.owner$owner
threepeople.owner = threepeople.owner[, -1]

threepeople.owner = threepeople.owner[, colSums(threepeople.owner)>0]
weight = c(apply(threepeople.owner, 2, var))
threepeople.owner = threepeople.owner[, weight>=0.2] #variance filter
threepeople.scale = scale(threepeople.owner)
rownames(threepeople.scale) = c(order)
wtf = t(threepeople.scale)

#p.test
threepeople.melt = melt(threepeople_complete, id.vars = "owner")
p.result = compare_means( formula = value ~ owner, data = threepeople.melt,  group.by = "variable", method = "kruskal.test", p.adjust.method = "fdr")


#determining the number of clusters
# Elbow method
fviz_nbclust(wtf, kmeans, method = "wss") +
    geom_vline(xintercept = 4, linetype = 2)+ #this is hardcoded
  labs(subtitle = "Elbow method")
# Silhouette method
fviz_nbclust(wtf, kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette method")
# Gap statistic
# nboot = 50 to keep the function speedy. 
# recommended value: nboot= 500 for your analysis.
# Use verbose = FALSE to hide computing progression.
set.seed(123)
fviz_nbclust(wtf, kmeans, nstart = 25,  method = "gap_stat", nboot = 100)+
  labs(subtitle = "Gap statistic method")

cm = cmeans(wtf, center=8, iter.max=200)
table(cm$cluster)
cm$cluster = factor(cm$cluster, levels=c(1:length(unique(cm$cluster))))
fviz_cluster(list(data = wtf, cluster=cm$cluster), 
             geom = c("text"),
             ellipse = TRUE,
             ellipse.alpha = 0.3, #used to be 0.6 if only points are plotted.
             ellipse.type = "norm",
             ellipse.level = 0.68,
             palette = "npg",
             repel = TRUE) + theme_pubr(base_size = 16)

tempp = cbind.data.frame(wtf, cluster=cm$cluster, membership=apply(cm$membership, 1, max), taxon = row.names(wtf))
df.season.scale.group = tempp[tempp$membership>0.7,]
table(df.season.scale.group$cluster)
length(df.season.scale.group$cluster)
df.season.melt = melt(df.season.scale.group, id.vars = c("cluster","membership", "taxon"))

df.season.melt$variable = factor(df.season.melt$variable)
df.season.melt$cluster = factor(df.season.melt$cluster)

#+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))
#green, blue, red, dark blue
ggplot(data=df.season.melt, aes(x=variable, y=value, group=taxon, color=cluster, alpha=membership)) +
  #geom_line(stat="smooth", method="loess", alpha=0.7)+
  geom_line(size=1.5)+
  geom_hline(yintercept = 0, linetype = 2)+
  theme_pubr(base_size = 16, x.text.angle = 45) + labs(y="Scaled Values") + guides(alpha=FALSE, size=FALSE) + scale_color_npg()+ 
  facet_wrap(~cluster, scales="free_y", ncol=4)

```

##Four people distance calculation
```{r fourpeople distance}
#reference dataframe
fourpeople.check = subset(fourpeople.select, select=c(ownership, samplenames, date.start, date.end, Filter_No))

#evaluating variance level for each owner
res2 = rcorr(fourpeople.select.master.asinh.cpm[-1,], type="pearson") 
m = corrplot::corrplot(res2$r, order = "hclust", hclust.method = "average", p.mat = res2$P, sig.level = 0.005, insig = "blank", tl.cex=0.6, type="lower", method="pie")

four.bray.dist = vegdist(t(fourpeople.select.master.asinh.cpm[-1, ]), method="bray") #can use as.matrix to access
#in order to change label names for an dist object, one needs to convert it to matrix
four.bray.dist.matrix = as.matrix(four.bray.dist)
rownames(four.bray.dist.matrix) = fourpeople.check$ownership
colnames(four.bray.dist.matrix) = fourpeople.check$ownership
four.bray.dist.new = as.dist(four.bray.dist.matrix)
fviz_dist(four.bray.dist.new)
plot(hclust(four.bray.dist.new, method = "ward.D"), xlab="Samples")

library(qgraph)
dist_mi <- 1/four.bray.dist.new # one over, as qgraph takes similarity matrices as input
#jpeg('example_forcedraw.jpg', width=1000, height=1000, unit='px')
qgraph(dist_mi, layout='spring', esize=3, trans=0.5, alpha=0.05)

#output tables for gephi
nodes.table = data.frame(Id = rownames(four.bray.dist.matrix), Label=fourpeople.check$ownership, Class=fourpeople.check$ownership, Misc=fourpeople.check$Filter_No)
write.csv(nodes.table, "./fourpeople.nodes.table.csv", row.names = FALSE, quote=FALSE)
#outputing edges table

fourpeoplecombos = combinations(rownames(four.bray.dist.matrix))
#matrix2edges function
matrix2edges = function(matrix){
  edges.table = tibble(Source = character(), Target = character(), Weight = numeric(), Type=character())
  #generating all possible combinations given matrix's rownames!!!
  combos = combinations(rownames(matrix))
  for (i in 1:dim(combos)[1]){
    edges.table = bind_rows(edges.table, tibble(Source = combos$a[i], Target = combos$b[i], Weight = 1-matrix[combos$a[i], combos$b[i]], Type = "Undirected"))
    #Weight is similarity score
  }
  return(edges.table)
}
edges.table = matrix2edges(four.bray.dist.matrix)
write.csv(edges.table, "./fourpeople.edges.table.csv", row.names= FALSE, quote=FALSE)
    

#calculating distances between samples using first three PCs
four.pca.dist = as.matrix(vegdist(re$li, method="euclidean"))
euc.dist.vector = c()
for (name in unique(fourpeople.select$ownership)){
  print(name)
  temp.names = fourpeople.select$samplenames[fourpeople.select$ownership==name]
  temp.combos = as.data.frame(combinations(temp.names))
  bray.all.dist = 0
  euc.pca.dist = 0
#getting distances
  for (i in 1:dim(temp.combos)[1]){
    #print(i)
    #print(as.matrix(four.bray.dist)[temp.combos[i,"a"], temp.combos[i,"b"]])
    bray.all.dist = bray.all.dist + as.matrix(four.bray.dist)[temp.combos[i,"a"], temp.combos[i,"b"]]
    euc.pca.dist = euc.pca.dist + four.pca.dist[temp.combos[i,"a"], temp.combos[i,"b"]]
  }
  print(bray.all.dist/dim(temp.combos)[1])
  print(euc.pca.dist/dim(temp.combos)[1])
  #euc.dist.vector = c(euc.dist.vector, name=euc.pca.dist/dim(temp.combos)[1])
  euc.dist.vector[name] = euc.pca.dist/dim(temp.combos)[1]
  #print(temp.names)
}
euc.dist.vector = sort(euc.dist.vector, decreasing = TRUE)

activity.dist.vector = c(470, 50, 5, 1)
activity.df = data.frame(euc.dist = euc.dist.vector, actual.dist = activity.dist.vector, Ownership=c("P1", "P3", "P5", "P6"))
plot(euc.dist.vector, log(activity.dist.vector))


#elegant graphic output
my.formula = y ~ x
lmfit = lm(euc.dist ~ actual.dist, data=activity.df)
ggplot(activity.df, aes(x=actual.dist, y= euc.dist, color=Ownership)) + 
  geom_point(size=4, alpha=0.8) + 
  geom_smooth(aes(group=1),method = "glm", formula = my.formula, show.legend = TRUE) +
  # labs(x="Activity region", y="Profile dissimilarities")+
   stat_poly_eq(formula = my.formula,
                #eq.with.lhs = "italic(y)~`=`~",
                aes(group=1,label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
                parse = TRUE) +
  scale_color_manual(values=c("#00A087FF","#E64B35FF","#4DBBD5FF","#3C5488FF"))+
  labs(x="Average geographic distance among samples", y="Average euclidean distance among samples")+
  theme_pubr(base_size = 14)
  

#plot(hclust(four.bray.dist, method = "ward.D2"), labels = fourpeople.select$ownership)
#plot(hclust(four.bray.dist, method = "centroid"), labels = fourpeople.select$ownership)


```
##Four people species names for exposome network, this is the old setup, using new interactions database now
```{r fourpeople species names}
#+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))
getname = function(profile){
  temp.df = master.cpm.single[, profile$samplenames]
  names = row.names(temp.df)[rowSums(temp.df)>10]
  print(head(rowSums(temp.df[names,]), 20))
  return(names)
}

mike.names = getname(fourpeople.select[fourpeople.select$ownership == "Mike",])
gw.names = getname(fourpeople.select[fourpeople.select$ownership == "Gw",])
guan.names = getname(fourpeople.select[fourpeople.select$ownership == "Guan_SF",])
no3.names = getname(fourpeople.select[fourpeople.select$ownership == "No3_San Mateo",])
fournames = list(mike.names = mike.names, gw.names=gw.names, guan.names=guan.names, no3.names =no3.names)
for (i in 1:length(fournames)){
  print(str(fournames[[i]]))
  towrite = c("Homo sapiens", fournames[[i]])
  write.table(towrite, paste0("~/Documents/Bioinfo/species-location-interaction_data/", i, "-names.txt"), quote = FALSE, row.names = FALSE, col.names = FALSE)
}


```
##Four people within individual vs others distance calculations.
```{r fourpeople distance one vs all}
fourpeoplespecial = function(profile, text.size=5, pick=1){
  #profile = fourpeople.select
  profile = data.frame(profile)
  master.cpm.snap = master.cpm.single[, profile$samplenames]
  bray.dist = vegdist(t(asinh(master.cpm.snap)), na.rm = TRUE, method= "bray")
  bray.dist.matrix = as.matrix(bray.dist)
  rownames(bray.dist.matrix) = profile$samplenames
  colnames(bray.dist.matrix) = profile$samplenames
  same.dist = c()
  different.dist = c()
  #same group vs different group implementation
  for (i in 1:(nrow(profile)-1)){
        for (j in (i+1):nrow(profile)){
          if (profile$aownership[i] == profile$aownership[j]){
    #if (profile$location[i] == profile$location[i+1]){
          same.dist = c(same.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
        } else if (profile$aownership[i] != profile$aownership[j]){
          different.dist = c(different.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
        } else {}
 
          }
        }
  
  
  #one vs all implementation
  owners = c(unique(profile$aownership))
  ownerpick = owners[pick]
  print(ownerpick)
  onegroup.dist = c()
  others.dist = c()
  control = 0
  for (i in 1:(nrow(profile)-1)){
    for (j in (i+1):nrow(profile)){
      control = control + 1
      if (profile$aownership[i] == profile$aownership[j] & profile$aownership[i] == ownerpick ){
        onegroup.dist = c(onegroup.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
      } else if (profile$aownership[i] != profile$aownership[j] & (profile$aownership[i] == ownerpick | profile$aownership[j] == ownerpick)){
        others.dist = c(others.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]])
      }
    }
  }
  #print(control)                                                                                                                                                                                   
  return(list(same.dist = same.dist,
              different.dist = different.dist,
              samedifferent.wilcox=wilcox.test(same.dist, different.dist),
              onegroup.dist = onegroup.dist,
              others.dist = others.dist,
              ownerpick = wilcox.test(onegroup.dist, others.dist)
              ))
}

fourpeople.select.nomike = fourpeople.select[fourpeople.select$aownership!="P1",]

fourpeople.result = fourpeoplespecial(fourpeople.select, pick=1)
fourpeople.result = fourpeoplespecial(fourpeople.select, pick=2)
fourpeople.result = fourpeoplespecial(fourpeople.select, pick=3)
fourpeople.result = fourpeoplespecial(fourpeople.select, pick=4)

fourpeople.result = fourpeoplespecial(fourpeople.select.nomike, pick=1)
fourpeople.result = fourpeoplespecial(fourpeople.select.nomike, pick=2)
fourpeople.result = fourpeoplespecial(fourpeople.select.nomike, pick=3)

dist.fourpeople.df = data.frame(distance = as.numeric(c(fourpeople.result$same.dist, fourpeople.result$different.dist)), group = c(rep("Same individual", length(fourpeople.result$same.dist)), rep("Different individual", length(fourpeople.result$different.dist))))

ggdensity(dist.fourpeople.df, x = "distance",
   add = "median", 
   #add.params=list(color="red"),
   rug = TRUE,
   color = "group", 
   fill = "group",
   palette = c("#00AFBB", "#E7B800"),
   ) + labs(x="Bray-Curtis Distance") + theme_pubr(base_size = 16)
#ggsave("~/Documents/Bioinfo/DNAformal/plots/location.pdf", width=6, height=4) 

```

#tSNE clustering===============================tSNE clustering
```{r tSNE clustering}
library(Rtsne)
#similar to PCA, the dataframe needs to be transposed. Columns should be traits, rows should be samples

#even with set seed, the reproduciblity seems to be an issue here
#will need to save the object for tsne for publication
set.seed(42)
distinct.colors = c("#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26")
#testing perplexity
for (k in seq(30,50,10)){
	set.seed(42123)
  k=30
	tsne = Rtsne(t(master.cpm.single), dims=2, perplexity=k, verbose=TRUE, max_iter =5000, initial_dims=100, theta=0.4)

	#tsne = Rtsne(t(master.asinh.cpm.single), dims=2, perplexity=k, verbose=TRUE, max_iter =5000, initial_dims=100, theta=0.4)
	#tsne = Rtsne(master.chisquare.cpm.single, dims=2, perplexity=k, verbose=FALSE, max_iter =5000)
	test.df = data.frame(tsne$Y)
	if (FALSE){
	if (dim(master.cpm.single)[1] < 50){
		test.df = data.frame(test.df, t(master.cpm.single[1:dim(master.cpm.single)[1], ]), scores_single_complete[, 4:length(names(scores_single_complete)) ])
	} else {
		test.df = data.frame(test.df, t(master.cpm.single[1:50,]), scores_single_complete[, 4:length(names(scores_single_complete))])
	}
	}
	testplot = ggplot(test.df, aes(x = X1,y = X2))
	testplot + geom_point(alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="#F38400")
	print(testplot)
	# + geom_text(nudge_y=0.2,  size=2.5) 
	# ggsave(paste0("./pdfs/",taxonlevel, "-", k  , "perplexity plot-all.pdf"))
	 #saveRDS(test.df, file = paste0("./data/DNA-", taxonlevel, "-", k, ".rds"))
}


#tSNE in 3d
if (FALSE){
tsne.3d = Rtsne(master.chisquare.cpm, dims=3, perplexity=k, verbos=TRUE, max_iter =5000)
test.df.3d = data.frame(tsne.3d$Y, Bacteria=master.cpm.single["Bacteria",], Fungi=master.cpm.single["Fungi",], Viridiplantae=master.cpm.single["Viridiplantae",], Metazoa=master.cpm.single["Metazoa",], Viruses=master.cpm.single["Viruses",], Archaea = master.cpm.single["Archaea",], scores_single_complete[, 4:length(names(scores_single_complete))])
colnames(test.df.3d)[1:3]=c("x",'y','z')
p = plot_ly(test.df.3d, x = ~x, y = ~y, z = ~z, type ="scatter3d", mode ="markers", color = ~Metazoa)
p

test.df.melt = melt(test.df, id.vars=c("X1", "X2", colnames(test.df)[46:61]), measure.vars = c(colnames(test.df)[3:15]))
testplot = ggplot(test.df.melt, aes(x=X1, y=X2, color=value, label=weekend))
testplot + geom_point()+ geom_text_repel()+ facet_wrap(~ variable, )

#for details at superkingdom level
if (taxonlevel == "SUPERKINGDOM"){
	test.df = data.frame(test.df, Bacteria=master.cpm.single["Bacteria",], Fungi=master.cpm.single["Fungi",], Viridiplantae=master.cpm.single["Viridiplantae",], Metazoa=master.cpm.single["Metazoa",], Viruses=master.cpm.single["Viruses",], Archaea = master.cpm.single["Archaea",], scores_single_complete[, 4:length(names(scores_single_complete))])
	
	testplot = ggplot(test.df, aes(x = X1,y = X2, label=location))
	testplot + geom_point(size =4, alpha=0.75) + ggtitle(paste0("DNA-", taxonlevel, "-perplexity=",k)) + theme_gray(base_size=15)
	
	testplot = ggplot(test.df, aes(x = X1,y = X2, color=ownership))
	testplot + geom_point(size = 4, alpha=0.75) + ggtitle(paste0("DNA-", taxonlevel, "-perplexity=",k))  + theme_gray(base_size=15)
	
	testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=Viridiplantae))
	testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="dark green")	+ theme_gray(base_size=15)
	
	testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=Bacteria))
	testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="red")	+ theme_gray(base_size=15)
	
	testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=Fungi))
	testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="blue")	+ theme_gray(base_size=15)
	
	testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=Metazoa))
	testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="purple")	+ theme_gray(base_size=15)
	
	testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=Viruses))
	testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="brown")	+ theme_gray(base_size=15)
	
	testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=Archaea, label=filter_list))
	testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="orange")	+ theme_gray(base_size=15) #+ geom_text_repel(aes(label=filter_list),size=2.5)+ theme_gray(base_size=15)
	
	testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=season))
	testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + theme_classic(base_size=15)  #+ geom_text_repel(aes(label = location), size=2.5)+ theme_gray(base_size=15)
}

testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=drywet))
testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + theme_gray(base_size=15)

testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=humid))
testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="blue")  + theme_gray(base_size=15)


test.df.two = test.df[grep("Mike|No1_1636", test.df$ownership),]
testplot = ggplot(test.df.two, aes(x = X1,y = X2, label=location, color=ownership))
testplot + geom_point(size =4, alpha=0.75) + ggtitle(paste0("Mike and patient1, perplexity=",k)) + theme_gray(base_size=15)#+ scale_color_manual(values=c( "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26"))

environ.sp = environ[match(test.df$filter_list, environ$Filter_No), ]

if (FALSE){ #for other projects
for (k in seq(10,50,5)){
	print(k)
	set.seed(42)
	tsne = Rtsne(t(mouse.table[,-1]), dims=2, perplexity=k, verbose=TRUE, max_iter =5000)
	test.df = data.frame(tsne$Y, sample= names(mouse.table[-1]))
	testplot = ggplot(test.df, aes(x = X1,y = X2, label=sample))
	testplot + geom_point(alpha=0.75) + ggtitle(paste0("perplexity=",k)) +
	#scale_color_manual(values=c("#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", #"#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26")) +
	 geom_text(nudge_y=0.2,  size=2.5) 
	 ggsave(paste0("./pdfs/mouse", "-", k  , "perplexity plot-all.pdf"))
}
}


k=25
set.seed(42)
tsne = Rtsne(t(Mike.master.cpm), dims=2, perplexity=k, verbose=TRUE, max_iter =5000)
	test.df = data.frame(tsne$Y)
	test.df = data.frame(test.df, Mike_single)
	testplot = ggplot(test.df, aes(x = X1,y = X2, color=season, label=location))
	testplot + geom_point(alpha=0.75) + ggtitle(paste0("perplexity=",k)) +
	scale_color_manual(values=c("#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26"))  + geom_text(nudge_y=0.3, size=2.5)
	ggsave(paste0("./pdfs/",taxonlevel, "-", k  , "perplexity plot-mike.pdf"))

	# + geom_text(nudge_y=0.4, size=2.5)


	# + geom_text(nudge_y=0.4, size=2.5)
}
```
	
#Cmeans clustering-season
```{r cmeans clustering}
library(cluster)
library(factoextra)
set.seed(42)
master.asinh.cpm.season = cbind.data.frame(t(master.asinh.cpm.single), season=scores_single_complete$season)

df.season = aggregate(. ~ season, master.asinh.cpm.season, mean)
df.season = df.season[, -1]
df.season = df.season[, colSums(df.season)>0]
season.var = apply(df.season,2,var)
var.thre = quantile(season.var, 0.5)
#df.season = df.season[, season.var >= 0.05] #variance filter
df.season.sel = df.season[, season.var >= var.thre]
weight = c(apply(df.season.sel, 2, var))
df.season.scale = scale(df.season.sel)
rownames(df.season.scale) = c("spring","summer","fall","winter")
wtf = t(df.season.scale)

#determining the number of clusters
# Elbow method
fviz_nbclust(wtf, kmeans, method = "wss") +
    geom_vline(xintercept = 4, linetype = 2)+ #this is hardcoded
  labs(subtitle = "Elbow method")
# Silhouette method
fviz_nbclust(wtf, kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette method")
# Gap statistic
# nboot = 50 to keep the function speedy. 
# recommended value: nboot= 500 for your analysis.
# Use verbose = FALSE to hide computing progression.
set.seed(123)
fviz_nbclust(wtf, kmeans, nstart = 25,  method = "gap_stat", nboot = 100)+
  labs(subtitle = "Gap statistic method")

set.seed(123)
cm = cmeans(wtf, center=4, iter.max=500)
#cm$cluster = factor(cm$cluster, levels=c(1,2,3,4))
table(cm$cluster)
fviz_cluster(list(data = wtf, cluster=cm$cluster), 
             geom = c("point"),
             ellipse = TRUE,
             ellipse.alpha = 0.3, #used to be 0.6 if only points are plotted.
             ellipse.type = "norm",
             ellipse.level = 0.68,
             palette = "npg",
             repel = TRUE) + theme_pubr(base_size = 14) + scale_color_manual(values=c("3" = "#00A087FF","2" = "#4DBBD5FF","4" = "#E64B35FF","1"= "#3C5488FF")) +  scale_fill_manual(values=c("3" = "#00A087FF","2" = "#4DBBD5FF","4" = "#E64B35FF","1"= "#3C5488FF"))

tempp = cbind.data.frame(wtf, weight=weight, cluster=cm$cluster, membership=apply(cm$membership, 1, max), taxon = row.names(wtf))
df.season.scale.group = tempp[tempp$membership>=0.65,]
table(df.season.scale.group$cluster)
df.season.melt = melt(df.season.scale.group, id.vars = c("cluster","membership", "taxon","weight"), variable.name = "season")

#df.season.scale.group = df.season.scale[, cm$cluster==3 & cm$membership[,3]>=0.7]
#df.season.melt = melt(df.season.scale.group)
#colnames(df.season.melt) = c("season", "taxon", "value")

df.season.melt$season = factor(df.season.melt$season, levels=c("spring", "summer", "fall", "winter"))
df.season.melt$cluster = factor(df.season.melt$cluster)

#+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))
#green, blue, red, dark blue
ggplot(data=df.season.melt, aes(x=season, y=value, group=taxon, color=cluster, size=weight, alpha=membership)) +
  geom_line()+
  theme_pubr(base_size = 20) + scale_color_manual(values=c("3" = "#00A087FF","2" = "#4DBBD5FF","4" = "#E64B35FF","1"= "#3C5488FF")) + scale_x_discrete(expand=c(0,0)) #expand c(0,0) eliminates the sapcing


#season p test, p.adj were calculated
master_combined_season = cbind.data.frame(t(master.asinh.cpm.single[rowSums(master.asinh.cpm.single)>0,]), season = scores_single_complete$season)
master_season.melt =melt(master_combined_season, id.vars = "season")
p.result.season = compare_means( formula = value ~ season, data = master_season.melt,  group.by = "variable", method = "kruskal.test", p.adjust.method = "fdr")

```
##Cmeans clustering-month
```{r cmeans clustering-month}
#not working great
library(cluster)
library(factoextra)
set.seed(42)
master.asinh.cpm.month = cbind.data.frame(t(master.asinh.cpm.single[,]), date.month=scores_single_complete$date.month)

df.season = aggregate(. ~ date.month, master.asinh.cpm.month, mean)
df.season = df.season[, -1]
df.season = df.season[, colSums(df.season)>0]
df.season = df.season[, apply(df.season,2,var)>=0.05] #variance filter
weight = c(apply(df.season, 2, var))
df.season.scale = scale(df.season)
rownames(df.season.scale) = c(1:12)
wtf = t(df.season.scale)

#determining the number of clusters
# Elbow method
fviz_nbclust(wtf, kmeans, method = "wss") +
    geom_vline(xintercept = 4, linetype = 2)+ #this is hardcoded
  labs(subtitle = "Elbow method")
# Silhouette method
fviz_nbclust(wtf, kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette method")
# Gap statistic
# nboot = 50 to keep the function speedy. 
# recommended value: nboot= 500 for your analysis.
# Use verbose = FALSE to hide computing progression.
set.seed(123)
fviz_nbclust(wtf, kmeans, nstart = 25,  method = "gap_stat", nboot = 100)+
  labs(subtitle = "Gap statistic method")

cm = cmeans(wtf, center=2, iter.max=200)
cm$cluster = factor(cm$cluster, levels=c(1:8))
fviz_cluster(list(data = wtf, cluster=cm$cluster), 
             geom = c("point"),
             ellipse = TRUE,
             ellipse.alpha = 0.6, #used to be 0.6 if only points are plotted.
             ellipse.type = "norm",
             ellipse.level = 0.68,
             palette = "npg",
             repel = TRUE) + theme_pubr(base_size = 14) 

tempp = cbind.data.frame(wtf, weight=weight, cluster=cm$cluster, membership=apply(cm$membership, 1, max), taxon = row.names(wtf))
df.season.scale.group = tempp[tempp$membership>0.5
                              ,]
df.season.melt = melt(df.season.scale.group, id.vars = c("cluster","membership", "taxon","weight"), variable.name = "season")

#df.season.scale.group = df.season.scale[, cm$cluster==3 & cm$membership[,3]>=0.7]
#df.season.melt = melt(df.season.scale.group)
#colnames(df.season.melt) = c("season", "taxon", "value")

df.season.melt$season = factor(df.season.melt$season, levels=c(1:12))
df.season.melt$cluster = factor(df.season.melt$cluster)

#+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))
#green, blue, red, dark blue
ggplot(data=df.season.melt, aes(x=season, y=value, group=taxon, color=cluster, size=weight)) +
  geom_line(alpha=0.5)+
  theme_pubr(base_size = 14) + scale_color_npg()

```

#seasonal p test
```{r season p test}
#season p test on genus using all samples and P1's samples, p.adj were calculated
#all samples
master.asinh.cpm.single.fifty = master.asinh.cpm.single[rowSums(master.asinh.cpm.single > 0) >= 50,]

master_combined_season = cbind.data.frame(t(master.asinh.cpm.single.fifty), season = scores_single_complete$season)
master_season.melt =melt(master_combined_season, id.vars = "season")
p.result.season = compare_means( formula = value ~ season, data = master_season.melt,  group.by = "variable", method = "kruskal.test", p.adjust.method = "fdr") 
rownames(p.result.season) = p.result.season$variable
#the 9 phyla from figure
season.phyla.select = c("Basidiomycota","Ascomycota","Streptophyta","Chordata","Acidobacteria", "Firmicutes", "Proteobacteria","Actinobacteria", "Bacteroidetes")
p.season.select.df = master_combined_season[, c(as.character(season.phyla.select), "season")]
p.season.select.df.melt =  melt(p.season.select.df, id.vars = "season")
#ranking genera by adjusted pvalues
comp = function(master_combined, compare_result){
  master_combined.sig= master_combined[, c(compare_result$p.adj < 0.05, TRUE)]
  compare_result.sig = compare_result[compare_result$p.adj < 0.05,]
  compare_result.sig$variable = as.character(compare_result.sig$variable)
  compare_result.sig = compare_result.sig[order(compare_result.sig$p.adj),]
  #rank names by inverse order of p.adjust, first one is the smallest
  genus.select = as.character(compare_result.sig$variable)
  master_combined.sig.melt = melt(master_combined.sig[, as.character(c(genus.select, "season"))], id.vars = "season")
  return(list(df = master_combined.sig, df.melt = master_combined.sig.melt, compare = compare_result.sig, genera = genus.select)) 
}

p.season.results = comp(master_combined_season, p.result.season)


#all samples, all significant taxa (use 90 x 20 to start for saving pdfs)
ggplot(p.season.results$df.melt, aes(x = season , y = value, color=season, fill=season)) + 
  geom_boxplot(size=0.6, alpha=0.5)+ 
  geom_quasirandom(size=0.6, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ 
    stat_summary(fun.y=median, geom="line", aes(group=1), color = "black", alpha=0.6) +
 stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.5, show.legend = FALSE) +
  theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ 
  scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) +
  scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))+
  theme(strip.background = element_blank()) +
  rremove("x.text") +
  labs(color = "Season", fill="Season") +
  facet_wrap(~variable, scales="free_y", ncol=5)
   #expa

#select phyla, all samples
ggplot(p.season.select.df.melt, aes(x = season , y = value, color=season, fill=season)) + 
  geom_boxplot(size=0.6, alpha=0.5)+ 
  geom_quasirandom(size=0.4, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ 
    stat_summary(fun.y=median, geom="line", aes(group=1), color = "black", alpha=0.6) +
 stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.4, show.legend = FALSE) +
  theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ 
  scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) +
  scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))+
  theme(strip.background = element_blank()) +
  rremove("x.text") +
  labs(color = "Season", fill="Season") +
  facet_wrap(~variable, scales="free_y", ncol=3)


#P1 samples, all significant taxa
p1_combined_season = cbind.data.frame(t(master.asinh.cpm.single.fifty[,Mike_single$samplenames]), season = Mike_single$season)
p1_combined_season.melt =melt(p1_combined_season, id.vars = "season")
p1.result.season = compare_means( formula = value ~ season, data = p1_combined_season.melt,  group.by = "variable", method = "kruskal.test", p.adjust.method = "fdr") 

p1.season.results = comp(p1_combined_season, p1.result.season) #this is a list

p1.season.select.df = p1_combined_season[, c(as.character(season.phyla.select), "season")]
p1.season.select.df.melt =  melt(p1.season.select.df, id.vars = "season")


ggplot(p1.season.results$df.melt, aes(x = season , y = value, color=season, fill=season)) + 
  geom_boxplot(size=0.6, alpha=0.5)+ 
  geom_quasirandom(size=0.6, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ 
  stat_summary(fun.y=median, geom="line", aes(group=1), color = "black", alpha=0.6) +
 stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.4, show.legend = FALSE) +
  theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ 
  scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) +
  scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))+
  theme(strip.background = element_blank()) +
  rremove("x.text") +
   labs(color = "Season", fill="Season") +
  facet_wrap(~variable, scales="free_y", ncol=5)


#p1 samples, selected phyla
ggplot(p1.season.select.df.melt, aes(x = season , y = value, color=season, fill=season)) + 
  geom_boxplot(size=0.6, alpha=0.5)+ 
  geom_quasirandom(size=0.4, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ 
    stat_summary(fun.y=median, geom="line", aes(group=1), color = "black", alpha=0.6) +
 stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.4, show.legend = FALSE) +
  theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ 
  scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) +
  scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))+
  theme(strip.background = element_blank()) +
  rremove("x.text") +
  labs(color = "Season", fill="Season") +
  facet_wrap(~variable, scales="free_y", ncol=3)

#p2 samples, all phyla
p2_combined_season = cbind.data.frame(t(master.asinh.cpm.single.fifty[,Patient1$samplenames]), season = Patient1$season)
p2_combined_season.melt =melt(p2_combined_season, id.vars = "season")
p2.result.season = compare_means( formula = value ~ season, data = p2_combined_season.melt,  group.by = "variable", method = "kruskal.test", p.adjust.method = "fdr") 

p2.season.results = comp(p2_combined_season, p2.result.season) #this is a list, will produce an error if no genus has a adj.p < 0.05, P2 is such a case


#plot all pvalues for all samples
p.season.results$compare$variable = factor(p.season.results$compare$variable, levels = c(p.season.results$compare$variable))


ggplot(p.season.results$compare, aes(x = variable, y=p.adj, color=p.adj)) + geom_point() + theme_pubr(base_size=10, x.text.angle = 60) + theme(axis.text.x = element_text(size = 6))


chisq.test(x=rbind(c(79/2,79/2),c(61,18)))
```

#Spatial variables construct
```{r spatial variables MEM construct}
library(adespatial)
library(spacemakeR)
#some missing values in original coordinates data, run imputation first from the RDA section
expo.xy = subset(rda.env, select = c(latitude, longitude))
expo.xy.name = cbind(expo.xy, name = row.names(expo.xy))
colnames(expo.xy.name) = c("lat", "lon", "name")

#taking coordinates and calculate geographic distance between sites, unit is in m
ReplaceLowerOrUpperTriangle <- function(m, triangle.to.replace){
   # If triangle.to.replace="lower", replaces the lower triangle of a square matrix with its upper triangle.
   # If triangle.to.replace="upper", replaces the upper triangle of a square matrix with its lower triangle.

   if (nrow(m) != ncol(m)) stop("Supplied matrix must be square.")
   if      (tolower(triangle.to.replace) == "lower") tri <- lower.tri(m)
   else if (tolower(triangle.to.replace) == "upper") tri <- upper.tri(m)
   else stop("triangle.to.replace must be set to 'lower' or 'upper'.")
   m[tri] <- t(m)[tri]
   return(m)
}

GeoDistanceInMetresMatrix <- function(df.geopoints){
   # Returns a matrix (M) of distances between geographic points.
   # M[i,j] = M[j,i] = Distance between (df.geopoints$lat[i], df.geopoints$lon[i]) and
   # (df.geopoints$lat[j], df.geopoints$lon[j]).
   # The row and column names are given by df.geopoints$name.

   GeoDistanceInMetres <- function(g1, g2){
      # Returns a vector of distances. (But if g1$index > g2$index, returns zero.)
      # The 1st value in the returned vector is the distance between g1[[1]] and g2[[1]].
      # The 2nd value in the returned vector is the distance between g1[[2]] and g2[[2]]. Etc.
      # Each g1[[x]] or g2[[x]] must be a list with named elements "index", "lat" and "lon".
      # E.g. g1 <- list(list("index"=1, "lat"=12.1, "lon"=10.1), list("index"=3, "lat"=12.1, "lon"=13.2))
      DistM <- function(g1, g2){
         require("Imap")
         return(ifelse(g1$index > g2$index, 0, gdist(lat.1=g1$lat, lon.1=g1$lon, lat.2=g2$lat, lon.2=g2$lon, units="m")))
      }
      return(mapply(DistM, g1, g2))
   }

   n.geopoints <- nrow(df.geopoints)

   # The index column is used to ensure we only do calculations for the upper triangle of points
   df.geopoints$index <- 1:n.geopoints

   # Create a list of lists
   list.geopoints <- by(df.geopoints[,c("index", "lat", "lon")], 1:n.geopoints, function(x){return(list(x))})

   # Get a matrix of distances (in metres)
   mat.distances <- ReplaceLowerOrUpperTriangle(outer(list.geopoints, list.geopoints, GeoDistanceInMetres), "lower")

   # Set the row and column names
   rownames(mat.distances) <- df.geopoints$name
   colnames(mat.distances) <- df.geopoints$name

   return(mat.distances)
}
expo.geo.xy = round(GeoDistanceInMetresMatrix(expo.xy.name) / 1000) #distance in km
expo.pcnm2 = vegan::pcnm(expo.geo.xy)

expo.vario = variogmultiv(Y = rda.data, xy = expo.xy, nclass =20 )
plot(expo.vario$d, expo.vario$var, ty='b', pch=20, xlab="Distance", ylab="C(distance)") #210 is the maxmium
#first peak is
expo.pcnm = vegan::pcnm(dist(expo.xy))
min.d = expo.pcnm$threshold #minimum distance
thresh10 = seq(give.thresh(dist(expo.xy)), give.thresh(dist(expo.xy))*4, le=10)
list10nb = lapply(thresh10, dnearneigh, x=as.matrix(expo.xy), d1=0)
f2 = function(D, dmax, y) {1- (D/dmax)^y}
expo.thresh.f2 = lapply(list10nb, function(x) test.W(x, Y=rda.data, f=f2, y=2:10, dmax=max(unlist(nbdists(x, as.matrix(expo.xy)))), xy=as.matrix(expo.xy)))

expo.f2.minAIC = sapply(expo.thresh.f2, function(x) min(x$best$AICc, na.rm=TRUE))
min(expo.f2.minAIC)
nb.bestmod = which.min(expo.f2.minAIC) #6 variables
dmax.best = expo.thresh.f2[nb.bestmod][[1]]$all[1,2]
expo.MEM.champ = unlist(expo.thresh.f2[which.min(expo.f2.minAIC)], recursive = FALSE)
summary(expo.MEM.champ)
expo.MEM.champ$best$values
expo.MEM.champ$best$ord
MEMid = expo.MEM.champ$best$ord[1:which.min(expo.MEM.champ$best$AICc)]
sort(MEMid)
expo.MEM.champ$best$R2
MEM.select = expo.MEM.champ$best$vectors[, sort(c(MEMid))]
colnames(MEM.select) = sort(MEMid)
#best R2
R2.MEMbest = expo.MEM.champ$best$R2[which.min(expo.MEM.champ$best$AICc)]
#adjusted best R2
RsquareAdj(R2.MEMbest, nrow(rda.data), length(MEMid))
par(mfrow=c(2,4))
for(i in 1:ncol(MEM.select)){ #visualizing selected MEM factors
  s.value(expo.xy, MEM.select[,i], sub=sort(MEMid)[i], csub=2)
}

expo.MEM.rda = rda(rda.data ~., as.data.frame(MEM.select))
expo.MEM.R2a = RsquareAdj(expo.MEM.rda)$adj.r.squared
anova(expo.MEM.rda)
axes.MEM.test = anova(expo.MEM.rda, by="axis")
nb.ax = length(which(axes.MEM.test[,5] <= 0.05))

#plot maps of the significant canonocial axes
expo.MEM.axes = scores(expo.MEM.rda, choices=c(1,2), display="lc", scaling=1)
par(mfrow = c(1,2))
s.value(expo.xy, expo.MEM.axes[,1])
s.value(expo.xy, expo.MEM.axes[,2])

#visualizing top two axes (tested by axes.MEM.test)
for (i in 1:2){
  print(ggplot(expo.xy, aes(y=latitude, x=longitude, size = expo.MEM.axes[,i])) + geom_point(shape=21, alpha=0.6, fill="steel blue") + labs(size = paste0("RDA.MEM.axis ", i)))
}
  
#visualizing MEM.select using ggplot
for (i in 1:ncol(MEM.select)){
  print(ggplot(expo.xy, aes(y=latitude, x=longitude, size = MEM.select[,i])) + geom_point(shape=21, alpha=0.6, fill="steel blue") + labs(size = paste0("MEM variable ", i)))
}


#testing location~season randomness
library(nnet)
locaseason = cbind(as.data.frame(plot.MEM.select), season = scores_single_complete$season)
test <- multinom(season ~ ., data = locaseason)
summary(test)

```
#Time variables construction AEM
```{r AEM construction of time-series}
library(PCNM)
library(AEM)
#adjusting date.mid so samples don't have the exact same date.mid, this is for AEM variables construction
AEM.in = scores_single_complete$date.mid 
for (i in 1:282){
  print(AEM.in[i+1]-AEM.in[i])
  delta = AEM.in[i+1]-AEM.in[i]
  if (delta == 0){
    AEM.in[i+1] = AEM.in[i+1] + 0.5
  }
  if (delta <0){
    AEM.in[i+1] = AEM.in[i+1] + 1
  }
}

#choosing the max.d for weights calculating in non-equal-distanced sampling points
#mantel correlation is the multivariate version of pearson correlation
correlog.expo <- mantel.correlog(dist(t(master.asinh.cpm.single)), XY=AEM.in, mult = "fdr") #this can be used for spatial as well
#correlog.expo <- mantel.correlog(dist(t(Mike.master.asinh.cpm)), XY=Mike_single$date.start, mult = "fdr")
#the n.class has some impact on the class.index, but not much
plot(correlog.expo) #looks like 131 is the way to go, smallest non-significant index value
weights.expo = weight.time(AEM.in, max.d = 131) # each unit is one day 1 = 1 day
aem.expo.out = aem.time(length(AEM.in), w = weights.expo, moran=TRUE, plot.moran=TRUE)
aem.expo.out$Moran
#select significant positive variables, not forward selected
aem.expo.out.pos = aem.expo.out$aem[,aem.expo.out$Moran$p.value < 0.05 & aem.expo.out$Moran$Positive]
rda.data = t(master.asinh.cpm.single)
aem.expo.rda.pos <- rda(rda.data ~ ., as.data.frame(aem.expo.out.pos)) #the second arugment needs to be data frame in formula mode
anova(aem.expo.rda.pos)
anova(aem.expo.rda.pos,  by="axis")
RsquareAdj(aem.expo.rda.pos)
plot(AEM.in, scores(aem.expo.rda.pos, display="lc", choice=3), type="b", pch=19, main = paste("RDA axis", i, ", positive temporal correlation model"), xlab="Date", ylab="RDA axis")


#select significant negative variables, not forward selected
aem.expo.out.neg = aem.expo.out$aem[,aem.expo.out$Moran$p.value < 0.05 & !aem.expo.out$Moran$Positive]
aem.expo.rda.neg <- rda(t(master.asinh.cpm.single), aem.expo.out.neg) 
anova(aem.expo.rda.neg)
RsquareAdj(aem.expo.rda.neg)


aem.expo.out.select = aem.expo.out$aem[,aem.expo.out$Moran$p.value < 0.05] #both positive and negative
#this step takes a lot of time, so the result was saved
#expo.aem.selectvar <- forward.sel(t(master.asinh.cpm.single), aem.expo.out.select, nperm=999, alpha=0.05) 
expo.aem.selectvar
#saveRDS(object = expo.aem.selectvar, file="~/Documents/Bioinfo/RDS/expo.aem.selectvar.rds")
expo.aem.selectvar = readRDS("~/Documents/Bioinfo/RDS/expo.aem.selectvar.rds")
aem.expo.rda.fp <- rda(t(master.asinh.cpm.single) ~., data.frame(aem.expo.out.select[, expo.aem.selectvar$order])) #only the forward-picked ones
RsquareAdj(aem.expo.rda.fp)
aem.expo.rda.anova = anova(aem.expo.rda.fp)

#this step also takes alot of time, and the first two axes are significant, 8% and 2%, respectively
#aem.expo.rda.anova.byaxis = anova(aem.expo.rda.fp, by="axis") 
#saveRDS(object = aem.expo.rda.anova.byaxis, file = "~/Documents/Bioinfo/RDS/aem.expo.rda.byaxis.rds")
aem.expo.rda.anova.byaxis = readRDS("~/Documents/Bioinfo/RDS/aem.expo.rda.byaxis.rds")
#only first two axes are significant
RsquareAdj(aem.expo.rda.fp)

#only the first two axes are significant, axis 2 looks cyclic, axis 1 is very noisy, axis 3 looks interesting
plot(AEM.in, scores(aem.expo.rda.fp, display="lc", choice=1), type="b", pch=19, main = paste("RDA axis", 1, ", positive temporal correlation model"), xlab="Date", ylab="RDA axis")
plot(AEM.in, scores(aem.expo.rda.fp, display="lc", choice=2), type="b", pch=19, main = paste("RDA axis", 2, ", positive temporal correlation model"), xlab="Date", ylab="RDA axis")
plot(AEM.in, scores(aem.expo.rda.fp, display="lc", choice=3), type="b", pch=19, main = paste("RDA axis", 2, ", positive temporal correlation model"), xlab="Date", ylab="RDA axis")

ggplot(data.frame(time = AEM.in, score = c(scores(aem.expo.rda.fp, display="lc", choice=1))), aes(x=time, y=score))+ geom_point() + geom_line() + theme_pubr()
ggplot(data.frame(time = AEM.in, score = c(scores(aem.expo.rda.fp, display="lc", choice=2))), aes(x=time, y=score))+ geom_point() + geom_line() + theme_pubr()

ggplot(data.frame(time = AEM.in, score = c(scores(aem.expo.rda.fp, display="lc", choice=1))), aes(x=time, y=score))+ geom_point() + geom_smooth(span=0.3) + theme_pubr()
ggplot(data.frame(time = AEM.in, score = c(scores(aem.expo.rda.fp, display="lc", choice=2))), aes(x=time, y=score))+ geom_point() + geom_smooth(span=0.5) + theme_pubr()

ts1 = scores(aem.expo.rda.fp, display="lc", choice=1)[,1] #RDA axis 1
  
ts2 = scores(aem.expo.rda.fp, display="lc", choice=2)[,1] #RDA axis 2

AEMS = aem.expo.out.select[, expo.aem.selectvar$order] #all selected 38 AEM variables.
colnames(AEMS) = paste0("AEM", expo.aem.selectvar$order) #assign original names to these variables
```


##RDA========================RDA, redundancy analysis
```{r RDA analysis}
env.select = c(6:12, 15:27, 30:58, 105:117, 119:135) # this is for scores_single_complete
#do not select any clinical variables
#rda.data = t(master.cpm.single)
rda.data = t(master.asinh.cpm.single)
rda.var = apply(rda.data, 2, var)
#rda.select = apply(rda.data,2, function(x){ sum(x>0)>=50})
#rda.data= rda.data[, rda.select]
rda.env = scores_single_complete[, env.select]

names(rda.env)[apply(rda.env, 2, function(x){ sum(!is.na(x)) >= 280})] #this tests to see how complete each feature is
#dbRDA

#CRUDE DATA imputation
items = c("MeanDew_PointC", "median.particle", "durationOut", "elevation.m.", "mNDVI", "Overall.AQI.Value", "latitude", "longitude", "Population_density_people_per_sqmi", "X_Mean_Wind_SpeedKm.h", "mFPAR", "dPM2.5", "dPM10", "mLAI",   "X_Mean_VisibilityKm" ,"X_Mean_Sea_Level_PressurehPa", "population2", "dSO2", "dNO2", "dOzone", "dCO", "daily_NO2", "In_in_a_day", "X_Max_Sea_Level_PressurehPa", "X_Min_Sea_Level_PressurehPa", "X_Max_Wind_SpeedKm.h", "mNO2", "mSO2")
for (x in items){
  print(x)
  rda.env[,x][is.na(rda.env[, x])] = as.numeric(median(rda.env[, x], na.rm=TRUE))
}
rda.env$urban_rural[is.na(rda.env$urban_rural)] = "rural"
rda.env$batch = as.numeric(rda.env$batch)
names(rda.env)[apply(rda.env, 2, function(x){ sum(!is.na(x)) >= 280})]
names(rda.env)[apply(rda.env, 2, function(x){ sum(!is.na(x)) <= 250})]
rda.env = rda.env[, names(rda.env)[apply(rda.env, 2, function(x){ sum(!is.na(x)) >= 280})]]
#eleminating some obvious redudant variables
rda.env.location = subset(rda.env, select=-c(Type, geo, date.mid, Precipitationmm, ownership, country, population2, city))
rda.env = subset(rda.env, select=-c(Type, geo, date.mid, Precipitationmm, ownership, country, population2, city, location ))#Type only has one class


#Recoding variables
location.bi = model.matrix(~rda.env.location[,"location"]+0)
geo2.bi = model.matrix(~rda.env[, "geo2"]+0)
season.bi = model.matrix(~rda.env[, "season"]+0)
weekend.bi = model.matrix(~rda.env[, "weekend"]+0)[,-1]
hiking.bi = model.matrix(~rda.env[, "hiking"]+0)[,-1]
geo3.bi = model.matrix(~rda.env[, "geo3"]+0)
aownership.bi = model.matrix(~rda.env[, "aownership"]+0)
urban.bi = model.matrix(~rda.env[, "urban_rural"]+0)[, -1]
#all numeric ones
rda.env.numeric = Filter(is.numeric, rda.env)

#MAKING env great againt

#======================
#using location.bi, instead of MEM.select
if (FALSE){
rda.env.coding.location = cbind(rda.env.numeric, location.bi, geo2.bi, season.bi, weekend.bi, hiking.bi, geo3.bi, aownership.bi, urban.bi)
rda.env.coding.location = subset(rda.env.coding.location, select= -c(cDNA_conc)) #removing these variables
#cDNA_conc, while a good variable for variation analysis, cannot be easily grouped into existing groups, such as environment/location/lifestyle
#with raw location
expo.RDA.location = rda(rda.data ~., data = rda.env.coding.location, add=TRUE, na.action = na.exclude)
expo.dbRDA.location = capscale(rda.data ~ ., dist="bray", data=rda.env.coding.location, add=TRUE, na.action = na.exclude)
RDA.location.radj = RsquareAdj(expo.RDA.location)$adj.r.squared
dbRDA.location.radj = RsquareAdj(expo.dbRDA.location)$adj.r.squared

#anova #raw R2 are about 46% and 48.2% for RDA and dbRDA.
anova.expo.location.RDA = anova(expo.RDA.location)
anova.expo.dbRDA.location = anova(expo.dbRDA.location)
}

#====================
#attaching MEM variables to the rda.env data.frame, USING MEM instead of locations! also attaching time-series variables, which are the two axes from AEM constructions
rda.env.coding = cbind(rda.env.numeric, MEM.select, geo2.bi, season.bi, weekend.bi, hiking.bi, geo3.bi, aownership.bi, urban.bi, ts1, ts2)

#with MEMs
rda.env.coding2 = subset(rda.env.coding, select =-c(cDNA_conc)) #cDNA_conc apparently accout for a lot of variances..

expo.RDA=rda(rda.data ~ .,  data=rda.env.coding2, add=TRUE, na.action = na.exclude)
#33.88% explained
expo.dbRDA = capscale(rda.data ~ ., dist="bray", data=rda.env.coding2, add=TRUE, na.action = na.exclude)
#35.16%

RDA.radj = RsquareAdj(expo.RDA)$adj.r.squared
dbRDA.radj = RsquareAdj(expo.dbRDA)$adj.r.squared

#anova
#anova(expo.RDA, by="axis")
#raw R2 are about 32.7% for RDA and dbRDA when using MEM
anova.expo.RDA = anova(expo.RDA)
anova.expo.dbRDA = anova(expo.dbRDA)

#using adespatial for forward selection

RDA.fwd = forward.sel(rda.data, rda.env.coding2, adjR2thresh = RDA.radj, alpha = 0.05)
RDA.location.fwd = forward.sel(rda.data, rda.env.coding.location, adjR2thresh = RDA.location.radj, alpha = 0.05)

#RDA with all selected variables
expo.RDA.select=rda(rda.data ~ .,  data=rda.env.coding2[, RDA.fwd$variables], add=TRUE, na.action = na.exclude)
#22.2% explained
expo.dbRDA.select = capscale(rda.data ~ ., dist="bray", data=rda.env.coding2[, RDA.fwd$variables], add=TRUE, na.action = na.exclude)
#22.7%

#vegan variable selection procedure, not divided into group yet, for rda
#using MEM variables, this should be the same as forward.sel
mod0 = rda(rda.data ~ 1, rda.env.coding2) #model with intercept only
mod1 = rda(rda.data ~., rda.env.coding2) #model with all variables
step.res = ordiR2step(mod0, mod1, perm.max =1000) #about 22% explained

#vegan variable selection procedure overall, for dbRDA
#using MEM variables
mod0.db = capscale(rda.data ~ 1, dist="bray", data=rda.env.coding2, add=TRUE)
mod1.db = capscale(rda.data ~., dist="bray", data=rda.env.coding2, add=TRUE)
step.res.db = ordiR2step(mod0.db, mod1.db, perm.max = 1000) #about 18% explained

if(FALSE){ #raw locations are no longer used
#vegan variable selection procedure, not divided into group yet, for rda
#NOT using MEM variables, RAW LOCATION labels
mod0 = rda(rda.data ~ 1, rda.env.coding.location) #model with intercept only
mod1 = rda(rda.data ~., rda.env.coding.location) #model with all variables
step.res.location = ordiR2step(mod0, mod1, perm.max =1000)

#vegan variable selection procedure overall, for dbRDA
#NOT using MEM variables, RAW LOCATION labels
mod0.db = capscale(rda.data ~ 1, dist="bray", data=rda.env.coding.location, add=TRUE)
mod1.db = capscale(rda.data ~., dist="bray", data=rda.env.coding.location, add=TRUE)
step.res.db.location = ordiR2step(mod0.db, mod1.db, perm.max = 1000)
}

#rda.env.coding2 does not have cDNA conc.
#five groups defined initially: technical, environmental, location, life style, temporal groups 
#forward selection within each group, and then in combined groups
#1
technical = data.frame(rda.env.coding[,"batch"])
#2
env = cbind.data.frame(rda.env.coding[, c(3:14, 22:34,48,49,51,50)])
colnames(env)[26:29] = c("spring", "summer", "winter", "fall")
#3
loca = cbind.data.frame(rda.env.coding[, c(18:21, 35:40, 41:47, 54:55, 65)])
colnames(loca)[3] = "popdensity"
colnames(loca)[11:19] = c("Asia", "Europe", "Northeast", "Oceania", "Southeast", "Southwest", "Westcoast", "Coastal", "Inland")
colnames(loca)[5:10] = c("MEM1", "MEM69", "MEM83", "MEM89", "MEM91", "MEM94") #numbers were causing issues
#4
behav = cbind.data.frame(rda.env.coding[, c(15:16, 52:53, 56:64)])
colnames(behav)[5:13] = c("others", "P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8")
#5
temporal = cbind.data.frame(rda.env.coding[, c(2, 66:67)]) #2 is the "date.month"

#rda forward selections of every group, using MEM and AEM variables (through RDA axes)
mod0 = rda(rda.data ~ 1, technical) #model with intercept only
mod1 = rda(rda.data ~., technical) #model with all variables
step.res.tech = ordiR2step(mod0, mod1, perm.max =1000) #about % explained

mod0 = rda(rda.data ~ 1, env) #model with intercept only
mod1 = rda(rda.data ~., env) #model with all variables
step.res.env = ordiR2step(mod0, mod1, perm.max =1000) #about % explained
step.forward.env = forward.sel(rda.data, env)

mod0 = rda(rda.data ~ 1, loca) #model with intercept only
mod1 = rda(rda.data ~., loca) #model with all variables
step.res.loca = ordiR2step(mod0, mod1, perm.max =1000) #about % explained
step.forward.loca = forward.sel(rda.data, loca)

mod0 = rda(rda.data ~ 1, behav) #model with intercept only
mod1 = rda(rda.data ~., behav) #model with all variables
step.res.behav = ordiR2step(mod0, mod1, perm.max =1000) #about % explained
behav.R2a = RsquareAdj(mod1)$adj.r.squared
step.forward.behav = forward.sel(rda.data, behav, adjR2thresh = behav.R2a)

mod0.db = rda(rda.data ~ 1, data=temporal, add=TRUE)
mod1.db = rda(rda.data ~., data=temporal, add=TRUE)
step.res.temporal = ordiR2step(mod0.db, mod1.db, perm.max = 1000)
#temporal.select = temporal[, attributes(step.res.db.temporal$terms)$term.labels]


#vegan variable selection procedure for every group, for dbRDA (used in paper)
#using MEM variables
mod0.db = capscale(rda.data ~ 1, dist="bray", data=technical, add=TRUE)
mod1.db = capscale(rda.data ~., dist="bray", data=technical, add=TRUE)
step.res.db.tech = ordiR2step(mod0.db, mod1.db, perm.max = 1000) #about % explained                      
colnames(technical) = "batch"
technical.select.db = technical

mod0.db = capscale(rda.data ~ 1, dist="bray", data=env, add=TRUE)
mod1.db = capscale(rda.data ~., dist="bray", data=env, add=TRUE)
step.res.db.env = ordiR2step(mod0.db, mod1.db, perm.max = 1000) #about % explained   
env.select.db = env[, attributes(step.res.db.env$terms)$term.labels]


mod0.db = capscale(rda.data ~ 1, dist="bray", data=loca, add=TRUE)
mod1.db = capscale(rda.data ~., dist="bray", data=loca, add=TRUE)
step.res.db.loca = ordiR2step(mod0.db, mod1.db, perm.max = 1000) #about % explained   
loca.select.db = loca[, attributes(step.res.db.loca$terms)$term.labels]

mod0.db = capscale(rda.data ~ 1, dist="bray", data=behav, add=TRUE)
mod1.db = capscale(rda.data ~., dist="bray", data=behav, add=TRUE)
step.res.db.behav = ordiR2step(mod0.db, mod1.db, perm.max = 1000) #about % explained   
behav.select.db = behav[, attributes(step.res.db.behav$terms)$term.labels]

mod0.db = capscale(rda.data ~ 1, dist="bray", data=temporal, add=TRUE)
mod1.db = capscale(rda.data ~., dist="bray", data=temporal, add=TRUE)
step.res.db.temporal = ordiR2step(mod0.db, mod1.db, perm.max = 1000)
temporal.select.db = temporal[, attributes(step.res.db.temporal$terms)$term.labels]

#combining loca and behav for variable selection, because varpart deals with 4 groups
locabehav = cbind(loca, behav)
mod0.db = capscale(rda.data ~ 1, dist = "bray", data = locabehav, add = TRUE)
mod1.db = capscale(rda.data ~., dist = "bray", data = locabehav, add = TRUE)
step.res.db.locabehav = ordiR2step(mod0.db, mod1.db, perm.max = 1000) #about % explained   
locabehav.select.db = cbind.data.frame(loca,behav)[, attributes(step.res.db.locabehav$terms)$term.labels]


#combining all for variable selection
allfactors = cbind(env, loca, behav, technical,temporal) #when including TS1 and TS2, a lot more variables will be selected, indicating that these two variables explain data variances well, this is expected because they were constructed by contraining the dataset on temporal variablesl
allfactors = cbind(env, loca, behav, technical, date.month=temporal[,1])
mod0.db = capscale(rda.data ~ 1, dist = "bray", data = allfactors) #using add = TRUE option will give different results, more variables
mod1.db = capscale(rda.data ~., dist = "bray", data = allfactors)
step.res.db.all = ordiR2step(mod0.db, mod1.db, perm.max = 1000, pin = 0.05) #about % explained  
all.select.db = allfactors[, attributes(step.res.db.all$terms)$term.labels]


#calculating distance between samples
rda.data.dist = vegdist(rda.data,method = "bray",na.rm = TRUE, diag = FALSE)
#dbRDA varpart, assuming four groups
dbRDA.MEM.varpart = varpart(rda.data.dist, technical.select.db, env.select.db, loca.select.db, behav.select.db)
#lumping spatial and behavior together, not adding temporal group for now
dbRDA.MEM.varpart = varpart(rda.data.dist, technical.select.db, env.select.db, locabehav.select.db)
#lumping location and behavior together, so three groups + temporal group
dbRDA.MEM.varpart = varpart(rda.data.dist, technical.select.db, env.select.db, locabehav.select.db, temporal.select.db)
#the techinical is 2.6%, the enviromental, locabehav, and the temporal all account for 10% each.
par(mfrow = c(1,2))
showvarparts(4)
plot(dbRDA.MEM.varpart)

#crude varpart without forward selection of each group
dbRDA.MEM.crude.varpart = varpart(rda.data.dist, technical, env, loca, behav)
dbRDA.MEM.crude.varpart = varpart(rda.data.dist, technical, env, cbind(loca, behav)) #lumping location and behavior together
par(mfrow = c(1,2))
showvarparts(4)
plot(dbRDA.MEM.crude.varpart)


#testing individual parameters
env.dbrda.select= envfit(rda.data.dist, cbind(technical.select.db, env.select.db, loca.select.db, behav.select.db), na.rm=TRUE) #testin
env.dbrda= envfit(rda.data.dist, cbind(technical, env, loca, behav), na.rm=TRUE) #testing individual parameters


#new adonis test
if (FALSE){
adonis.result.db = adonis(rda.data ~., cbind(env.select.db, loca.select.db, behav.select.db, technical.select.db), perm = 1999, method = "bray")
write.table(adonis.result.db$aov.tab, "~/Documents/Bioinfo/DNAformal/plots/supplementary/allsamples.adonis.db.csv", sep = ",", col.names = TRUE, row.names = TRUE, quote = FALSE)

adonis.crude.result.db = adonis(rda.data ~., cbind( env, loca, behav, technical), perm = 1999, method = "bray")
write.table(adonis.crude.result.db$aov.tab, "~/Documents/Bioinfo/DNAformal/plots/supplementary/allsamples.adonis.crude.db.csv", sep = ",", col.names = TRUE, row.names = TRUE, quote = FALSE)
}

if (FALSE){
#total correlation
env.recoding = cbind(technical, env, loca, behav)
res2 = rcorr(as.matrix(env.recoding), type = "spearman")

#takes output from rcorr function, adjusting p value matrix accroding to fdr principles
matrixPadjust = function(res, select = c(1:nrow(res$P))){
  print(select)
  temp.p = c(res$P)
  temp.p[is.na(temp.p)] = 0
  temp.p.adj = p.adjust(temp.p, method = "fdr")
  p.adj.matrix = matrix(temp.p.adj, nrow = ncol(res$P), ncol=ncol(res$P), dimnames = list(rownames = colnames(res$P), colnames = colnames(res$P)))
  return(p.adj.matrix)
}
adjusted.res2.p = matrixPadjust(res2)
#pls package also has a function called corrplot
corrplot::corrplot(res2$r, order = "hclust", p.mat = adjusted.res2.p, sig.level = 0.01, insig = "blank", diag = FALSE, tl.cex=0.5, addrect = 12, method="color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black")
}


#only mike
#rda.data.mike = t(Mike.master.cpm)
rda.data.mike = t(Mike.master.asinh.cpm)
rda.env.mike = rda.env[rda.env$ownership=="Mike",]
#rda.data.campus = t(campushome.master.cpm)
#rda.data.campus = t(campushome.master.asinh.cpm)
#rda.env.campus = rda.env[rda.env$ownership=="Mike" & rda.env$location == "Campus",]

#only p2
#rda.data.patient1 = t(patient1.master.cpm)
rda.data.patient1 = t(patient1.master.asinh.cpm)
rda.env.patient1 = rda.env[rda.env$aownership=="P2",]

names(environ)[apply(environ, 2, function(x){ sum(!is.na(x)) >= 250})] #this tests to see how complete each feature is

#use this to justify the use of RDA PCA instead of CCA and CA
dca.result = decorana(rda.data, iweigh=1, ira=0)
dca.result
plot(dca.result, display="sites")

#dbRDA
#condition out seems not as useful as the var partition function

rda.data.dist = vegdist(rda.data,method = "bray",na.rm = TRUE, diag = FALSE)
#stress plot
rda.data.mds0 = isoMDS(rda.data.dist)
stressplot(rda.data.mds0, rda.data.dist)

#items = c("MeanDew_PointC", "median.particle", "durationOut", "elevation.m.", "mNDVI", "Overall.AQI.Value", "latitude", "longitude", "Population_density_people_per_sqmi", "X_Mean_Wind_SpeedKm.h", "mFPAR", "dPM2.5", "dPM10", "mLAI",   "X_Mean_VisibilityKm" ,"X_Mean_Sea_Level_PressurehPa", "population2")

if (FALSE) (#old codes
#adonis test
adonis.result = adonis(rda.data ~ season + median.particle + Mean_TemperatureC + MeanDew_PointC + X_Mean_Humidity + X_Mean_VisibilityKm + X_Mean_Sea_Level_PressurehPa + X_Mean_Wind_SpeedKm.h + is_there_rain + Precipitationmm  + Overall.AQI.Value + dPM2.5 +dPM10  + mLAI + mFPAR + mNDVI + aownership + location + geo2 + latitude + longitude + durationOut + weekend + hiking + Population_density_people_per_sqmi + urban_rural + elevation.m. + population2 + batch, rda.env, perm = 1999, method = "bray")
write.table(adonis.result$aov.tab, "~/Documents/Bioinfo/DNAformal/plots/supplementary/allsamples.adonis.csv", sep = ",", col.names = TRUE, row.names = TRUE, quote = FALSE)

#adonis test for mike
adonis.result.p1 = adonis(rda.data.mike ~ season + median.particle + Mean_TemperatureC + MeanDew_PointC + X_Mean_Humidity + X_Mean_VisibilityKm + X_Mean_Sea_Level_PressurehPa + X_Mean_Wind_SpeedKm.h + is_there_rain + Precipitationmm  + Overall.AQI.Value + dPM2.5 +dPM10  + mLAI + mFPAR + mNDVI  + location + geo2 + latitude + longitude + durationOut + weekend + hiking + Population_density_people_per_sqmi + urban_rural + elevation.m. + population2 + batch, rda.env.mike, perm = 999, method = "bray")

#for p2
adonis.result.p2 = adonis(rda.data.patient1 ~ season + median.particle + Mean_TemperatureC + MeanDew_PointC + X_Mean_Humidity + X_Mean_VisibilityKm + X_Mean_Sea_Level_PressurehPa + X_Mean_Wind_SpeedKm.h + is_there_rain + Precipitationmm  + Overall.AQI.Value + dPM2.5 +dPM10  + mLAI + mFPAR + mNDVI  + location + geo2 + latitude + longitude + durationOut + Population_density_people_per_sqmi + urban_rural + elevation.m. + population2 + batch, rda.env.patient1, perm = 999, method = "bray")

#betadisper test, test of variances homogeinity. Similar to Levene's test for anova.
#Its best if groups have homogenous dispersion (variances), so their differences if significant can be attributed to their centroid (mean)
beta = with(rda.env, betadisper(rda.data.dist, season))
anova(beta)
TukeyHSD(beta)
}

```

```{R RDA temp}
RDA = rda(rda.data ~ geo2 + location + ownership + duration + season + median.particle + Mean_TemperatureC + X_Mean_Humidity + batch + durationOut + weekend + MeanDew_PointC +  X_Mean_VisibilityKm , rda.env, na.action = na.exclude)

RDA.mike = rda(rda.data.mike ~ geo2 + location +  duration + season + median.particle + Mean_TemperatureC + X_Mean_Humidity + batch + durationOut + weekend + MeanDew_PointC +  X_Mean_VisibilityKm , rda.env.mike, na.action = na.exclude)

RDA.mike = rda(rda.data.mike) #without env this is just like PCA


#selecting model or RDA
#envfit analysis
env.mike = envfit(RDA.mike, rda.env.mike, na.rm=TRUE)

plot(RDA.mike, display = "sites")
plot(env.mike, p.max=0.1)
tmp = with(rda.env.mike, ordisurf(RDA.mike, season, add =TRUE))
with(rda.env.mike, ordisurf(RDA.mike, seasonsummer, add = TRUE, col="green4"))

mod.mike = varpart(rda.data.mike, ~ season + weekend + Mean_TemperatureC+ X_Mean_Humidity+ is_there_rain + Precipitationmm , ~ location, ~batch, data = rda.env.mike, na.rm=TRUE)


mod = varpart(rda.data, ~ season + weekend + Mean_TemperatureC+ X_Mean_Humidity+ is_there_rain + Precipitationmm, ~ location+ownership, ~batch, data = rda.env)

RDA.nolocation = rda(rda.data ~ geo2  + ownership + duration + season + total.particle + temperature + humid + batch, rda.env, na.action = na.exclude)
RDA.noseason = rda(rda.data ~ geo2 + ownership + duration  + total.particle + temperature + humid + batch, rda.env, na.action = na.exclude)
#season contributes 61.35 variances explained, total variances about 2417.97 
RDA = rda(rda.data.campus ~ location + duration + season  + total.particle + temperature + humid + batch, rda.env.campus,na.action = na.exclude)
RDA = rda(rda.data.patient1 ~ location + duration + season  + total.particle + temperature + humid + batch, rda.env.patient1,na.action = na.exclude)
RDA.2 = rda(rda.data ~ duration + temperature + season  + total.particle + humid + batch, rda.env, na.action=na.exclude)
RDA.2 = rda(rda.data.campus ~ duration + temperature + season  + total.particle + humid + batch, rda.env.campus, na.action=na.exclude)
RDA.2 = rda(rda.data.patient1 ~ duration + temperature + season  + total.particle + humid + batch, rda.env.patient1, na.action=na.exclude)
anova(RDA)
plot(RDA)
anova(RDA.2)
plot(RDA.2)


#meta plots for environmental data
#reoder levels for legend
rda.env$season = factor(rda.env$season, levels=c("spring", "summer", "fall", "winter"))
rda.env.mike$season = factor(rda.env.mike$season, levels=c("spring", "summer", "fall", "winter"))
rda.env.campus$season = factor(rda.env.campus$season, levels=c("spring", "summer", "fall", "winter"))
rda.env.patient1$season = factor(rda.env.patient1$season, levels=c("spring", "summer", "fall", "winter"))
#function that takes in vector of data and a coefficient,
#returns boolean vector if a certain point is an outlier or not
check_outlier <- function(v, coef=1.5){
  quantiles <- quantile(v,probs=c(0.25,0.75))
  IQR <- quantiles[2]-quantiles[1]
  res <- v < (quantiles[1]-coef*IQR)|v > (quantiles[2]+coef*IQR)
  return(res)
}


label = rep("", dim(rda.env)[1])
label[check_outlier(rda.env$temperature)] = as.character(rda.env$location[check_outlier(rda.env$temperature)])
####this plot is kinda tricky, pay attention to parent-daughter function inheritence
p = ggplot(rda.env.patient1, aes(x = date.month, y = temperature))
p + stat_smooth() + geom_boxplot(aes(group = date.month, color=season)) + geom_point(aes(color=season)) +scale_x_discrete(limits=c(1:12)) 

p = ggplot(rda.env.patient1, aes(x = date.month, y = temperature))
p + stat_smooth() + geom_point(aes(color=season)) +scale_x_discrete(limits=c(1:12)) 

label = rep("", dim(rda.env)[1])
label[check_outlier(log(rda.env$particle))] = as.character(rda.env$location[check_outlier(log(rda.env$particle))])

p = ggplot(rda.env, aes(x = season, y = log(particle), color=season, size = sd.particle, group = season))
p + geom_boxplot() + geom_jitter(aes(color=season), width=0.2) + geom_text_repel(aes(label=label), na.rm=TRUE, size=3) + theme_grey(base_size=15)

label = rep("", dim(rda.env)[1])
label[check_outlier(rda.env$humid)] = as.character(rda.env$location[check_outlier(rda.env$humid)])

p = ggplot(rda.env.patient1, aes(x = date.month, y = humid))
p + stat_smooth() + geom_boxplot(aes(group=date.month, color=season)) + geom_point(aes(color=season)) +scale_x_discrete(limits=c(1:12)) #+ geom_text_repel(aes(label=label), na.rm=TRUE, size=3) 

p = ggplot(rda.env.patient1, aes(x = date.month, y = humid))
p + stat_smooth()  + geom_point(aes(color=season)) +scale_x_discrete(limits=c(1:12)) #+ geom_text_repel(aes(label=label), na.rm=TRUE, size=3) 

#apply this to our data
check_outlier(rda.env$duration)#this checks the outlier of the vector
label = rep("", dim(rda.env)[1])
label[check_outlier(rda.env$duration)] = as.character(rda.env$ownership[check_outlier(rda.env$duration)])
p = ggplot(rda.env, aes(x = season, y = duration, color=season))
p + geom_boxplot() + geom_jitter() + geom_text_repel(aes(label=label), na.rm=TRUE)
```
##GLM for every feature
```{r GLM for every feature}
#library(hier.part) #can only take 13 variables, TRASH
library(MuMIn)
library(hier.part)
library(relaimpo)
library(ggtern)
library(ggalt)

if(FALSE){
#direct test based only on owner for four people data
sss = cbind.data.frame(t(fourpeople.select.master.asinh.cpm), owner = fourpeople.select$aownership, location = fourpeople.select$location)
sss.melt = melt(sss, id.vars = c("owner", "location"))
sss.results = compare_means(formula = value ~ location, data = sss.melt, group.by = "variable", p.adjust.method = "fdr", method= "kruskal.test" )

#examining patterns for the four people study
 ggplot(sss, aes(x=location,  y=Alkanindiges, label=owner)) + geom_boxplot() + geom_beeswarm() + geom_text_repel() + stat_compare_means()
}

### Three groups technical, env, locabehav
#all.select.db 
imp.data = rda.data[ , order(colSums(rda.data),decreasing=TRUE)] 
imp.data = imp.data[, apply(imp.data, 2, function(x){ sum(x>0)>=100})] #occuring in more than 100 samples
imp.df = data.frame(matrix(nrow=12))
imp.rsq = c()
imp.adjrsq = c()
imp.p = c()

#can only work on 12 variables or less so far, hence cutoff set at 12, the last few are the least important anyway.
if(FALSE){
for (name in colnames(imp.data)){
  print(name)
  hier.results = hier.part(rda.data[,name], all.select.db[,1:12], barplot = "FALSE", gof ="Rsqu")
  lm.fit = lm(rda.data[,name]~., data=all.select.db[,1:12])
  lm.summary = summary(lm.fit)
  imp.rsq = c(imp.rsq, lm.summary$r.squared)
  imp.adjrsq = c(imp.adjrsq, lm.summary$adj.r.squared)
  imp.df = cbind(imp.df, hier.results$I.perc)
  print(hier.results$I.perc)
  imp.p = c(imp.p, pf(lm.summary$fstatistic[1], lm.summary$fstatistic[2], lm.summary$fstatistic[3], lower.tail=F))
}
imp.df = imp.df[, -1]
colnames(imp.df) = colnames(imp.data)
imp.df.final = rbind(imp.df, R2 = imp.rsq, adjR2 = imp.adjrsq, p = imp.p)
imp.df.final = rbind(imp.df.final, padjust = p.adjust(imp.df.final["p",], method="fdr"))
imp.df.fin = t.data.frame(imp.df.final)

write.csv(file = "~/Documents/Expo_paper/fvarpart.csv", x = imp.df.fin)
}


#using calc.relimp from the relaimpo package, no limitation on the number of testing variables imposed
relaimp.df = data.frame(matrix(nrow=ncol(all.select.db)))
relaimp.rsq = c()
relaimp.adjrsq = c()
relaimp.p = c()
relaimp.var.y = c()
for (name in colnames(imp.data)){
  name="Stereum"
  print(name)
  lm.fit = lm(rda.data[,name]~., data=all.select.db)
  lm.rela = calc.relimp(lm.fit, type = c("lmg"), rela = TRUE) #this function actually has a groups option to combine variables into groups
  relaimp.df = cbind(relaimp.df, lm.rela@lmg)
  relaimp.var.y = c(relaimp.var.y, lm.rela@var.y)
  lm.summary = summary(lm.fit)
  relaimp.rsq = c(relaimp.rsq, lm.summary$r.squared)
  relaimp.adjrsq = c(relaimp.adjrsq, lm.summary$adj.r.squared)
  relaimp.p = c(relaimp.p, pf(lm.summary$fstatistic[1], lm.summary$fstatistic[2], lm.summary$fstatistic[3], lower.tail=F))
}

relaimp.df = relaimp.df[, -1]
colnames(relaimp.df) = colnames(imp.data)
relaimp.df = relaimp.df * 100
relaimp.df.final = rbind(relaimp.df, R2 = relaimp.rsq, adjR2 = relaimp.adjrsq, p = relaimp.p, yvar = relaimp.var.y)
relaimp.df.final = rbind(relaimp.df.final, padjust = p.adjust(relaimp.df.final["p",], method="fdr"))
relaimp.df.fin = t.data.frame(relaimp.df.final)
relaimp.df.fin = as.data.frame(relaimp.df.fin)

write.csv(file = "~/Documents/Expo_paper/fvarpart_final_2.csv", x = relaimp.df.fin)

env.index = c(3,4,6,7,9,11,14,17)
locabehav.index = c(2,5,8,10,12,13,15,16)
tech.index = c(1)
env.imp = apply(relaimp.df.fin, 1, function(x){sum(x[env.index])})
locabehav.imp = apply(relaimp.df.fin, 1, function(x){sum(x[locabehav.index])})
tech.imp = apply(relaimp.df.fin, 1, function(x){sum(x[tech.index])})
r2 = as.numeric(relaimp.df.fin$R2)
input = read.csv("~/Documents/Expo_paper/fvarpart_final.csv") #contains taxonomy info, otherwsie same to fvarpart_final_2.csv
taxonomy = input$tax
tern.df.raw = cbind.data.frame(Env = as.numeric(env.imp), Spa = as.numeric(locabehav.imp), Tec = as.numeric(tech.imp), R2 = r2, Padj = relaimp.df.fin$padjust, taxonomy = taxonomy)
rownames(tern.df.raw) = rownames(relaimp.df.fin)
tern.df.raw = tern.df.raw[as.numeric(tern.df.raw$Padj)< 0.05,]

tern.df = cbind(Env= as.numeric(env.imp)/100*r2, Spa=as.numeric(locabehav.imp)/100*r2, Tec=as.numeric(tech.imp)/100*r2, R2 = r2, Padj=relaimp.df.fin$padjust, taxonomy = taxonomy)
rownames(tern.df) = rownames(relaimp.df.fin)
tern.df = as.data.frame(tern.df)
tern.df = tern.df[as.numeric(tern.df$Padj)< 0.05,]

tern.df[,1:5] = apply(tern.df[,1:5], 2, as.numeric)
tern.df$taxonomy = factor(tern.df$taxonomy, levels = c("bacteria",  "fungi", "plant", "animal"))
tern.df$dominating = rep("norm", nrow(tern.df))
tern.df$dominating[tern.df$Env/tern.df$Spa>=1.5] = "Env"
tern.df$dominating[tern.df$Spa/tern.df$Env>=1.5] = "Spa"
tern.melt = melt(tern.df, id.vars=c("Env", "Spa", "Tec", "R2","Padj", "dominating"))
tern.melt$value = factor(tern.melt$value, levels = c("bacteria",  "fungi", "plant", "animal"))
tern.melt$dominating = factor(tern.melt$dominating, levels = c("norm", "Env", "Spa"))

#original
ggtern(tern.melt,aes(Env,Spa,Tec, size=R2, color=value, group=value, fill=value)) + 
  geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) +
  geom_point(alpha=0.5) +
  #geom_encircle(alpha=0.1,size=1) +
  theme_bw(base_size = 20)  +
   theme_showarrows() + 
  theme_notitles() +
  guides(shape=FALSE) +
  scale_color_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4"))+
  scale_fill_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4"))+
   facet_wrap( ~ value,scales="free", ncol=4) 

#introducing dominating influences
ggtern(tern.melt,aes(Env,Spa,Tec, size=R2, color=dominating, group=value)) + 
  geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) +
  geom_point(alpha=0.5) +
  #geom_encircle(alpha=0.1,size=1) +
  theme_bw(base_size = 20)  +
   theme_showarrows() + 
  theme_notitles() +
  guides(shape=FALSE) +
  scale_color_manual(values=c("#F8766D","#00AFBB", "#E7B800", "#00BFC4"))+
  scale_fill_manual(values=c("#F8766D", "#00AFBB", "#E7B800", "#00BFC4"))+
   facet_wrap( ~ value,scales="free", ncol=4) 


#plot all
ggtern(tern.melt,aes(Env,Spa,Tec, color=dominating, size=R2, shape = value)) + 
  #geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) +
  geom_point(alpha=0.5) +
  #geom_encircle(alpha=0.1,size=1) +
  theme_bw(base_size = 20)  +
   theme_showarrows() + 
  theme_notitles() +
  scale_color_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4"))+
  scale_fill_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4"))
   #facet_wrap( ~ value,scales="free", ncol=4) 

#seems that have to do variance partition the hardcore way, planning to use relative of importance per variables as the value to calculate variance partitioning
  
#individual testing
fungi.thre = quantile(tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$R2, 0.5)
sum(tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$Spa/tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$Env <= 1 & tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$R2>=fungi.thre) #Spa < env
View(subset(tern.df.raw, taxonomy == "fungi" & Env/Spa >= 1.5 & R2>= fungi.thre))

sum(tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$Spa/tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$Env >= 1 & tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$R2>=fungi.thre) #Spa > env
View(subset(tern.df.raw, taxonomy == "fungi" & Spa/Env >= 1.5 & R2>= fungi.thre))

chisq.test(x=rbind(c(72/2,72/2),c(50,22))) #testing

bac.thre = quantile(tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$R2, 0.5)
sum(tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$Spa/tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$Env <= 1 & tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$R2>=bac.thre) #Spa < env
View(subset(tern.df.raw, taxonomy == "bacteria" & Env/Spa >= 1.5 & R2>= bac.thre))

sum(tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$Spa/tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$Env >= 1 & tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$R2>=bac.thre) #Spa > env
View(subset(tern.df.raw, taxonomy == "bacteria" & Spa/Env >= 1.5 & R2>= bac.thre))

chisq.test(x=rbind(c(21/2,21/2),c(13,8))) 

plant.thre = quantile(tern.df.raw[tern.df.raw$taxonomy=="plant", ]$R2, 0.5)
sum(tern.df.raw[tern.df.raw$taxonomy=="plant", ]$Spa/tern.df.raw[tern.df.raw$taxonomy=="plant", ]$Env <= 1 & tern.df.raw[tern.df.raw$taxonomy=="plant", ]$R2>=plant.thre) #Spa < env
View(subset(tern.df.raw, taxonomy == "plant" & Env/Spa >= 1.5))

sum(tern.df.raw[tern.df.raw$taxonomy=="plant", ]$Spa/tern.df.raw[tern.df.raw$taxonomy=="plant", ]$Env >= 1 & tern.df.raw[tern.df.raw$taxonomy=="plant", ]$R2>=plant.thre) #Spa > env
View(subset(tern.df.raw, taxonomy == "plant" & Spa/Env >= 1.5))

```
##GLM test bootstrapping
```{r GLM for every feature}
#library(hier.part) #can only take 13 variables, TRASH
library(MuMIn)
library(hier.part)
library(relaimpo)
library(ggtern)
library(ggalt)

### Three groups technical, env, locabehav
#all.select.db 
imp.data = rda.data[ , order(colSums(rda.data),decreasing=TRUE)] 
imp.data = imp.data[, apply(imp.data, 2, function(x){ sum(x>0)>=100})] #occuring in more than 100 samples
#using calc.relimp from the relaimpo package, no limitation on the number of testing variables imposed
cicalc = function(delta, ref){ #90% confidence interval
  delta = sort(delta)
  range = quantile(delta, c(0.05, 0.95))
  ratio.ci = as.numeric(ref) - as.numeric(c(range[2], range[1]))
  names(ratio.ci) = c("5%", "95%")
  names(ref) = "reference"
  return(c(ref, ratio.ci))
}
ecdf_fun = function(x,perc) {ecdf(x)(perc)} #retrun percentile given value in a distribution


#This part takes a lot of time, about 20 hours, result is saved as a rds object
relaimp.df = data.frame(matrix(nrow=3))
relaimp.rsq = c()
relaimp.adjrsq = c()
relaimp.p = c()
relaimp.var.y = c()
ratio.master = list()

for (name in as.character(input$X[input$padjust<0.05])){ #only on genera that occured in more than 100 samples
  #name="Stereum"
  print(name)
  lm.fit = lm(rda.data[,name]~., data=all.select.db)
  lm.rela = calc.relimp(lm.fit, type = c("lmg"), rela = TRUE, groups = list(env= c(3,4,6,7,9,11,14,17)+1, spa=c(2,5,8,10,12,13,15,16)+1, tech=c(1)+1)) #this function actually has a groups option to combine variables into groups
  #group 1 is env, group 2 is spa, group 3 is tech
  reference.temp = lm.rela@lmg
  #relaimp.df = cbind(relaimp.df, lm.rela@lmg)
  relaimp.var.y = c(relaimp.var.y, lm.rela@var.y)
  lm.summary = summary(lm.fit)
  relaimp.rsq = c(relaimp.rsq, lm.summary$r.squared)
  relaimp.adjrsq = c(relaimp.adjrsq, lm.summary$adj.r.squared)
  refratio = reference.temp[1]/reference.temp[2]
  refenv = reference.temp[1]
  refloc = reference.temp[2]
  reftech = reference.temp[3]
  
  data.slice = rda.data[,name]
  set.seed = 123
  bootstrapping.matrix = replicate(9999, sample(1:283, 283, replace = TRUE)) #each COLUMN is a sampling
  relaimp.df.temp = data.frame(matrix(nrow=3))
  #relaimp.var.y.temp = c()
  for (i in 1:ncol(bootstrapping.matrix)){
    #print(i)
    rda.data.temp = data.slice[bootstrapping.matrix[,i]]
    all.select.db.temp = all.select.db[bootstrapping.matrix[,i],]
    lm.fit.temp = lm(rda.data.temp~., data=all.select.db.temp)
    lm.rela.temp = calc.relimp(lm.fit.temp, type = c("lmg"), rela = TRUE, groups = list(env= c(3,4,6,7,9,11,14,17)+1, spa=c(2,5,8,10,12,13,15,16)+1, tech=c(1)+1))  #this function actually has a groups option to combine variables into groups
    relaimp.df.temp = cbind(relaimp.df.temp, lm.rela.temp@lmg)
    #relaimp.var.y.temp = c(relaimp.var.y.temp, lm.rela.temp@var.y)
  }
  relaimp.df.temp = relaimp.df.temp[,-1]
  resample.ratio = unlist(relaimp.df.temp[1,]/relaimp.df.temp[2,])
  resample.env = unlist(relaimp.df.temp[1,])
  resample.loc = unlist(relaimp.df.temp[2,])
  resample.tech = unlist(relaimp.df.temp[3,])
  
  deltaratio = refratio - resample.ratio
  deltaenv =refenv - resample.env
  deltaloc = refloc - resample.loc
  deltatech = reftech - resample.tech
  
  #range = quantile(deltaratio.sort, c(0.05, 0.95))
  #ratio.ci = as.numeric(refratio) - as.numeric(c(range[2], range[1]))
  #names(ratio.ci) = c("5%", "95%")
  #names(refratio) = "reference"
  #ratio.ci
  ratio.master[[name]] = data.frame(ratio = cicalc(deltaratio, refratio),
                                    env = cicalc(deltaenv, refenv),
                                    loc = cicalc(deltaloc, refloc),
                                    tech = cicalc(deltatech, reftech),
                                    onequantile = ecdf_fun(resample.ratio,1) #quantile of probability of env/loc <= 1
                                    )
}

#saveRDS(object = ratio.master, file = "~/Documents/Revision2/bootstrapping.rds")
ratio.master = readRDS(file =  "~/Documents/Revision2/bootstrapping.rds")

#extracting all information pertaining different group of variables
moreenv = c()
moreloc = c()
rest = c()
allquantile = c()
allratio = c()
allratioupper = c()
allratiolower = c()
allenv = c()
allenvupper = c()
allenvlower = c()
allloc = c()
alllocupper = c()
allloclower = c()
alltech = c()
alltechupper = c()
alltechlower = c()
#chance of observing env/loc <= 1 in bootstrapping samples
for(speciesname in names(ratio.master)){
  allquantile = c(allquantile, ratio.master[[speciesname]]$onequantile[1])
  allratio = c(allratio, ratio.master[[speciesname]]$ratio[1])
  allratiolower = c(allratiolower, ratio.master[[speciesname]]$ratio[2])
  allratioupper = c(allratioupper, ratio.master[[speciesname]]$ratio[3])
  
  allenv = c(allenv, ratio.master[[speciesname]]$env[1])
  allenvlower =c(allenvlower, ratio.master[[speciesname]]$env[2])
  allenvupper = c(allenvupper, ratio.master[[speciesname]]$env[3])
  
  allloc = c(allloc, ratio.master[[speciesname]]$loc[1])
  allloclower =c(allloclower, ratio.master[[speciesname]]$loc[2])
  alllocupper = c(alllocupper, ratio.master[[speciesname]]$loc[3])
  
   alltech = c(alltech, ratio.master[[speciesname]]$tech[1])
  alltechlower =c(alltechlower, ratio.master[[speciesname]]$tech[2])
  alltechupper = c(alltechupper, ratio.master[[speciesname]]$tech[3])
  
  temp.value = ratio.master[[speciesname]]$onequantile[1]
  if (temp.value >= 0.8) {
    print(temp.value)
    moreloc = c(moreloc, speciesname)} else if (temp.value <= 0.2) {
      print(temp.value)
      moreenv = c(moreenv, speciesname)} else {rest = c(rest, speciesname)}
}
#relaimp.rsq = c()
#relaimp.adjrsq = c()
#relaimp.p = c()
#relaimp.var.y = c()

taxinfo = read.csv("~/Documents/Expo_paper/fvarpart_final_v3.csv") #contains phylum info
quantile.df.report = data.frame(name = names(ratio.master),  #for paper tables
                         Prob_of_spa_dominating = allquantile,#chance of observing env/loc <= 1 in bootstrapping samples
                         env_spa_ratio = allratio,
                         ratioupper = allratioupper,
                         ratiolower = allratiolower,
                         env = allenv,
                         envupper = allenvupper,
                         envlower = allenvlower,
                         spa = allloc,
                         spaupper = alllocupper,
                         spalower = allloclower,
                         tech = alltech,
                         techupper = alltechupper,
                         techlower = alltechlower,
                         rsq = relaimp.rsq, 
                         adjrsq = relaimp.adjrsq,
                         taxon = input$tax[match(names(ratio.master),input$X)]
                        )
quantile.df.report$phylum = taxinfo$phylum[match(quantile.df.report$name, taxinfo$X)]
write.csv(file = "~/Documents/Revision2/alldataCI.csv", x = quantile.df.report) #contains all relevant information for confidence interval estimation, dominating influence definition, and ternary plots

quantile.df = data.frame(name = names(ratio.master), 
                         onequantile = allquantile,#chance of observing env/loc <= 1 in bootstrapping samples
                         ratio = allratio,
                         ratioupper = allratioupper,
                         ratiolower = allratiolower,
                         rsq = relaimp.rsq, 
                         adjrsq = relaimp.adjrsq,
                         vary = relaimp.var.y,
                         taxon = input$tax[match(names(ratio.master),input$X)]
                        )
#saveRDS(quantile.df, file= "~/Documents/Revision2/quantile.df.rds") # simplied format of the quantile.df.report          
taxinfo = read.csv("~/Documents/Expo_paper/fvarpart_final_v3.csv") #contains phylum info
quantile.df$phylum = taxinfo$phylum[match(quantile.df$name, taxinfo$X)]
quantile.df = quantile.df[order(quantile.df$adjrsq,decreasing =TRUE),]
quantile.df.select = quantile.df[1:199,] #taking all significant models
#View(quantile.df.select)
View(subset(quantile.df.select, taxon == "fungi"))
#chance of observing env/loc <= 1 in bootstrapping samples
View(subset(quantile.df.select, onequantile >= 0.9 & taxon == "fungi")) #new definition of dominating envriomnetal
View(subset(quantile.df.select, onequantile <= 0.1 & taxon == "fungi")) #new definition of dominating spatial
#find quantile of certain value
View(subset(quantile.df.select, onequantile >= 0.9 & taxon == "bacteria")) #new definition of dominating envriomnetal
View(subset(quantile.df.select, onequantile <= 0.1 & taxon == "bacteria"))

View(subset(quantile.df.select, onequantile >= 0.9 & taxon == "plant")) #new definition of dominating envriomnetal
View(subset(quantile.df.select, onequantile <= 0.1 & taxon == "plant"))

View(subset(quantile.df.select, onequantile >= 0.9 & taxon == "animal")) #new definition of dominating envriomnetal
View(subset(quantile.df.select, onequantile <= 0.1 & taxon == "animal"))

#View(quantile.df[quantile.df$ratio >= 2 | quantile.df$ratio <= 1/2, ])
#View(quantile.df[quantile.df$onequantile >= 0.9 | quantile.df$onequantile <=0.1, ])
#View(quantile.df[quantile.df$onequantile >= 0.95 | quantile.df$onequantile <=0.05, ])

quantile.df.report$taxon = factor(quantile.df.report$taxon, levels = c("bacteria", "fungi", "plant", "animal")) #coordinating color
#original
#changing names of columns, only for plotting reasons
colnames(quantile.df.report)[c(6,9,12,15,16,17,18)] = c("Env", "Spa", "Tec", "R2", "Adj.R2", "Taxon", "Phylum")
ggtern(quantile.df.report,aes(Env,Spa,Tec, size=R2, color=Taxon, group=Taxon, fill=Taxon)) + 
  geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) +
  geom_point(alpha=0.5) +
  #geom_encircle(alpha=0.1,size=1) +
  theme_bw(base_size = 20)  +
   theme_showarrows() + 
  theme_notitles() +
  guides(shape=FALSE) +
  scale_color_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4"))+
  scale_fill_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4"))+
   facet_wrap( ~ Taxon,scales="free", ncol=4) 

#preparing dominating variable
quantile.df.report$Dominating = "Neither"
quantile.df.report$Dominating[quantile.df.report$Prob_of_spa_dominating>=0.9] = "Spa"
quantile.df.report$Dominating[quantile.df.report$Prob_of_spa_dominating<=0.1] = "Env"
quantile.df.report$Dominating = factor(quantile.df.report$Dominating, levels = c("Neither", "Env", "Spa"))
#introducing dominating influences
ggtern(quantile.df.report,aes(Env,Spa,Tec, size=R2, color=Dominating, group=Taxon)) + 
  geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) +
  geom_point(alpha=0.5) +
  #geom_encircle(alpha=0.1,size=1) +
  theme_bw(base_size = 20)  +
   theme_showarrows() + 
  theme_notitles() +
  guides(shape=FALSE) +
  scale_color_manual(values=c("#F8766D","#00AFBB", "#E7B800", "#00BFC4"))+
  scale_fill_manual(values=c("#F8766D", "#00AFBB", "#E7B800", "#00BFC4"))+
   facet_wrap( ~ Taxon,scales="free", ncol=4) 

#plotting each domain of life separately for better control
ggtern(subset(quantile.df.report, Taxon == "fungi"),aes(Env,Spa,Tec, size=R2, color=Dominating, group=Taxon, shape = Phylum)) + 
  geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) +
  geom_point(alpha=0.5) +
  #geom_encircle(alpha=0.1,size=1) +
  theme_bw(base_size = 20)  +
   theme_showarrows() + 
  theme_notitles() +
  guides(shape=FALSE) +
  scale_color_manual(values=c("#778899","#00AFBB", "#E7B800", "#00BFC4"))+
  scale_fill_manual(values=c("#778899", "#00AFBB", "#E7B800", "#00BFC4"))+
       scale_shape_manual(values = c(16, 18)) + 
    scale_size_continuous(limits = c(0.1, 0.4))
   #facet_wrap( ~ Taxon,scales="free", ncol=4) 

#bacteria
ggtern(subset(quantile.df.report, Taxon == "bacteria"),aes(Env,Spa,Tec, size=R2, color=Dominating, group=Taxon)) + 
  geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) +
  geom_point(alpha=0.5) +
  #geom_encircle(alpha=0.1,size=1) +
  theme_bw(base_size = 20)  +
   theme_showarrows() + 
  theme_notitles() +
  guides(shape=FALSE) +
  scale_color_manual(values=c("#778899","#00AFBB", "#E7B800", "#00BFC4"))+
  scale_fill_manual(values=c("#778899", "#00AFBB", "#E7B800", "#00BFC4"))+
      scale_shape_manual(values = c(16, 18)) + 
    scale_size_continuous(limits = c(0.1, 0.4)) #make the smaller dots bigger when sample size is smaller

#plant
ggtern(subset(quantile.df.report, Taxon == "plant"),aes(Env,Spa,Tec, size=R2, color=Dominating, group=Taxon, shape = Phylum)) + 
  geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) +
  geom_point(alpha=0.5) +
  #geom_encircle(alpha=0.1,size=1) +
  theme_bw(base_size = 20)  +
   theme_showarrows() + 
  theme_notitles() +
  guides(shape=FALSE) +
  scale_color_manual(values=c("#778899","#00AFBB", "#E7B800", "#00BFC4"))+
  scale_fill_manual(values=c("#778899", "#00AFBB", "#E7B800", "#00BFC4"))+
     scale_shape_manual(values = c(16, 18)) + 
    scale_size_continuous(limits = c(0.1, 0.3))

#ANIAMLS
ggtern(subset(quantile.df.report, Taxon == "animal"),aes(Env,Spa,Tec, size=R2, color=Dominating, group=Taxon, shape = Phylum)) + 
  geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) +
  geom_point(alpha=0.5) +
  #geom_encircle(alpha=0.1,size=1) +
  theme_bw(base_size = 20)  +
   theme_showarrows() + 
  theme_notitles() +
  guides(shape=FALSE) +
  scale_color_manual(values=c("#778899","#00AFBB", "#E7B800", "#00BFC4"))+
  scale_fill_manual(values=c("#778899", "#00AFBB", "#E7B800", "#00BFC4"))+
    scale_shape_manual(values = c(16, 18)) + 
    scale_size_continuous(limits = c(0.10, 0.35))
   #facet_wrap( ~ Taxon,scales="free", ncol=4) 


#plot all
ggtern(quantile.df.report,aes(Env,Spa,Tec, size=R2, color=dominating, shape=Taxon)) + 
  #geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) +
  geom_point(alpha=0.5) +
  #geom_encircle(alpha=0.1,size=1) +
  theme_bw(base_size = 20)  +
   theme_showarrows() + 
  theme_notitles() +
  scale_color_manual(values=c("#778899","#00AFBB", "#E7B800", "#00BFC4"))+
  scale_fill_manual(values=c("#778899","#00AFBB", "#E7B800", "#00BFC4"))

   #facet_wrap( ~ value,scales="free", ncol=4) 

#chi-sq test
#143 fungi, 60 ascomycota, 83 basidiomycota
#%80
#dominating environmental 
chisq.test(x=rbind(c(29*60/143,29*83/143),c(0,29)))
#dominating spatial
chisq.test(x=rbind(c(35*60/143,35*83/143),c(26,9))) 

#%90
#dominating env 20/20
chisq.test(x=rbind(c(20*60/143,20*83/143),c(0,20)))
#dominating spatial 19/21
chisq.test(x=rbind(c(21*60/143,21*83/143),c(19,2)))

```

##GLM test permutation test, this is to calculate P values of every measure statistics, noted the p value is 1-calculated value
```{r GLM for every feature}
#library(hier.part) #can only take 13 variables, TRASH
library(MuMIn)
library(hier.part)
library(relaimpo)
library(ggtern)
library(ggalt)

### Three groups technical, env, locabehav
#all.select.db 
imp.data = rda.data[ , order(colSums(rda.data),decreasing=TRUE)] 
imp.data = imp.data[, apply(imp.data, 2, function(x){ sum(x>0)>=100})] #occuring in more than 100 samples
#using calc.relimp from the relaimpo package, no limitation on the number of testing variables imposed

ecdf_fun = function(x,perc) {ecdf(x)(perc)} #retrun percentile given value in a distribution


#This part takes a lot of time, about 20 hours, result is saved as a rds object
#relaimp.df = data.frame(matrix(nrow=3))
#relaimp.rsq = c()
#relaimp.adjrsq = c()
#relaimp.p = c()
#relaimp.var.y = c()
perm.master = list()

for (name in as.character(input$X[input$padjust<0.05])){ #only on genera that occured in more than 100 samples
  #name="Stereum"
  print(name)
  lm.fit = lm(rda.data[,name]~., data=all.select.db)
  lm.rela = calc.relimp(lm.fit, type = c("lmg"), rela = FALSE, groups = list(env= c(3,4,6,7,9,11,14,17)+1, spa=c(2,5,8,10,12,13,15,16)+1, tech=c(1)+1)) #this function actually has a groups option to combine variables into groups
  #group 1 is env, group 2 is spa, group 3 is tech
  reference.temp = lm.rela@lmg
  #relaimp.df = cbind(relaimp.df, lm.rela@lmg)
  refratio = reference.temp[1]/reference.temp[2]
  refenv = reference.temp[1]
  refloc = reference.temp[2]
  reftech = reference.temp[3]
  data.slice = rda.data[,name]
  set.seed = 123
  bootstrapping.matrix = replicate(10000, sample(1:283, 283, replace = FALSE)) #each COLUMN is a sampling
  relaimp.df.temp = data.frame(matrix(nrow=3))
  #relaimp.var.y.temp = c()
  for (i in 1:ncol(bootstrapping.matrix)){
    #print(i)
    rda.data.temp = data.slice #only permutates data labels
    all.select.db.temp = all.select.db[bootstrapping.matrix[,i],] #only permutate data labels
    lm.fit.temp = lm(rda.data.temp~., data=all.select.db.temp)
    lm.rela.temp = calc.relimp(lm.fit.temp, type = c("lmg"), rela = FALSE, groups = list(env= c(3,4,6,7,9,11,14,17)+1, spa=c(2,5,8,10,12,13,15,16)+1, tech=c(1)+1))  #this function actually has a groups option to combine variables into groups
    relaimp.df.temp = cbind(relaimp.df.temp, lm.rela.temp@lmg)
    #relaimp.var.y.temp = c(relaimp.var.y.temp, lm.rela.temp@var.y)
  }
  relaimp.df.temp = relaimp.df.temp[,-1] #first column was artifical
  #permutation results
  resample.ratio = unlist(relaimp.df.temp[1,]/relaimp.df.temp[2,]) 
  resample.env = unlist(relaimp.df.temp[1,])
  resample.loc = unlist(relaimp.df.temp[2,])
  resample.tech = unlist(relaimp.df.temp[3,])
  
  ratiopercentile = ecdf_fun(resample.ratio, refratio)
  envpercentile =ecdf_fun(resample.env, refenv) #essentially 1, so lower than minimum possible p value
  locpercentile = ecdf_fun(resample.loc, refloc) #essentially 1, so lower than minimum possible p value
  techpercentile = ecdf_fun(resample.tech, reftech)
  
  #range = quantile(deltaratio.sort, c(0.05, 0.95))
  #ratio.ci = as.numeric(refratio) - as.numeric(c(range[2], range[1]))
  #names(ratio.ci) = c("5%", "95%")
  #names(refratio) = "reference"
  #ratio.ci
  perm.master[[name]] = data.frame(ratioperc = ratiopercentile,
                                    env = envpercentile,
                                    loc = locpercentile,
                                    tech = techpercentile
                                    )
  
}
saveRDS(object = perm.master, file = "~/Documents/Revision2/permutation.rds")
perm.master.matrix = matrix(unlist(perm.master), ncol=4)
rownames(perm.master.matrix) = names(perm.master)
colnames(perm.master.matrix) = c("ratioperc", "env", "loc", "tech")
```

A remarkable feature of the `glmnet` function is that it fits the model not only for one choice of $\lambda$, but for all possible $\lambda$s at once. For now, let's look at the prediction performance for, say, $\lambda=0.04$. The name of the function parameter is `s`:
```{general_nested_cv_function}
#data.frame has features as rows and samples as columns
cvnested = function(data.frame, k, p){ #data.frame to train on, k is for outter fold, p is for inner fold
  print(ncol(data.frame))
  flds <- createFolds(1:ncol(data.frame), k = k, list = TRUE, returnTrain = FALSE)
  #print(length(flds))
  for (i in 1:length(flds)){#outter
    test.outter = data.frame[, c(flds[[i]])]
    #print(flds[[i]])
    train.outter = data.frame[, -c(flds[[i]])]
    #print(dim(train.outter))
    
    #the following part is needed for models without CV builtin to evaluate model parameter, such like lambda in glmnet, glmnet for example would not need this step
    flds.inner = createFolds(1:ncol(train.outter), k = p, list = TRUE, returnTrain = FALSE)
    for (j in 1:length(flds.inner)){#inner
      test.inner = train.outter[, c(flds.inner[[j]])]
      #print(flds.inner[[j]])
      train.inner = train.outter[, -c(flds[[j]])]
      #print(dim(train.inner))
    }#inner
    
    #test model selected by inner on the outter test set
    
  }#outter
  #fit the whole dataset to modle chosen, final model 
}
cvnested(campushome.master.asinh.cpm, 10,10)
```
##ML for season======================Machine learning for season
```{r formal_glmnet_model_validation}
#data.frame has features as rows and samples as columns
#check model performance function, arguments are vectors of predicted and actual labels
check.model.performance = function(predicted.class, actual.class){
  
  result.tbl = as.data.frame(table(as.character(predicted.class),as.character(actual.class))) 
  colnames(result.tbl)[1:2] = c("Pred","Truth")
  #print(result.tbl)
  control = 0  
  ##MACRO averaging here
  for (class in unique(result.tbl$Truth) ){
    control = control + 1
    #print(class)
    tp = sum(result.tbl[result.tbl$Pred==class & result.tbl$Truth==class, "Freq"])
    tpandfp = sum(result.tbl[result.tbl$Pred == class , "Freq" ]) #TP plus FP
    fp = tpandfp - tp
    tpandfn = sum(result.tbl[result.tbl$Truth == class , "Freq" ]) #TP plus FN
    fn = tpandfn - tp
    tn = sum(result.tbl[result.tbl$Truth != class & result.tbl$Pred != class, "Freq"])
    #print(tn)
    presi = tp/tpandfp 
    rec = tp/tpandfn
    accuracy = (tp + tn)/(tp+tn+fp+fn)
    spec = tn/(tn + fp)
    F.score = 2*presi*rec/(presi+rec)
    MCC = ((tp*tn)-(fn*fp))/sqrt((tp+fn)*(tn+fp)*(tp+fp)*(tn+fn))
    if (control == 1 ) metrics = tibble(class = as.character(class), 
                                     presi = as.numeric(presi),
                                     recall = as.numeric(rec),
                                     accuracy = as.numeric(accuracy),
                                     spec = as.numeric(spec),
                                     F.score = as.numeric(F.score),
                                     MCC = as.numeric(MCC))
    if (control > 1 ) metrics = rbind(metrics,c(as.character(class), as.numeric(presi), as.numeric(rec), as.numeric(accuracy), as.numeric(spec), as.numeric(F.score), as.numeric(MCC)))
  }

  return(metrics)
}


#cvnest function, p is not used for glmnet predictions
cvnested = function(data.frame, truth.frame, external.frame, external.truth, k, p){ #data.frame to train on, k is for outter fold, p is for inner fold
  print(ncol(data.frame))
  flds <- createFolds(1:ncol(data.frame), k = k, list = TRUE, returnTrain = FALSE)
  auc = c()
  predict.values.df = data.frame(spring=c(), summer=c(), fall=c(), winter=c())
  truth.values = c()
  #print(length(flds))
  for (i in 1:length(flds)){#outter
    test.outter = data.frame[, c(flds[[i]])]
    truth.test.outter = truth.frame$season[c(flds[[i]])]
    #print(truth.test.outter)
    #print(flds[[i]])
    train.outter = data.frame[, -c(flds[[i]])]
    truth.train.outter = truth.frame$season[-c(flds[[i]])]
    fraction = table(truth.train.outter)/length(truth.train.outter)
    weights = 1 - fraction[as.character(truth.train.outter)]
    #print(truth.train.outter)
    #print(dim(train.outter))
    #cv.glmnet takes care of inner loop training
    cvglmfit = cv.glmnet(x = t(train.outter),
                y = factor(truth.train.outter),
                     family = "multinomial", weights = weights)
    s0 = cvglmfit$lambda.1se
    #hyper-parameter
    #fit train dataset with given hyper-parameter
    glmfit = glmnet(x = t(train.outter),
                y = factor(truth.train.outter),
                family = "multinomial")
    #predict on testing dataset
    pred = predict(glmfit, t(test.outter),
                   type = "class", s = s0)
    pred.values = predict(glmfit, t(test.outter),
                   type = "response", s = s0)
    pred.values.df = data.frame(pred.values)
    colnames(pred.values.df) = c("spring", "summer", "fall", "winter")
    predict.values.df = rbind(predict.values.df, pred.values.df)

    truth.values=c(truth.values, truth.test.outter)
    confusion_table = table(predicted = pred, truth = factor(truth.test.outter))
    #print(confusion_table)
    mROC = multiclass.roc(as.numeric(factor(truth.test.outter)), as.numeric(factor(pred[,1])))
    #print(auc(mROC))
    auc = c(auc, mROC$auc)
    #test model selected by inner to the outter test set
  }#outter
  #print(dim(predict.values.df))
  #print(rownames(predict.values.df))
  #print(truth.values) #this is not necessary as the truth remains to be the truth
    #reodering the dataframe to be consistent between resamplings.
    predict.values.df = predict.values.df[order(rownames(predict.values.df)),]
  
  #fit build model on whole dataset and testing on external dataset
    fraction = table(truth.frame$season)/length(truth.frame$season)
    weights = 1 - fraction[as.character(truth.frame$season)]
    cvglmfit.final = cv.glmnet(x = t(data.frame),
                y = factor(truth.frame$season),
                     family = "multinomial", weights = weights)
    s0.model = cvglmfit.final$lambda.1se
    glmfit.final = glmnet(x = t(data.frame),
                y = factor(truth.frame$season),
                family = "multinomial")
    #use on external dataset
    print(paste0("the s0 used is ", s0.model))
    pred.external = predict(glmfit.final, t(external.frame), type = "class", s=s0.model)
    confusion_table.external = table(predicted = factor(pred.external, levels=c("spring", "summer", "fall","winter")), truth = factor(external.truth$season))
    print(confusion_table.external)
    metrics = check.model.performance(factor(pred.external, levels=c("spring", "summer", "fall","winter")),factor(external.truth$season))
    #print(metrics)
    mROC.external = multiclass.roc(as.numeric(factor(external.truth$season)), as.numeric(factor(pred.external[,1])))
    print(mROC.external$auc)
  return(list(auc = auc, metrics=metrics, predict.values=predict.values.df))
}

#auc.result = cvnested(campushome.master.asinh.cpm, campushome, 10,5)
performance.result.cv = replicate(10,cvnested(Mike_NAmerica.master.asinh.cpm, Mike_NAmerica, patient1.master.asinh.cpm, Patient1, 10,5)) 
performance.result.cv = replicate(2,cvnested(Mike_NAmerica.master.asinh.cpm, Mike_NAmerica, patient1.master.asinh.cpm, Patient1, 10,5)) 
#saveRDS(performance.result.cv, "performance.result.rds")
#this combination seems to make sense so far, with mAUC around 0.7 for internal CV and external datasets.

q = ggplot(data.frame(auc=unlist(c(performance.result.cv[1,]))), aes(x="glmnet", y=auc))+
  geom_violin(width=0.5, fill="light blue",draw_quantiles = c(0.25, 0.5, 0.75))+
  geom_quasirandom(alpha=0.6, width = 0.1 ) + ylim(c(0,1)) + 
  annotate("text", x=1, y=0.3, label = paste0("mean mAUC = ", signif(mean(unlist(c(performance.result.cv[1,]))), 3))) +
   annotate("text", x=1, y=0.25, label = paste0("median mAUC = ", signif(median(unlist(c(performance.result.cv[1,]))), 3))) +scale_color_npg()+ ylab("mAUC")  + theme_pubr() + theme( axis.text.x=element_blank()) + xlab("Glmnet")
q = q + theme(axis.text=element_text(size=12),
        axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14))


#plotting other metrics, by MACRO average over classes
wtf =lapply(performance.result.cv[2,], function(x){apply(x[,-1], 2, function(x) { mean(as.numeric(x))})})
wtf.matrix = matrix(unlist(wtf), nrow=10, byrow=TRUE, dimnames = list(c(1:10),c("Precison","Sensitivity","Accuracy", "Specificity", "F.score", "MCC")))
wtf.melt = melt(wtf.matrix)

p = ggplot(wtf.melt, aes(x=Var2, y=value, color=Var2))  + geom_quasirandom(alpha=0.6, groupOnX=TRUE, bandwidth = 0.1, width=0.1, size=3)  + theme_pubr()    + theme(legend.title = element_blank()) + ylim(c(0,1))+ylab("Macro average value") +xlab("Performance metrics") + scale_color_npg()
#+ geom_violin( draw_quantiles = c(0.25, 0.5, 0.75))
p = p +theme(axis.text=element_text(size=10),
         axis.text.x = element_blank(),
        axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14))   +theme(legend.position="top")

ggplot_build(p)$data

ggarrange(q, p, ncol = 2, widths=(c(2,2)))
```
##ML for control/blank========
```{r ML for control/blank}
scores_single = scores[-grep("double|I15_2|I15_3", scores$samplenames, ignore.case=TRUE), ] 
master.asinh.cpm.cb = master.asinh.cpm[, scores_single$samplenames]
  #fit build model on whole dataset and testing on external dataset
  fraction = table(scores_single$Type)/length(scores_single$Type)
  weights = 1 - fraction[as.character(scores_single$Type)]
  cvglmfit.final = cv.glmnet(x = t(master.asinh.cpm.cb),
                y = factor(scores_single$Type),
                     family = "binomial", weights = weights)
  s0.model = cvglmfit.final$lambda.1se
  glmfit.final = glmnet(x = t(master.asinh.cpm.cb),
                y = factor(scores_single$Type),
                family = "binomial")
  pred.cb = predict(glmfit.final, t(master.asinh.cpm.cb), type = "class", s=s0.model-0.08)
  confusionMatrix(data=pred.cb, reference=scores_single$Type, mode = "everything")
```

##getting mean prediction values from the pooling prediction results of the testing data, which is generated by combining prediction results using model constructed by internal CV on each testing slice, each resampling will generate a new matrix. 
```{r getting the mean prediction values}
#initiating empty tibbles with rownames.
spring=tibble(row.names = rownames(performance.result.cv[3,1]$predict.values))
summer=tibble(row.names = rownames(performance.result.cv[3,1]$predict.values))
fall=tibble(row.names = rownames(performance.result.cv[3,1]$predict.values))
winter=tibble(row.names = rownames(performance.result.cv[3,1]$predict.values))
for (i in 1:length(performance.result.cv[3,])){
  temp.df = as.tibble(performance.result.cv[3,i]$predict.values)
  spring = cbind(spring, temp.df$spring)
  summer = cbind(summer, temp.df$summer)
  fall = cbind(fall, temp.df$fall)
  winter = cbind(winter, temp.df$winter)
}
#plot mean var relationship for probabilities
par(mfrow=c(2,2))
plot(apply(spring[,-1], 1, mean),  apply(spring[,-1], 1, var), xlab="mean", ylab="var", main="mean-variance relationship for spring predictions")
plot(apply(summer[,-1], 1, mean),  apply(summer[,-1], 1, var),xlab="mean", ylab="var",main="mean-variance relationship for summer predictions")
plot(apply(fall[,-1], 1, mean),  apply(fall[,-1], 1, var),xlab="mean", ylab="var",main="mean-variance relationship for fall predictions")
plot(apply(winter[,-1], 1, mean),  apply(winter[,-1], 1, var),xlab="mean", ylab="var",main="mean-variance relationship for winter predictions")
par(mfrow=c(2,2))
plot(apply(spring[,-1], 1, mean),  apply(spring[,-1], 1, median),xlab="mean", ylab="median",main="mean-median relationship for spring predictions")
plot(apply(summer[,-1], 1, mean),  apply(summer[,-1], 1, median), xlab="mean", ylab="median",main="mean-median relationship for summer predictions")
plot(apply(fall[,-1], 1, mean),  apply(fall[,-1], 1, median), xlab="mean", ylab="median",main="mean-median relationship for fall predictions")
plot(apply(winter[,-1], 1, mean),  apply(winter[,-1], 1, median), xlab="mean", ylab="median",main="mean-median relationship for winter predictions")

#taking mean across the number of resamplings for each sample
finalvalues = tibble(spring = apply(spring[,-1], 1, mean), summer = apply(summer[,-1], 1, mean), fall = apply(fall[,-1], 1, mean), winter = apply(winter[,-1], 1, mean))
#rename the row for the tibble
row.names(finalvalues) = spring$row.names

#generate truth vector according to the sample names
finaltruth = Mike_NAmerica$season[match(row.names(finalvalues), Mike_NAmerica$samplenames)]

#at this point, final values store a tibble for mean prediction scores for each sample across 4 seasons, finaltruth store their respective truth values
```


##formal fitting, this step prepares for further steps
```{r formal fitting}
#finding the best lambda

set.seed(1230123)
set.seed(seed = NULL)
fraction = table(Mike_NAmerica$season)/length(Mike_NAmerica$season)
weights = 1 - fraction[as.character(Mike_NAmerica$season)]

#results = replicate(3, cv.glmnet(x = t(Mike_NAmerica.master.asinh.cpm),
#               y = factor(Mike_NAmerica$season),
#               family = "multinomial", weights = weights))
set.seed(seed = NULL)
cvglmfit.final = cv.glmnet(x = t(Mike_NAmerica.master.asinh.cpm),
                           y = factor(Mike_NAmerica$season),
                     family = "multinomial", weights = c(weights))

#plot(cvglmfit)
s0 = cvglmfit.final$lambda.1se
print(s0) #good at 0.053
glmfit = glmnet(x = t(Mike_NAmerica.master.asinh.cpm),
                y = factor(Mike_NAmerica$season),
                family = "multinomial",
                intercept=FALSE)


#pred.internal.rawvalues = predict(glmfit, t(Mike_NAmerica.master.asinh.cpm),
#                        type = "response", s = s0) #provides original values of predictions for each category
pred.internal = predict(glmfit, t(Mike_NAmerica.master.asinh.cpm),
                        type = "class", s = s0)
pred.internal.values = predict(glmfit, t(Mike_NAmerica.master.asinh.cpm),
                        type = "response", s = s0)


pred.external = predict(glmfit, t(patient1.master.asinh.cpm),
                   type = "class", s = s0)
pred.external.values = predict(glmfit, t(patient1.master.asinh.cpm),
                   type = "response", s = s0)

pred.fourpeople = predict(glmfit, t(four.master.asinh.cpm), type="class", s=s0)


#internal fitting, sanity check
confusion_table.internal = table(predicted = pred.internal, truth = factor(Mike_NAmerica$season))
confusion_table.internal

confusionMatrix(data=factor(pred.internal, levels=c("spring", "summer", "fall","winter")), reference=factor(Mike_NAmerica$season), mode = "everything")

mROC.internal = multiclass.roc(as.numeric(factor(Mike_NAmerica$season)), as.numeric(factor(pred.internal, levels=c("spring", "summer", "fall", "winter"))))

mROC.internal$auc


#external validation
confusion_table.external = table(factor(pred.external, levels=c("spring", "summer", "fall","winter")), truth = factor(Patient1$season))
confusion_table.external
##this fucntion also provides a bunch of performance statistics
reference.result = confusionMatrix(data=factor(pred.external, levels=c("spring", "summer", "fall","winter")), reference=factor(Patient1$season), mode = "everything")
reference.result
#uses the performance function defined in last section, self-written function
performance.result = check.model.performance(factor(pred.external, levels=c("spring", "summer", "fall","winter")),factor(Patient1$season))
apply(performance.result[,-1], 2, function(x) {mean(as.numeric(x))})

mROC = multiclass.roc(as.numeric(factor(Patient1$season)), as.numeric(factor(pred.external[,1])))
mROC$auc

#four people data
confusion_table.fourpeople = table(predicted = pred.fourpeople, truth = factor(fourpeople$season))
confusion_table.fourpeople
confusionMatrix(data=factor(pred.fourpeople, levels=c( "fall","winter")), reference=factor(fourpeople$season))
mROC.fourpeople = multiclass.roc(as.numeric(factor(fourpeople$season)), as.numeric(factor(pred.fourpeople[,1])))
mROC.fourpeople

#RF for fun
model = randomForest(x=t(Mike_NAmerica.master.asinh.cpm),y=factor(Mike_NAmerica$season))
pred.rf = predict(model,newdata=t(Mike_NAmerica.master.asinh.cpm),type='class')#[,'TRUE',drop=TRUE]
confusion_table.rf = table(predicted = pred.rf, truth = factor(Mike_NAmerica$season))
confusion_table.rf
pred.rf.external = predict(model,newdata=t(patient1.master.asinh.cpm),type='class')
confusion_table.rf.external = table(predicted = pred.rf.external, truth = factor(Patient1$season))
confusion_table.rf.external

####


##this is to fine tune the model on external dataset, it seems s0=0.055 is great for BOTH internal and EXTERNAL datasets. This practice is over-optimistic
if(FALSE){
for (i in seq(0.01, 0.06, 0.003)){
  print(i)
  pred.external = predict(glmfit, t(patient1.master.asinh.cpm),
                   type = "class", s = i)
  confusion_table.external = table(predicted = pred.external, truth = factor(Patient1$season))

  mROC = multiclass.roc(as.numeric(factor(Patient1$season)), as.numeric(factor(pred.external[,1])))
  print(mROC$auc)
}
}


plot(glmfit, xvar = "norm", col = RColorBrewer::brewer.pal(12, "Set3"), lwd = sqrt(3))

# standardize coefficients, is this necessary?? 
#if X is the input matrix of the glmnet function,
# and cv.result is your glmnet object:
sds <- apply(t(Mike_NAmerica.master.asinh.cpm), 2, sd)
coefs <- as.matrix(coef(glmfit, s = s0)[["spring"]])
std_coefs <- coefs[-1, 1] / sds

```


##multiclass auc
```{r multiclass auc}
#use bind_cols and bind_rows for tibbles
cal_auc = function(finalvalues, finaltruth, zero_label, one_label){
  #zero_label="spring"
  #one_label="summer"
 colnames(finalvalues) = c("spring", "summer", "fall", "winter")
 final.df = bind_cols(truth=finaltruth, finalvalues)
 points = tibble(label=character(), prob=double())
 for (i in 1:dim(final.df)[1]){
   if (final.df$truth[i] == zero_label | final.df$truth[i] == one_label){
     #print(as.character(final.df$truth[i]))#
     #print(as.double(final.df[i, zero_label]))
     points = bind_rows(points, tibble(label = as.character(final.df$truth[i]), prob = as.numeric(final.df[i, zero_label])))
   }
 }
   #print(dim(points))
   colnames(points) = c("label", "zero_label_prob")
   #sorted by prob values, then we get ranks
   points = points[order(points$zero_label_prob),]
   #print(points)
   #class(points)
   n0 = 0
   n1 = 0
   sum_ranks = 0
   #i is rank here
   for (i in 1:dim(points)[1]){
     if (points$label[i] == zero_label) {
       n0 = n0 + 1
       sum_ranks = sum_ranks + i
     }
     if (points$label[i] == one_label){
       n1 = n1 + 1
     }
   }
    #print(n0)
    #print(n1)
    #print(sum_ranks)
   
    return((sum_ranks - (n0*(n0+1)/2.0)) / (n0 * n1))
}

cal_auc(finalvalues, finaltruth, "spring", "summer") 

#function generating all possible combinations from a vector, store in data.frame
combinations = function(vector){
  combos = tibble(a=character(), b=character())
  for (i in 1:(length(vector)-1)){
    for (j in (i+1):length(vector)){
      #slice = c(as.character(vector[i]), as.character(vector[j]))
      #names(slice) = c("a", "b")
      #print(vector[j])
      combos = bind_rows(combos, c(a = as.character(vector[i]), b= as.character(vector[j])))
    }
  }
  #print(combos)
  return(combos)
}

combos = combinations(seasons)

#calculating mAUC values
#combos is the combinations between two labels
cal_mAUC = function(finalvalues, finaltruth, combos){
  colnames(finalvalues) = c("spring", "summer", "fall", "winter")
  sums = 0
  mAUC = 0
  for (i in 1:dim(combos)[1]){
  sums = sums + (cal_auc(finalvalues, finaltruth, combos$a[i], combos$b[i]) + cal_auc(finalvalues, finaltruth, combos$b[i], combos$a[i]))/2
  }
  mAUC = sums * 2 / (length(seasons) * (length(seasons)-1))
  return(mAUC)
}
#this mAUC is calcualted from the pooling result of internal cv model on testing data, no external data is used
#this is a relatively conservative measurement of the model performance on the training dataset
cal_mAUC(finalvalues, finaltruth, combos)


#for external validation dataset
cal_mAUC(data.frame(pred.external.values), factor(Patient1$season), combos)

#internal test
cal_mAUC(data.frame(pred.internal.values), factor(Mike_NAmerica$season), combos)
```

##multiclass roc plotting one vs all
```{r multiclass roc plotting one vs all}
#one vs all method
cal_roc_ova = function(finalvalues, finaltruth, zero_label){
  #zero_label="spring"
  #one_label="summer"
 colnames(finalvalues)=c("spring","summer","fall","winter")
 final.df = bind_cols(truth=finaltruth, finalvalues)
 points = data.frame(label=character(), prob=double())
 for (i in 1:dim(final.df)[1]){
   if (final.df$truth[i] == zero_label){
     #print(as.character(final.df$truth[i]))#
     #print(as.numeric(final.df[i, zero_label]))
     points = rbind(points, c(label = as.character(final.df$truth[i]), prob = as.numeric(final.df[i, zero_label])))
   } else {
     points = rbind(points, c(0, prob = as.numeric(final.df[i, zero_label])))
 }}
   #print(dim(points))
   colnames(points) = c("label", "zero_label_prob")
   #sorted by prob values, then we get ranks
   points = points[order(points$zero_label_prob),]
   points$label=factor(points$label, levels =c(zero_label, 0))
   points$zero_label_prob = as.numeric(points$zero_label_prob)
   return(points)
}

#points.ova = cal_roc_ova(finalvalues, finaltruth, "spring")
#ggroc(roc(points.ova$label, points.ova$zero_label_prob, auc=TRUE, ci=TRUE, levels=c(0, "spring")))
#ggroc(roc(points.ova$label, points.ova$zero_label_prob, auc=TRUE, ci=TRUE, levels=c("spring", 0)))

plots.ova=list()
seasons = factor(seasons, levels=c("spring", "summer", "fall", "winter"))
for (i in seasons){
  points.ova = cal_roc_ova(finalvalues, finaltruth, i)
  #print(ggroc(roc(points.ova$label, points.ova$zero_label_prob, auc=TRUE, ci=TRUE, levels=c(0, i))))
  print(roc(points.ova$label, points.ova$zero_label_prob, auc=TRUE, ci=TRUE, levels=c(0, i)))
  plots.ova[[i]] = roc(points.ova$label, points.ova$zero_label_prob, auc=TRUE, ci=TRUE, ci.type="shape", levels=c(0, i))
}

ggroc(plots.ova, linetype=7, size=1.2, alpha=0.8) + geom_abline(slope=1, intercept=1,color="light grey") + theme_bw()  +
  scale_colour_manual(values = c('spring' = "#00A087FF", 'summer' = "#4DBBD5FF", 'fall' = "#E64B35FF", 'winter' = "#3C5488FF" ), name = "Seasons") + 
  annotate("text", label=paste0("AUC=",signif(plots.ova$fall$auc,3), " 95%CI: 0.487-0.661"), color="#E64B35FF", x= 0.22, y=0.15, size=4) + 
  annotate("text", label=paste0("AUC=",signif(plots.ova$spring$auc,3), " 95%CI: 0.747-0.889"), color="#00A087FF", x= 0.22, y=0.10, size =4) +
annotate("text", label=paste0("AUC=",signif(plots.ova$summer$auc,3), " 95%CI: 0.712-0.860"), color="#4DBBD5FF", x= 0.22, y=0.05, size =4)+
annotate("text", label=paste0("AUC=",signif(plots.ova$winter$auc,3), " 95%CI: 0.810-0.930"), color="#3C5488FF", x= 0.22, y=0.00, size =4)+
  labs( x = "Specificity (1-FPR)", y = "Sensitivity (TPR)", size =5) + theme(axis.text=element_text(size=12),
        axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14))

#ROC plots for validation dataset
plots.ova.external=list()
for (i in seasons){
  points.ova.external = cal_roc_ova(data.frame(pred.external.values), factor(Patient1$season), i)
  #print(ggroc(roc(points.ova$label, points.ova$zero_label_prob, auc=TRUE, ci=TRUE, levels=c(0, i))))
  print(roc(points.ova.external$label, points.ova.external$zero_label_prob, auc=TRUE, ci=TRUE, levels=c(0, i)))
  plots.ova.external[[i]] = roc(points.ova.external$label, points.ova.external$zero_label_prob, auc=TRUE, ci=TRUE, ci.type="shape", levels=c(0, i))
}

ggroc(plots.ova.external, linetype=7, size=1.2, alpha=0.8) + geom_abline(slope=1, intercept=1,color="light grey") + theme_bw()  +
  scale_colour_manual(values = c('spring' = "#00A087FF", 'summer' = "#4DBBD5FF", 'fall' = "#E64B35FF", 'winter' = "#3C5488FF" ), name = "Seasons") + 
  annotate("text", label=paste0("AUC=",signif(plots.ova$fall$auc,3), " 95%CI: 0.5681-0.930"), color="#E64B35FF", x= 0.22, y=0.15, size=4) + 
  annotate("text", label=paste0("AUC=",signif(plots.ova$spring$auc,3), " 95%CI: NaN-NaN"), color="#00A087FF", x= 0.22, y=0.10, size =4) +
annotate("text", label=paste0("AUC=",signif(plots.ova$summer$auc,3), " 95%CI: 0.712-0.860"), color="#4DBBD5FF", x= 0.22, y=0.05, size =4)+
annotate("text", label=paste0("AUC=",signif(plots.ova$winter$auc,3), " 95%CI: 0.676-0.975"), color="#3C5488FF", x= 0.22, y=0.00, size =4)+
  labs( x = "Specificity (1-FPR)", y = "Sensitivity (TPR)", size =5) + theme(axis.text=element_text(size=12),
        axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14))


```

##multiclass roc plotting pairs
```{r multiclass roc plotting pairs}

#every pair method
cal_roc = function(finalvalues, finaltruth, zero_label, one_label){
  #zero_label="spring"
  #one_label="summer"
 final.df = bind_cols(truth=finaltruth, finalvalues)
 points = data.frame(label=character(), prob=double())
 for (i in 1:dim(final.df)[1]){
   if (final.df$truth[i] == zero_label | final.df$truth[i] == one_label){
     #print(as.character(final.df$truth[i]))#
     #print(as.numeric(final.df[i, zero_label]))
     points = rbind(points, c(label = as.character(final.df$truth[i]), prob = as.numeric(final.df[i, zero_label])))
   }
 }
   #print(dim(points))
   colnames(points) = c("label", "zero_label_prob")
   #sorted by prob values, then we get ranks
   points = points[order(points$zero_label_prob),]
   points$label=factor(points$label, levels =c(zero_label, one_label))
   points$zero_label_prob = as.numeric(points$zero_label_prob)
   return(points)
}

plots=list()
for (i in 1:dim(combos)[1]){ #combos is defined in last section
  i=2
  points.a = cal_roc(finalvalues, finaltruth, combos$a[i], combos$b[i]) 
  points.b = cal_roc(finalvalues, finaltruth, combos$b[i], combos$a[i])
  points.roc.a = roc(points.a$label, points.a$zero_label_prob, auc=TRUE, ci=TRUE)
  points.roc.b = roc(points.b$label, points.b$zero_label_prob, auc=TRUE, ci=TRUE)
  print(ggroc(list(plota = points.roc.a, plotb=points.roc.b))+ggtitle(paste0(combos$a[i],"-", combos$b[i])))
}
```


##feature extraction================
```{r glmnet final fitting and feature extraction}

#for further plotting of features
#taxons with a arbitrary absolute value threshold
#use names(selecttaxons$season) to gain access to the names of taxons for respective seasons
selecttaxons = list(spring =c(), summer=c(), fall=c(), winter=c())

#weights calcualted by percentage of sum of absolute coefficients(weight)
weights = list(spring =c(), summer=c(), fall=c(), winter=c())

for (se in c("spring", "summer", "fall", "winter")){
se = "summer"
coefs = coef(glmfit)[[se]][, which.min(abs(glmfit$lambda - s0))]
coefs = sort(coefs, decreasing= TRUE)
print(se)
print(head(coefs,4))
print(tail(coefs,4))
weights[[se]] = c(abs(coefs[coefs != 0])/sum(abs(coefs[coefs != 0]))*100)
selecttaxons[[se]] = c(coefs[abs(coefs) >0.08])


if(TRUE){
  for (i in 1:4){
    if (names(coefs)[i]!=""){
    m = ggplot(as.data.frame(t(Mike_NAmerica.master.asinh.cpm)), aes_string(x=Mike_NAmerica$date.mid, y=Mike_NAmerica.master.asinh.cpm[names(coefs)[i],]))

    m1 = m + geom_point(aes(color=Mike_NAmerica$season)) + 
      ylab(paste0(names(coefs)[i], " (normalized abundance)")) +
      theme_bw() + theme(legend.position="bottom", legend.title = element_blank()) +
      xlab("Date") + 
      ggtitle(paste0(se, " positively contributing taxon, ", "coefficient = ", signif(coefs[i],3))) + scale_color_npg()
    print(m1) 
    }
  }

  for (i in 1:4){  
    if (names(coefs)[length(coefs)+1-i]!=""){
    n = ggplot(as.data.frame(t(Mike_NAmerica.master.asinh.cpm)), aes_string(x=Mike_NAmerica$date.mid, y=Mike_NAmerica.master.asinh.cpm[names(coefs)[length(coefs)+1-i],]))
    n1 = n + geom_point(aes(color=Mike_NAmerica$season)) + 
      ylab(paste0(names(coefs)[length(coefs)+1-i], " (normalized abundance)")) +  theme_bw() + 
      theme(legend.position="bottom", legend.title = element_blank()) + xlab("Date") + 
      ggtitle(paste0(se, " negatively contributing taxon, ", "coefficient = ", signif(coefs[length(coefs)+1-i],3)))+ scale_color_npg()
    print(n1)
   
    }
  }
  }
  }


```
##heatmap for features====================
```{r feature-heatmap}
setwd("~/Documents/Bioinfo/DNAformal/heatmap")


nameslist = c()
for (i in 1:4){
  nameslist = c(nameslist, selecttaxons[[i]])
}


Mike_NA.heatmap.asinh.cpm = Mike_NAmerica.master.asinh.cpm[names(nameslist),]
Mike_NA.heatmap.asinh.cpm[Mike_NA.heatmap.asinh.cpm==0] =NA
Mike_NAmerica.season = Mike_NAmerica[order(Mike_NAmerica$season),] 
Mike_NA.heatmap.asinh.cpm.season = Mike_NA.heatmap.asinh.cpm[, Mike_NAmerica.season$samplenames]
#df= data.frame(t(Mike_NA.heatmap.asinh.cpm), season=Mike_NAmerica$season)
#df.melt = melt(df, id.vars = "season")
#df.melt.scaled <- ddply(df.melt, .(variable), transform, rescale = rescale(value))
#tile = ggplot(df.melt.scaled, aes(x=season, y=variable, fill=value)) + geom_tile()
#tile

plot_aheatmap_genus = function(x, annotate.df, prefix="",main=NULL){
  
  #--------------
  # goi.obj: ExpressionSet object containg expression data(log cpm) of gene of interests and phenoData
  # main: title of plot
  #---------------	
  if (is.null(main)) main = ""
  
  wval = 12
  hval = 15
  # add color to col and row
  # CellType = rownames(x)
 # ann.col = data.frame("Humid" = annotate.df$X_Mean_Humidity, "Temperature" = annotate.df$Min_TemperatureC, "Particle" = annotate.df$total.particle, "Weekend" = annotate.df$weekend, "Season" = annotate.df$season, "Batch" = annotate.df$batch, rain = as.factor(annotate.df$is_there_rain), Precipitation = annotate.df$Precipitationmm)
   ann.col = data.frame(Season = annotate.df$season, "Humid" = annotate.df$median.humid, "Temperature" = annotate.df$median.temperature, "Particle" = annotate.df$total.particle)
   ann.row = data.frame(Season = c(rep("spring", length(selecttaxons[["spring"]])), rep("summer", length(selecttaxons[["summer"]])),rep("fall", length(selecttaxons[["fall"]])),rep("winter", length(selecttaxons[["winter"]]))))
  #color=colorRampPalette(c("#ffeda0", "#feb24c", "#f03b20"))(3)
  color = "-RdBu:50"
  #color = "-RdYlBu2:100"
  
  
  #annotation color, need to match level names and color value!!!
  seasoncolor = c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")
  names(seasoncolor) = c("spring", "summer", "fall", "winter")
  #Var2 = c("lightgreen", "navy")
  ann_colors = list(Season = seasoncolor)
  # choose dist and hc methods
  distfunvals = c("euclidean", "pearson", "spearman")
  hclustfunvals = c("centroid", "average", "complete")
  distfunval = distfunvals[1]
  hclustfunval = hclustfunvals[2]
  
  #scale="row" # scale by row
  # NA is different from FALSE for Colv and Rowv, FALSE means clustering is still calcuated just not shown. NA means not doing anything
  
  # change Colv = NA for timelapse
  Colv = NA # add dendrogram to col
  Rowv = NA # add dendrogram to row
  filePrefix = paste0(prefix,"_")
  fileName = paste(filePrefix, distfunval, "_", hclustfunval, ".pdf", sep="")
  fileName = paste(format(Sys.time(), "%d_%b_%Y"),fileName,sep="_")
  #pdf(file.path(getwd(), fileName), width=wval, height=hval)
  #labcol = as.character(annotate.df$date.mid)
  #aheatmap(x, color = color,distfun = distfunval, hclustfun = hclustfunval,fontsize = 8,main=fileName, filename=fileName)
  
  #set cellwidth = 3 for large number of samples
  aheatmap(x,  cellwidth=3, cellheight = 6, distfun = distfunval, hclustfun = hclustfunval, labCol=NA, scale= "row",
           annCol = ann.col, annRow= ann.row, Colv=Colv,Rowv=Rowv, annColors = ann_colors, color = color, fontsize = 8, filename=fileName, layout = "|.L*")
  # main = main)
  
}

plot_aheatmap_genus(x=Mike_NA.heatmap.asinh.cpm.season,annotate.df = Mike_NAmerica.season, prefix=paste0(mole.type,"-", taxonlevel), main=NULL)

#====barplot for features
ann.row = data.frame(season = c(rep("spring", length(selecttaxons[["spring"]])), rep("summer", length(selecttaxons[["summer"]])),rep("fall", length(selecttaxons[["fall"]])),rep("winter", length(selecttaxons[["winter"]]))))
ann.row$season = factor(ann.row$season, levels=rev(c("spring", "summer", "fall", "winter")))
ann.row$names = names(nameslist)
ann.row$coe = nameslist

dotchart = ggdotchart(ann.row, x = rev("names"), y = "coe",
           color = "season",                                # Color by groups
           palette = rev(c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")), # Custom color palette
           sorting = "descending",                       # Sort value in descending order
             add = "segments",    # Add segments from y = 0 to dots
           add.params = list(size = 3),
           rotate = TRUE,                                # Rotate vertically
           group = "season",                                # Order by groups
           dot.size = 6, # Large dot size
           xlab=FALSE,
           ylab="Coefficients",
           ggtheme = theme_pubr()                        # ggplot2 theme
           ) + geom_hline(yintercept = 0, linetype = 2, color = "lightgray")

#parsing weights information
selectweights = c()
for (i in c("spring", "summer", "fall", "winter")){
  selectweights = c(selectweights, weights[[i]][names(selecttaxons[[i]])])
}
weights.df = data.frame(season = ann.row$season, names = names(selectweights), weights=as.numeric(selectweights), value=as.numeric(c(rep(1, length(selectweights)))))


#for some weird reasons ggplot will plot text x-axis in a sorting order even when not asked
weights.df$names = factor(weights.df$names, levels = rev(weights.df$names))
weights.plot= ggplot(weights.df, aes(x=names, y=value, label= paste0(round(weights.df$weights, 0),"%"))) + geom_text(aes(color=season)) +
ylab("Percentage of total absolute weight") +
theme_classic2() + scale_color_manual(values=rev(c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))) +
theme(axis.ticks=element_blank(), axis.line =element_blank(), axis.text.x = element_blank()) +
coord_flip()  
ggplot_build(weights.plot)$data


#plots for selected taxa within the model to justfiy feature selection
#reset to 0 so violin plots would be useful
Mike_NA.heatmap.asinh.cpm.season[is.na(Mike_NA.heatmap.asinh.cpm.season)] = 0
Mike.heatmap.df = cbind(t(Mike_NA.heatmap.asinh.cpm.season), Mike_NAmerica.season)
ggplot(Mike.heatmap.df, aes(x=season, y=Azadirachta, color=season)) + geom_violin(draw_quantiles = c(0.25,0.5,0.75)) + geom_jitter(width=0.1) + theme_classic2() +  scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))


#ggboxplot(Mike.heatmap.df, x = "season", y = "Azadirachta",
#    color = "season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"),
#    add = "jitter", legend="bottom")

#saving all plots to list
violin.plots = list()
for (name in names(nameslist[-6])){
  name
  violin = ggviolin(Mike.heatmap.df, x = "season", y = as.character(name),
    fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"),
    add = "boxplot", add.params = list(width=0.1, fill = "white"), quantile(c(0.25, 0.5, 0.75)), legend="bottom", trim=TRUE, xlab=FALSE, tickslab=FALSE) + geom_point(size=0.5) +
    theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif", hide.ns=TRUE)
print(violin)
violin.plots[[name]] = ggpar(violin, tickslab =FALSE)
}
ggarrange(plotlist = violin.plots, ncol=3, nrow=10)


```

##picking seasonal plots
```{r pickseasonal plots}


my_comparisons <- list( c("spring", "summer"), c("spring", "fall"), c("spring", "winter"), c("summer", "fall" ), c("summer", "winter"), c("fall", "winter"))

my_comparisons <- list( c("spring", "summer"), c("spring", "fall"), c("spring", "winter"))
violin1 = ggviolin(Mike.heatmap.df, x = "season", y = "Azadirachta",
    fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"),
    add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "horizontal") + geom_point(size=0.5) + theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif") +theme(axis.text=element_text(size=12),
        axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14))

                     # Add global the p-value 
violin1

violin2 = ggviolin(Mike.heatmap.df, x = "season", y = "Fomitiporia",
    fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"),
    add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "horizontal") + geom_point(size=0.5) + theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif")   +theme(axis.text=element_text(size=12),
        axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14))  
violin2


my_comparisons <- list( c("spring", "summer"), c("summer", "fall"), c("summer", "winter"))
violin3 = ggviolin(Mike.heatmap.df, x = "season", y = "Sclerotinia",
    fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"),
    add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "horizontal") + geom_point(size=0.5) + theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif")    +theme(axis.text=element_text(size=12),
        axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14))      # Add global the p-value 
violin3

my_comparisons <- list( c("spring", "fall"), c("summer", "fall" ), c("fall", "winter"))

violin4 = ggviolin(Mike.heatmap.df, x = "season", y = "Tuber",
    fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"),
    add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "horizontal") + geom_point(size=0.5) + theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif")   +theme(axis.text=element_text(size=12),
        axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14))   # Add global the p-value 
violin4


my_comparisons <- list(  c("spring", "winter"), c("summer", "winter"), c("fall", "winter"))
violin5 = ggviolin(Mike.heatmap.df, x = "season", y = "Tricholoma",
    fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"),
    add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "horizontal") + geom_point(size=0.5) + theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif") +theme(axis.text=element_text(size=12),
        axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14))       # Add global the p-value 
violin5

ggarrange(violin1 + rremove("y.text"), violin3 + rremove("y.text"), violin4 + rremove("y.text"), violin5 + rremove("y.text"),  ncol=2, nrow=2) 
```


```{r glmnet}
glmfit = glmnet(x = t(Mike.master.asinh.cpm),
                y = factor(Mike_single$season),
                family = "multinomial")
pred = predict(glmfit, t(Mike.master.asinh.cpm),
                   type = "class", s = 0.107)
confusion_table = table(predicted = pred, truth = factor(Mike_single$season))
confusion_table

```


#Specific taxons plot
```{r specific taxon plot}

#taxons list is for taxons of special interests. There are no real needs to use it
taxonplot = function(profile, taxons=c(), top = 10, text.size=4, base.size=14){
  #ok = scores.snap
  #text.size = 2
  #ok = scores_single_complete
  
  #profile = scores_single_complete
  profile = data.frame(profile)
  master.cpm.snap = log(master.cpm.single[, profile$samplenames]+1)
  master.cpm.snap[master.cpm.snap == 0] = NA
  ori.names = rownames(master.cpm.snap)
  new.names = make.names(ori.names)
  rownames(master.cpm.snap) = new.names
  master.combined = cbind.data.frame(t(master.cpm.snap), profile)
  master.combined$date.month = factor(master.combined$date.month, levels = c(3:12, 1, 2))
  #my_comparisons <- list( c("spring", "summer"), c("spring", "fall"), c("spring", "winter"), c("summer", "fall" ), c("summer", "winter"), c("fall", "winter"))
  my_comparisons <- list( c("spring", "summer"), c("summer", "fall"), c("fall", "winter"))
  orinames = unique(c(taxons, head(ori.names, top))) #used for annotation
  taxonlist = unique(c(make.names(taxons), head(new.names,top))) #used for dataframe
   .e <- environment()
   plot.list = list()
for (i in 1:length(taxonlist)) 
  local({   
  plot1 = ggviolin(master.combined, x = "season", y = taxonlist[i],
    fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"),
    add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "vertical") + geom_point(size=0.5) + theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif") + stat_compare_means(label.x.npc = "left", label.y = 17) + #, method = "t.test")  
  stat_smooth(aes(group=1)) +theme_pubr(base_size = 14)  + rremove("x.text") + rremove("xlab")  + ylab(orinames[i])
  plot1$plot_env = .e
  print(plot1)
  
  plot2 = ggviolin(master.combined, x = "date.month", y = taxonlist[i],
    fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"),
    add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "vertical") + geom_point(size=0.5) + theme(legend.position="none") + stat_smooth(aes(group=1))+  stat_compare_means(label.x.npc = "center", label.y.npc = "top") + 
    theme_pubr(base_size = 14)  + rremove("legend") + ylab("") + xlab("")
  plot2$plot_env = .e
  print(plot2)
  
  plot3 = ggarrange(plot1, plot2, ncol = 2, common.legend = TRUE)
  #print(plot3)
  
  plot.list[[taxonlist[i]]] <<- plot1
  plot.list[[paste0(taxonlist[i],".month")]] <<- plot2
  plot.list[[paste0(taxonlist[i],".both")]] <<- plot3
})
 
return(plot.list)

}
scores.snap = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2016-08-30", "%Y-%m-%d" ))

taxon.plot = taxonplot(scores.snap, taxons = c("Stereum", "Pinus", "Propionibacterium", "Aspergillus", "Penicillium", "Bacillus", "Staphylococcus" ))

taxon.plot = taxonplot(scores.snap, taxons = c("Stereum hirsutum", "Pinus taeda", "Propionibacterium acnes", "Aspergillus fumigatus", "Trametes versicolor", "Botrytis cinerea", "Melampsora pinitorqua", "Acinetobacter baumannii", "Penicillium citrinum", "Malassezia restricta"), base.size = 18, top=20)

taxon.plot = taxonplot(scores.snap, taxons = c("Flavobacteriaceae bacterium 3519-10", "Flavobacterium psychrophilum", "Flavobacterium subsaxonicum"), base.size = 18, top=5)


```


#Taxon seasonal trends plots======================
```{r misc plots}

#loop through top 50 taxonlevels create individual plots
if(FALSE){
for (i in 20:70){
#i=20
temp.df = subset(master_combined_complete, master_combined_complete[,i] >0)
p = ggplot(temp.df, aes_string(x="season", y=log(temp.df[,i]), group="season", color="season"))
p + geom_boxplot(alpha=0.7, outlier.alpha=0) + geom_jitter(size=1, width=0.2) + geom_smooth(aes(group=1),se=FALSE) + ggtitle(colnames(master_combined_complete)[i]) + ylab("log(CPM)") + xlab("Season")
#ggsave(paste0("./seasonal/DNA-",taxonlevel,"-",colnames(master_combined_complete)[i],"-season.pdf"))
p = ggplot(temp.df, aes(x=date.month, y=log(temp.df[,i]), group=date.month, color=season))
p + geom_boxplot(alpha=0.7, outlier.alpha=0) + geom_jitter(size=1, width=0.2) + geom_smooth(aes(group=1),se=FALSE) + scale_x_continuous(breaks=c(1:12)) + xlab("Month") + ylab("log(CPM)") + ggtitle(colnames(master_combined_complete)[i])
#ggsave(paste0("./seasonal/DNA-",taxonlevel,"-",colnames(master_combined_complete)[i],"-month.pdf"))
}
}

#USE THIS!!!!!!!!!!!
#facet_wrap plots, this is the better way to represent them in groups
select = c(136:155)#top 20
temp.df = melt(master_combined_complete,id.vars=colnames(master_combined_complete)[1:19],measure.vars=colnames(master_combined_complete)[select])
temp.df = subset(temp.df,value >0)
#every 20
p = ggplot(temp.df, aes(x=season, y=log(value),  group=season, color=season))
p + geom_boxplot(alpha=0.7, outlier.alpha=0)  + geom_beeswarm(size=0.1) + geom_smooth(aes(group=1),se=FALSE) + stat_compare_means(hide.ns = TRUE, label = "p.format") +
    ggtitle("") + ylab("log(CPM)") + xlab("Season") + 
    facet_wrap( ~ variable,scales="free") + theme_pubr(x.text.angle = 45) + rremove("x.text")+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))

select = c(108:116) #these are the column numbers of the first 9 phyla
special.select = c("Basidiomycota", "Ascomycota", "Streptophyta","Firmicutes","Proteobacteria","Actinobacteria", "Bacteroidetes",  "Chordata", "Arthropoda")
temp.df = melt(master_combined_complete,id.vars=colnames(master_combined_complete)[1:19],measure.vars=special.select)
#for P1
#temp.df = melt(P1_combined_complete,id.vars=colnames(P1_combined_complete)[1:19],measure.vars=special.select)
#temp.df = subset(temp.df,value >0)
#top 9
p = ggplot(temp.df, aes(x=season, y=log(value),  group=season, fill=season))
p + geom_boxplot(alpha=0.7, outlier.alpha=0.2)  + geom_beeswarm(size=0.5, alpha=0.5, show.legend= FALSE, color="black") + geom_smooth(aes(group=1),se=FALSE,show.legend = FALSE) + stat_compare_means(hide.ns = TRUE, label = "p.format", show.legend = FALSE) +
    ggtitle("") + ylab("log(CPM)") + xlab("") + 
    facet_wrap( ~ variable,scales="free", ncol = 3) + theme_pubr(base_size = 14, x.text.angle = 45) + rremove("x.text")+ 
   scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))

#formal test
p.result = compare_means( formula = value ~ season, data = temp.df,  group.by = "variable", method = "kruskal.test")

#first separated by seasons, then separate by weekday/weekend
p = ggplot(temp.df, aes(x=season, y=log(value),  fill=weekend))
#p = ggplot(temp.df, aes(x=season, y=log(value),  group=season, color=season))
p + geom_boxplot(alpha=0.7, outlier.alpha=0)  + geom_beeswarm(size=0.1) + geom_smooth(aes(group=1),se=FALSE) + 
    ggtitle("") + ylab("log(CPM)") + xlab("Season") + 
    facet_wrap( ~ variable,scales="free") + theme_pubr(x.text.angle = 45) + rremove("x.text")+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))


#proving seasons are different
fit = lm(log(Streptophyta) ~ season, subset(master_combined_complete, Streptophyta >0))
summary(fit)
anova(fit)
#p = ggplot(subset(master_combined_complete, Corynebacterium >0), aes(x=drywet, y=log(Corynebacterium), group=drywet, color=season))
#p + geom_boxplot() + geom_jitter(size=1, width=0.2) + geom_smooth(aes(group=1),se=FALSE)


```

#regression model fitting for individual taxons taxons=========================linear regression model fitting
```{r linear regression model}
mikespecial = data.frame(t(Mike.master.cpm), Mike)
mikespecial = mikespecial[complete.cases(mikespecial),]

lm.df = master_combined_complete
lm.df = data.frame(t(patient1.master.cpm), Patient1) #for only patient
lm.df = data.frame(t(campushome.master.cpm), campushome) #for campushome data
lm.df = data.frame(t(Mike.master.cpm), Mike) # for all mike samples
lm.df = lm.df[complete.cases(lm.df),]

# 0s are not considered
#It appears that for different species, the variables that explain the variances the most change as well.
#current metadata, temperature, humidity, particle, season, weekend
#it seems best model is different for different genus
#full model
m.viri = lm(log(Viridiplantae) ~ season + particle, data = subset(mikespecial, Viridiplantae>0))
m.Fungi = lm(log(Fungi) ~ season + particle + humid +temperature, data = subset(mikespecial, Fungi>0))
m.Bacteria = lm(log(Bacteria) ~ season +  humid + temperature, data = subset(mikespecial, Bacteria>0))
m.Metazoa = lm(log(Metazoa) ~  particle + humid, data = subset(mikespecial, Metazoa>0))
m.Viruses = lm(log(Viruses) ~ season + particle + humid + temperature, data = subset(mikespecial, Viruses>0))

#simplified model
m = lm(log(Pinus) ~ humid + particle + season, data = subset(mikespecial, Pinus > 0))
m = lm(log(Pinus.taeda) ~ particle + season, data = subset(mikespecial, Pinus.taeda > 0))

m = lm(log(Propionibacterium) ~ season + particle + humid + temperature + weekend, data = subset(mikespecial, Propionibacterium>0))

# 0s are not considered
#It appears that for different species, the variables that explain the variances the most change as well.
#current metadata, temperature, humidity, particle, season, weekend
#it seems best model is different for different genus
#full model
totalspecial = data.frame(t(master.cpm.single), scores_single_complete)
totalspecial = totalspecial[complete.cases(totalspecial),]

#for all samples, everyone pooled
m.viri = lm(log(Viridiplantae) ~ season + particle + humid + temperature, data = subset(totalspecial, Viridiplantae>0))
m.Fungi = lm(log(Fungi) ~ season + particle + humid +temperature, data = subset(totalspecial, Fungi>0))
m.Bacteria = lm(log(Bacteria) ~ season + particle + humid + temperature, data = subset(totalspecial, Bacteria>0))
m.Metazoa = lm(log(Metazoa) ~ season + particle + humid + temperature, data = subset(totalspecial, Metazoa>0))
m.Viruses = lm(log(Viruses) ~ season + particle + humid + temperature, data = subset(totalspecial, Viruses>0))
m = lm(log(Bacteria) ~ season + particle + humid + temperature + weekend, data = subset(lm.df, Bacteria>0))
m = lm(log(Fungi) ~ season + particle + humid + temperature, data = subset(lm.df, Fungi>0))
m2 = lm(log(Fungi) ~ humid + season, data = subset(lm.df, Fungi>0))
m = lm(log(Streptophyta) ~ humid + temperature + season + particle + weekend, data = subset(lm.df, Streptophyta>0))
m2 = lm(log(Streptophyta) ~ humid + season + temperature, data = subset(lm.df, Streptophyta>0))
m = lm(log(Firmicutes) ~ season + particle + temperature + humid + weekend, data = subset(lm.df, Firmicutes>0))
m = lm(log(Basidiomycota) ~ season + particle + humid + temperature + weekend, data = subset(lm.df, Basidiomycota>0))
m = lm(log(Ascomycota) ~ season + particle + humid + temperature + weekend, data = subset(lm.df, Ascomycota>0))
m = lm(log(Actinobacteria) ~ season + particle + temperature + humid + weekend, data = subset(lm.df, Actinobacteria>0))
m = lm(log(Proteobacteria) ~ season + particle + temperature + humid + weekend, data = subset(lm.df, Proteobacteria>0))
m = lm(log(Arthropoda) ~ season + particle + temperature + humid + weekend, data = subset(lm.df, Arthropoda>0))
m = lm(log(Bacteroidetes) ~ season + particle + temperature + humid + weekend, data = subset(lm.df, Bacteroidetes>0))
m = lm(log(Chlorophyta) ~ season + particle + temperature + humid + weekend, data = subset(lm.df, Chlorophyta>0))


m = lm(log(Viridiplantae) ~ season + particle + humid + temperature, data = subset(mikespecial, Viridiplantae>0))
m = lm(log(Fungi) ~ season + particle + humid +temperature, data = subset(mikespecial, Fungi>0))
#one can compare the models directly using anova functions, such like anova(m1,m2) to see if they are siginificantly different.
```

#PCA plots with features======================PCA all plots
```{r PCA plots all}

pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=ownership))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point(size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) +scale_colour_brewer(palette = "Paired") + theme_bw()
ggsave(paste0("./pdfs/", taxonlevel,"_pca_ownership_all.pdf"))


#scale_colour_gradient2 uses two color for diverging color scheme
pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=duration))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point(size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient2(midpoint=8, low="blue", mid="white", high="red",  space = "Lab", na.value = "grey50", guide = "colourbar") + theme_bw()


#scale_colour_gradientn uses multiple colors for gradient color
# topo.colors(10)  terrain.colors(10)  rainbow(10)


pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=as.numeric(date.month)))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point(size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient2(midpoint=6, low=muted("red"), high=muted("blue")) + theme_bw()
  
pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=season))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point( size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_brewer(palette = "Set1")+theme_bw()
ggsave(paste0("./pdfs/", taxonlevel,"_pca_all_seasons.pdf"))
  
pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=humid))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point( size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradientn(colours = topo.colors(10)) +theme_bw()
ggsave(paste0("./pdfs/", taxonlevel,"_pca_all_humid.pdf"))

pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=temperature))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point( size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient2(midpoint=25, low="blue", mid="white", high="red",  space = "Lab", na.value = "grey50", guide = "colourbar")  +theme_bw()
ggsave(paste0("./pdfs/", taxonlevel,"_pca_all_temperature.pdf"))

pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=weekend))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point( size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5)  +theme_bw()
ggsave(paste0("./pdfs/", taxonlevel,"_pca_all_weekend.pdf"))

#========PCA Campushome plots

pca1.2 = ggplot(campushome_single, aes(x=PC1, y=PC2, label=location, color=as.numeric(date.month)))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point(size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red") + theme_bw()
 ggsave(paste0("./pdfs/", taxonlevel,"_pca_campushome_month.pdf"))

pca1.2 = ggplot(campushome_single, aes(x=PC1, y=PC2, label=location, color=season))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point( size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_brewer(palette = "Set1")+theme_bw()
ggsave(paste0("./pdfs/", taxonlevel,"_pca_campushome_season.pdf"))
    
pca1.2 = ggplot(campushome_single, aes(x=PC1, y=PC2, label=location, color=humid))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point( size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red") +theme_bw() 
ggsave(paste0("./pdfs/", taxonlevel,"_pca_campushome_humid.pdf"))

pca1.2 = ggplot(campushome_single, aes(x=PC1, y=PC2, label=location, color=duration))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point(size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red", space = "Lab") + theme_bw()
ggsave(paste0("./pdfs/", taxonlevel,"_pca_campushome_duration.pdf")) 

pca1.2 = ggplot(campushome_single, aes(x=PC1, y=PC2, label=location, color=temperature))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point(size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red", space = "Lab") + theme_bw()
ggsave(paste0("./pdfs/", taxonlevel,"_pca_campushome_temperature.pdf")) 

pca1.2 = ggplot(campushome_single, aes(x=PC1, y=PC2, label=location, color=weekend))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point(size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5)  + theme_bw()
ggsave(paste0("./pdfs/", taxonlevel,"_pca_campushome_weekend.pdf")) 

#===============PCA Mike plots
  
pca1.2 = ggplot(Mike_single, aes(x=PC1, y=PC2, label=location, color=as.numeric(date.month)))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point(size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red") + theme_bw()
 ggsave(paste0("./pdfs/", taxonlevel,"_pca_Mike_month.pdf"))

pca1.2 = ggplot(Mike_single, aes(x=PC1, y=PC2, label=location, color=season))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point( size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_brewer(palette = "Set1")+theme_bw()
ggsave(paste0("./pdfs/", taxonlevel,"_pca_Mike_season.pdf"))
    
pca1.2 = ggplot(Mike_single, aes(x=PC1, y=PC2, label=location, color=humid))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point( size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red") +theme_bw() 
ggsave(paste0("./pdfs/", taxonlevel,"_pca_Mike_humid.pdf"))

pca1.2 = ggplot(Mike_single, aes(x=PC1, y=PC2, label=location, color=duration))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point(size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red", space = "Lab") + theme_bw()
ggsave(paste0("./pdfs/", taxonlevel,"_pca_Mike_duration.pdf")) 

pca1.2 = ggplot(Mike_single, aes(x=PC1, y=PC2, label=location, color=temperature))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point(size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red", space = "Lab") + theme_bw()
ggsave(paste0("./pdfs/", taxonlevel,"_pca_Mike_temperature.pdf")) 

pca1.2 = ggplot(Mike_single, aes(x=PC1, y=PC2, label=location, color=weekend))
pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) +
  geom_point(size = 3.5) +
  geom_text(size = 4, nudge_y = 0.5)  + theme_bw()
ggsave(paste0("./pdfs/", taxonlevel,"_pca_Mike_weekend.pdf")) 
```


##PCA 3D plots=====================PCA1.3 plots
```{r PCA 3D plots}
# 3D PCA
library(rgl)

plot3d(scores[, 1:3], type='s',radius=0.5,  col=as.integer(scores$season)+2, box=FALSE)
text3d(scores[, 1:3], texts = location, adj=1.3)
legend3d("topright", legend=as.factor(scores$season), pch=16, col=as.integer(scores$season)+2)
legend3d("toprgiht", legend=c("spring", "winter", "summer", "fall"), pch=16, col=(as.integer(scores$season)+2))

library(plot3D)
with(scores, scatter3D(x = PC1, y = PC2, z = PC3, colvar = as.integer(ownership), ticktype = "detailed",
	phi=0, bty ="g", pch =20, xlab="PC1" , ylab="PC2", zlab="PC3", cex =1.5, clab = c("Ownership")))
text3D(x = scores$PC1, y = scores$PC2, z = scores$PC3, labels = scores$location, add =TRUE, colkey = FALSE, cex =1.0)
```


# plot heatmap=============
```{r heatmap}

setwd("~/Documents/Bioinfo/DNAformal/")


plot_aheatmap_genus = function(x, annotate.df, prefix="",main=NULL){
  
  #--------------
  # goi.obj: ExpressionSet object containg expression data(log cpm) of gene of interests and phenoData
  # main: title of plot
  #---------------	
  if (is.null(main)) main = ""
  
  wval = 12
  hval = 15
  # add color to col and row
  # CellType = rownames(x)
 # ann.col = data.frame("Humid" = annotate.df$X_Mean_Humidity, "Temperature" = annotate.df$Min_TemperatureC, "Particle" = annotate.df$total.particle, "Weekend" = annotate.df$weekend, "Season" = annotate.df$season, "Batch" = annotate.df$batch, rain = as.factor(annotate.df$is_there_rain), Precipitation = annotate.df$Precipitationmm)
   ann.col = data.frame(Season = annotate.df$season,
                        Geo = annotate.df$geo2,
                        Humid = as.numeric(annotate.df$X_Mean_Humidity),
                        Temperature = annotate.df$Mean_TemperatureC, 
                        Particle = annotate.df$total.particle, 
                        Duration =annotate.df$duration)
   if (mole.type == "DNA"){
   ann.row = data.frame(Taxon = c("Fungi", "Fungi", "Viridiplantae", "Bacteria","Bacteria", "Bacteria", "Metazoa", "Bacteria", "Metazoa", "Metazoa", "Bacteria", "Metazoa", "Bacteria", "Metazoa", "Viridiplantae", "Alveolata", "Bacteria", "Bacteria", "Archaea", "Bacteria"))
   ann.row$Taxon = factor(ann.row$Taxon, levels = c("Bacteria", "Fungi", "Viridiplantae", "Metazoa","Alveolata", "Archaea"))
   }
   if (mole.type == "RNA"){
     ann.row = data.frame(Taxon = c("Bacteria","Bacteria", "Bacteria","Fungi",  "Viridiplantae", "Fungi", "Bacteria", "Bacteria","Bacteria","Bacteria", "Metazoa",  "Bacteria", "Bacteria", "Metazoa", "Bacteria", "Bacteria", "Bacteria", "Bacteria", "Metazoa", "Bacteria"))
   ann.row$Taxon = factor(ann.row$Taxon, levels = c("Bacteria", "Fungi", "Viridiplantae", "Metazoa","Alveolata", "Archaea"))
   } 
     
  # Specify colors
  # Var1 = rainbow(length(unique(CellType)),alpha=0.75)
  # names(Var1) = c(unique(CellType))
  # ann.colors = list(CellType = Var1)
  seasoncolor = c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")
  taxoncolor = c("#F8766D", "#B79F00", "#00BA38", "#00BFC4", "#619CFF", "#F564E3")
  names(seasoncolor) = c("spring", "summer", "fall", "winter")
  names(taxoncolor) = c("Bacteria", "Fungi", "Viridiplantae", "Metazoa","Alveolata", "Archaea")
  #Var2 = c("lightgreen", "navy")
  ann_colors = list(Season = seasoncolor, Taxon = taxoncolor)
  #color=colorRampPalette(c("#ffeda0", "#feb24c", "#f03b20"))(3)
  color = "-RdBu:50"
  #color = "-RdYlBu:80"
  
  # choose dist and hc methods
  distfunvals = c("euclidean", "pearson", "spearman")
  hclustfunvals = c("centroid", "average", "complete")
  distfunval = distfunvals[1]
  hclustfunval = hclustfunvals[2]
  
  #scale="row" # scale by row
  # NA is different from FALSE for Colv and Rowv, FALSE means clustering is still calcuated just not shown. NA means not doing anything
  
  # change Colv = NA for timelapse
  Colv = NA # add dendrogram to col
  Rowv = NA # add dendrogram to row
  fileName = paste(prefix,"_", distfunval, "_", hclustfunval, ".pdf", sep="")
  fileName = paste(format(Sys.time(), "%d_%b_%Y"), fileName, sep="_")
  #pdf(file.path(getwd(), fileName), width=wval, height=hval)
  labcol = as.character(annotate.df$date.mid)
  #aheatmap(x, color = color,distfun = distfunval, hclustfun = hclustfunval,fontsize = 8,main=fileName, filename=fileName)
  
  #set cellwidth = 3 for large number of samples
  aheatmap(x, cellwidth=3, 
           distfun = distfunval, hclustfun = hclustfunval, 
           labCol = NA, 
           annCol = ann.col, annRow = ann.row, annColors = ann_colors,
           Colv=Colv,Rowv=Rowv, color = color, na.color = "#d9d9d9",
           fontsize = 19, filename=fileName, width = 20, height =12,                
           layout = c("dmal", "maldL*"), cexAnn = 1.6)
  # main = main)
  #"|.L*"
}

my.keep = TRUE
if (taxonlevel != "SUPERKINGDOM" ){
my.keep = 2:21 #this is intended for skipping "unclassified", please check
}

temp.asinh.df = master.asinh.cpm.single
temp.asinh.df[temp.asinh.df==0] = NA #set NA

plot_aheatmap_genus(x=temp.asinh.df[my.keep, ], annotate.df = scores_single_complete, prefix=paste0(mole.type,"-", taxonlevel), main=NULL)
```

##Plot hierarchial cluster===============
```{r hierarchial cluster}

#d = vegdist(t(master.asinh.cpm), na.rm = TRUE)  #since I am to plot master.cpm data, should not use master.asinh.spm here)
d = vegdist(t(master.cpm.single), na.rm = TRUE)
dendro = as.dendrogram(hclust(d, method = "average"))
library(ggdendro)
#mole.type
#taxonlevel 
ggdendrogram(dendro, rotate=TRUE, theme_dendro = FALSE)  + ggtitle(paste0(mole.type, "-", taxonlevel, " data clustering_not normalized"))+  theme_pubr()


dend_data = dendro_data(dendro, type = "rectangle")
names(dend_data) 
dendro.labels = dend_data$labels$label
dendro.labels = as.character(dendro.labels)

#this is to extract location information for dendrogram (need updates) what is this for?
dendro.location = scores_single_complete$location[match(dendro.labels, scores_single_complete$samplenames)]
dendro.location = rev(dendro.location)


```

## plot bar plot by the order of hierarichal cluster!=======
```{r barplot}

#this is best working with superkingdom data for now, might expand in the future.
#profile is supposed to be a derivative of scores_single_complete


expobarplot = function(profile, width = 16, height= 4, ratio=1){
  #profile = scores_single_complete
  master.cpm.snap = master.cpm[, profile$samplenames]
  d = vegdist(t(master.cpm.snap), na.rm = TRUE)
  dendro = as.dendrogram(hclust(d, method = "average"))
  library(ggdendro)
  #ggdendrogram(dendro, rotate=TRUE, theme_dendro = FALSE)  + ggtitle(paste0(mole.type, "-", taxonlevel, " data clustering_not normalized"))+  theme_pubr()
  dend_data = dendro_data(dendro, type = "rectangle")
  #these are labels and location label for each sample
  dendro.labels = dend_data$labels$label
  dendro.labels = as.character(dendro.labels)
  dendro.location = scores$location[match(dendro.labels, scores$samplenames)]
  print(dendro.location)
  dendro.geo2 = scores$geo2[match(dendro.labels, scores$samplenames)]
  dendro.date = scores$date.mid[match(dendro.labels, scores$samplenames)]
  
  sum.all = apply(master.cpm.snap,2,sum)
  master.perc = master.cpm.snap #the simplest way to create a data frame of same dimension
  for (i in 1:ncol(master.perc)){
    master.perc[,i] = master.cpm.snap[,i]/sum.all[i]*100
  }

  plot.df = data.frame(t(master.perc))
  plot.df$sampleID = rownames(plot.df)
  plot.df$location = dendro.location
  plot.df$date = dendro.date
  plot.df$geo = dendro.geo2
  plot.df.hcl = plot.df[dendro.labels, ] #clustered dataframe
  plot.df$step = 1:nrow(plot.df) #must need continous variable for area plots
  plot.df.hcl$step = 1:nrow(plot.df.hcl) #must need continuous variable for area plots
  plot2 = melt(plot.df,id.var=c("sampleID","location", "date", "geo", "step"))
  plot2.hcl = melt(plot.df.hcl, id.var=c("sampleID","location", "date", "geo", "step"))
  
  colnames(plot2)[6]=taxonlevel
  colnames(plot2.hcl)[6]=taxonlevel
  #change the colors used for each label
  if (colnames(plot2)[6] == "SUPERKINGDOM") # This is to unify color scheme of DNA and RNA plots
    {
    plot2$SUPERKINGDOM <- factor(plot2$SUPERKINGDOM, levels = c("Bacteria", "Fungi", "Viridiplantae", "Metazoa", "Viruses", "Archaea"))
    plot2.hcl$SUPERKINGDOM <- factor(plot2.hcl$SUPERKINGDOM, levels = c("Bacteria", "Fungi", "Viridiplantae", "Metazoa", "Viruses", "Archaea"))
    }

  #dendro.labels are clustered samples
  #use scale_color_manual for specify defined qualitative color sets!
  p = ggplot(plot2.hcl,aes(x=sampleID,y=value,fill = get(taxonlevel))) +
  geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + 
  scale_x_discrete(name = "Samples", limits=c(dendro.labels)) + ylab("Proportion") +
  theme_pubr(legend = c("top"), base_size = 16, x.text.angle = 45 ) + 
  theme(axis.text.x = element_blank()) + 
  labs(fill = "Taxons")
  print(p)
  ggsave(paste0("~/Documents/Bioinfo/DNAformal/", taxonlevel,"_barplot_clustered.pdf"), height=height, width = width*ratio) #good ratio for wide screen
  
  p.area = ggplot(plot2.hcl,aes(x=step,y=value,fill = get(taxonlevel))) +
  geom_area(position="fill", alpha=1, linetype=0) + 
  theme_pubr(legend = c("top"), base_size = 16, x.text.angle = 45 ) + 
  theme(axis.text.x = element_blank()) + 
  scale_x_continuous(expand = c(0, 0))+
  labs(x= "Samples", y="Proportion", fill = "Taxons")
  print(p.area)
  ggsave(paste0("~/Documents/Bioinfo/DNAformal/", taxonlevel,"_barplot_clustered_area.pdf"), height=height, width = width*ratio) #good ratio for wide screen
#timeline
  q = ggplot(plot2,aes(x=sampleID, y=value,fill = get(taxonlevel))) +
  geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))+ labs(y = "Proportion", fill="Taxons") +
    scale_x_discrete(name = "Samples", limits=c(profile$samplenames), labels= c(profile$date.mid)) +
    theme_pubr(legend = c("top"), base_size = 16) +
     theme(axis.text.x = element_blank())
  print(q)
  ggsave(paste0("~/Documents/Bioinfo/DNAformal/", taxonlevel,"_barplot_timeline.pdf"), height= height, width = width*ratio)
  q.area = ggplot(plot2,aes(x=step, y=value,fill = get(taxonlevel))) +
  geom_area(position = "fill") + 
      theme_pubr(legend = c("top"), base_size = 16, x.text.angle = 45 ) + 
  theme(axis.text.x = element_blank()) + 
      scale_x_continuous(expand = c(0, 0))+
  labs(x= "Samples", y="Proportion", fill = "Taxons")
  print(q.area)
  ggsave(paste0("~/Documents/Bioinfo/DNAformal/", taxonlevel,"_barplot_timeline_area.pdf"), height= height, width = width*ratio) 
  return(list(cluster=p, cluster.area=p.area, timeline=q, timeline.area=q.area, locations= dendro.location))
}

mike.snap = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2017-06-01", "%Y-%m-%d" ))
p1.snap = subset(scores_single_complete, ownership == "No1_1636-69-063" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2017-06-01", "%Y-%m-%d" ))
p3.snap = subset(scores_single_complete, ownership == "No3_San Mateo" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2017-06-01", "%Y-%m-%d" ))


scores.snap = scores_single_complete

#formal plots
expobarplot(profile = scores.snap)

mike.snap = Mike_single
locations = expobarplot(profile = mike.snap, ratio = nrow(mike.snap)/nrow(scores.snap))


locations1 = expobarplot(profile = p1.snap, ratio = nrow(p1.snap)/nrow(scores.snap)+0.06)


locations2 = expobarplot(profile = p3.snap, ratio = nrow(p3.snap)/nrow(scores.snap)+0.06)

#old codes
p = ggplot(plot2,aes_string(x=colnames(plot2)[1],y=colnames(plot2)[3],fill = taxonlevel)) +
  geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + 
  theme(axis.text.x = element_text(angle = 90, hjust=1))+ scale_x_discrete(name = "Samples", limits=c(dendro.labels)) +
   scale_fill_manual(values=c("#F3C300", "#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26"))
p
ggsave(paste0("./pdfs/", taxonlevel,"_barplot_all_distinct.pdf")) 


p = ggplot(plot2,aes_string(x=colnames(plot2)[1],y=colnames(plot2)[3],fill = taxonlevel)) +
  geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + 
  theme(axis.text.x = element_text(angle = 90, hjust=1))+ 
  scale_x_discrete(name = "Samples", limits=c(rownames(scores_single)), labels= c(scores_single$date.mid)) + scale_fill_manual(values=c("#F3C300", "#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26"))
p
ggsave(paste0("./pdfs/", taxonlevel,"_barplot_all_timeline_distinct.pdf")) 

#change labels to dates
#codes to rescale the dateframe according things looked at
check = match(plot2$sampleID, rownames(campushome_single))
plot2.campushome_single = plot2[which(!is.na(check)),]
rownames(plot2.campushome_single) = seq(1:dim(plot2.campushome_single)[1])

p = ggplot(plot2.campushome_single,aes_string(x=colnames(plot2)[1],y=colnames(plot2)[3],fill = taxonlevel)) +
  geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + 
  theme(axis.text.x = element_text(angle = 90, hjust=1))+ 
  scale_x_discrete(name = "Samples", limits=c(rownames(campushome_single)), labels= c(campushome_single$date.mid) ) 
p
ggsave(paste0("./pdfs/", taxonlevel,"_barplot_campushome_timeline.pdf")) 

p = ggplot(plot2.campushome_single,aes_string(x=colnames(plot2)[1],y=colnames(plot2)[3],fill = taxonlevel)) +
  geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + 
  theme(axis.text.x = element_text(angle = 90, hjust=1))+ 
  scale_x_discrete(name = "Samples", limits=c(rownames(campushome_single)), labels= c(campushome_single$date.mid) )  + scale_fill_manual(values=c("#F3C300", "#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26"))
p
ggsave(paste0("./pdfs/", taxonlevel,"_barplot_campushome_timeline_distinct.pdf")) 


#codes to rescale the dateframe according things looked at
check = match(plot2$sampleID, rownames(Mike_single))
plot2.Mike_single = plot2[which(!is.na(check)),]
rownames(plot2.Mike_single) = seq(1:dim(plot2.Mike_single)[1])

p = ggplot(plot2.Mike_single,aes_string(x=colnames(plot2)[1],y=colnames(plot2)[3],fill = taxonlevel)) +
  geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + theme_gray(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, hjust=1))+ 
  scale_x_discrete(name = "Samples", limits=c(rownames(Mike_single)), labels= c(Mike_single$date.mid)) + ylab("Proportion") 
p
ggsave(paste0("./", taxonlevel,"_barplot_Mike_timeline.pdf"), height=8, width = 30) 

p = ggplot(plot2.Mike_single,aes_string(x=colnames(plot2)[1],y=colnames(plot2)[3],fill = taxonlevel)) +
  geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + 
  theme(axis.text.x = element_text(angle = 90, hjust=1))+ 
  scale_x_discrete(name = "Samples", limits=c(rownames(Mike_single)), labels= c(Mike_single$date.mid)) + scale_fill_manual(values=c("#F3C300", "#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26"))
p
ggsave(paste0("./pdfs/", taxonlevel,"_barplot_Mike_timeline_distinct.pdf")) 
```

#cheminfo==============
```{r cheminfo}
setwd("~/Documents/Bioinfo/DNAformal")
#==================parsing chemi-info list, many samples are still missing as of now, needs more sequencing

infile = read.csv("../chemi.csv", stringsAsFactors = FALSE)
infile$filter.start=as.Date(infile$filter.start, "%m/%d/%y")
infile$filter.collect=as.Date(infile$filter.collect, "%m/%d/%y")

test.n = c()
for (i in 1:length(infile$filter.start)){
  for (j in 1:sum(!is.na(environ.mike$date.start))){
    if (infile$filter.start[i] == environ.mike$date.start[j]){
      print(c(infile$filter.start[i], environ.mike$date.start[j]))
      print(c(infile$comments[i], environ.mike$location[j], environ.mike$Filter_No[j]))
      test.n = c(test.n, environ.mike$Filter_No[j])
    }
  }
}
test.n
chemilist = c(test.n[1:2], "205",test.n[3:length(test.n)]) #205 is the background control
infile = cbind(infile, Filter_No=chemilist)
#these are the confirmed filter number of interests

#chemilist %in% scores_single_complete$Filter_No
scores_chemi = scores_single_complete[which(scores_single_complete$Filter_No %in% chemilist),]

chemiprofile = read.csv("~/Documents/Bioinfo/places/chemiprofile.csv", stringsAsFactors=FALSE)
#chemicals and corresponding IDs with checkable names
validchem = chemiprofile$MetabID[chemiprofile$MetabID != ""]
validids = chemiprofile$Primary.ID[chemiprofile$MetabID != ""]
chemi.meta = data.frame(ID=chemiprofile$Primary.ID, mz=chemiprofile$mz, metaID = chemiprofile$MetabID, retention=chemiprofile$rt, fold=chemiprofile$Fold_Enrich)

rownames(chemiprofile) = chemiprofile$Primary.ID
cal.df = chemiprofile[,6:(dim(chemiprofile)[2]-3)]
cal.df = t.data.frame(cal.df)
group = cal.df[,1]
cal.df = apply(cal.df[,-1], 2, as.numeric) #this makes it matrix again
cal.df = data.frame(cal.df)
cal.df$group = c(group)
result = split(cal.df, f = cal.df$group)
#add each vector into a list, then combine them from list into dataframe
new.cal.list = list()
for (i in 1:length(result)){
    #apply function aways covert your data to matrix, therefore foring all data into one class!!
    temp = apply(result[[i]][,1:(dim(result[[i]])[2]-1)], 2, mean)
    #print(unique(result[[i]]$group))
    #vectors , just like Matrix, CAN ONLY HAVE ONE TYPE!
    temp = data.frame(temp)
    colnames(temp) =  as.character(unique(result[[i]]$group))
    #print(temp[length(temp)])
    new.cal.list[[i]] = temp
}
new.cal.df = as.data.frame(new.cal.list) #all chemialcs being rows, all samples being columns
new.cal.check = new.cal.df
colnames(new.cal.check) = infile$X  #looks ok
colnames(new.cal.df) = infile$Filter_No
rownames(new.cal.df) = chemiprofile$Primary.ID[-1]
chemi.environ = join(scores_chemi, infile, by="Filter_No")
#HOTFIXES
chemi.environ$location[18] = "Davis-CA"
chemi.environ$location[10] = "Montana"
#use chemi.environ for meta information

#ordering samples in chronic order, because samples are arranged so in chemi.environ
chemi.profile = new.cal.df[, chemi.environ$Filter_No]
chemi.profile.check = chemi.profile
colnames(chemi.profile.check) = chemi.environ$X

colnames(chemi.profile) = chemi.environ$samplenames
chem.species.profile = master.cpm.single[, chemi.environ$samplenames]
#used 100
chem.species.profile = chem.species.profile[apply(chem.species.profile, 1, sd) > 10 , ]
#threshold 1000
chemi.profile = chemi.profile[apply(chemi.profile, 1, sd) > 1000, ]
#filtering the uninterested ones
#chemi.profile.filter = chemi.profile[rowSums(chemi.profile == 0) <= 4, ]
#chem.species.profile.filter = chem.species.profile[rowSums(chem.species.profile == 0) <= 8,]
chemi.profile.filter = chemi.profile[rowSums(chemi.profile == 0) <= 4, ]
chem.species.profile.filter = chem.species.profile[rowSums(chem.species.profile == 0) <= 8,]

#lm.test = lm(chem.species.profile.filter[1,] ~., data = data.frame(t(chem.species.profile.filter)))


pca1 = dudi.pca(t(chemi.profile.filter), scal = TRUE, scann = FALSE, nf=3)
pca2 = dudi.pca(t(chem.species.profile.filter), scal = TRUE, scann = FALSE, nf=3)
#this step is slow, try not to run it
#set.seed = 12345
set.seed = 1234
rv1 = RV.rtest(pca1$tab, pca2$tab,499)
rv1

#mess around
if(FALSE){
X = pca1$tab
Y = pca2$tab
S.xy <- cov(X, Y)
S.xx <- var(X)
S.yx <- cov(Y, X)
S.yy <- var(Y)
A <- eigen(solve(S.xx) %*% S.xy %*% solve(S.yy) %*% S.yx)$vectors
B <- eigen(solve(S.yy) %*% S.yx %*% solve(S.xx) %*% S.xy)$vectors
R <- sqrt(eigen(solve(S.yy) %*% S.yx %*% solve(S.xx) %*% + S.xy)$values)
}

chemi.profile.filter = log(chemi.profile.filter+1, base=10)#log transformation is typical in chemical world
chem.species.profile.filter = asinh(chem.species.profile.filter)

ccaRes = PMA::CCA(t(chem.species.profile.filter), t(chemi.profile.filter), penaltyx = 0.20, penaltyz = 0.15)
ccaRes

combined = cbind(t(chem.species.profile.filter[ccaRes$u != 0, ]), t(chemi.profile.filter[ccaRes$v != 0, ]))

#everything, without sCCA selection
combined.original = cbind(t(chem.species.profile.filter), t(chemi.profile.filter))
#pcaRes = dudi.pca(combined.original, scannf = FALSE, nf = 3)
chem_first_batch = c("chaoexpo22_I12",  "chaoexpo51_I6"  , "chaoexpo56_I5",   "chaoexpo44_I15",  "chaoexpo28_I15")

#function for drawing all plots)
chempca = function(pcaRes=pcaRes,eclispe=0.7, base.size = 14, ratio=2, ind.alpha = 0.8, bi.alpha=0.6){
  
scree = fviz_screeplot(pcaRes, addlabels = TRUE )
print(scree)

#working plots
ind.1 = fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$season[6:20],  pointsize=5, pointshape=21, palette="npg", alpha.ind = ind.alpha, addEllipses =TRUE, ellipse.level = eclispe ,invisible = "quali")  +coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) +  geom_text_repel(aes(label = chemi.environ$location[6:20]), size=4) + theme_pubr(margin = FALSE, base_size = base.size) + labs(fill = "Season", color = "Season") + scale_fill_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) + scale_color_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF"))
print(ind.1)

ind.1.clear = fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$season[6:20],  pointsize=5, pointshape=21, palette="npg", alpha.ind = ind.alpha, addEllipses =TRUE, ellipse.level = eclispe ,invisible = "quali" )  +coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + theme_pubr(margin = FALSE, base_size= base.size) + labs(fill = "Season", color = "Season") + scale_fill_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) + scale_color_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF"))
print(ind.1.clear)

ind.1.path =fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$season[6:20],  pointsize=5, pointshape=21, palette="npg", alpha.ind = ind.alpha-0.2, addEllipses =TRUE, ellipse.level = eclispe ,invisible = "quali" ) +
 geom_segment(aes(xend=c(tail(pcaRes$li$Axis1, n=-1), NA), yend=c(tail(pcaRes$li$Axis2, n=-1), NA)),
      arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"),  linetype=1, size=1, alpha=0.7,color = c(rep("#3C5488FF",6), rep("#00A087FF",9))) +
   coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + theme_pubr(margin = FALSE, base_size = base.size) + labs(fill = "Season", color = "Season")  + scale_fill_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) + scale_color_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF"))
print(ind.1.path)  


ind.2 = fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$location[6:20],  pointsize=5, pointshape=21, alpha.ind = ind.alpha, addEllipses =TRUE, ellipse.level = eclispe,invisible = "quali" )  +coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) +  geom_text_repel(aes(label = chemi.environ$comments[6:20]), size=4) + theme_pubr(margin = FALSE, base_size = base.size) + scale_fill_npg() + scale_color_npg()  + 
  labs(fill = "Location", color = "Location")
print(ind.2)

ind.2.clear = fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$location[6:20],  pointsize=5, pointshape=21, alpha.ind = ind.alpha, addEllipses =TRUE, ellipse.level = eclispe,invisible = "quali" )  +coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) +  
theme_pubr(margin = FALSE, base_size = base.size) + scale_fill_npg() + scale_color_npg() + 
  labs(fill = "Location", color = "Location")
print(ind.2.clear)


ind.3 = fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$season[6:20],  pointsize=5, pointshape=21, palette="npg", alpha.ind = ind.alpha, addEllipses =TRUE, ellipse.level = eclispe,invisible = "quali" ) +coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) +  geom_text_repel(aes(label = as.character(chemi.environ$date.start[6:20])), size=4) + theme_pubr(margin = FALSE, base_size = base.size) + 
  labs(fill = "Season", color = "Season")   + scale_fill_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) + scale_color_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF"))
print(ind.3)

ind.4 = fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$location[6:20],  pointsize=5, pointshape=21, palette="npg", alpha.ind = ind.alpha, addEllipses =TRUE, ellipse.level = eclispe,invisible = "quali" )  +coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) +  geom_text_repel(aes(label = as.character(chemi.environ$date.start[6:20])), size=4) + theme_pubr(margin = FALSE, base_size = base.size) + scale_fill_npg() + scale_color_npg() + labs(fill = "Location", color = "Location")  
print(ind.4)

chem.special = c("PM1888", "PM0594", "PM1527", "PM1013", "PM0006", "PM1180", "PM3175", "PM3177", "NM1772", "Tricholoma", "Trametes", "Serpula","Psychrobacter", "Fomitopsis", "Pseudogymnoascus", "Ophiognomonia", "Phlebiopsis", "Gymnopus", "Lentinus", "Coniosporium")
names.features = c(head(rownames(pcaRes$c1)[order(pcaRes$c1$CS1)],10),tail(rownames(pcaRes$c1)[order(pcaRes$c1$CS1)],20),
               head(rownames(pcaRes$c1)[order(pcaRes$c1$CS2)],7),tail(rownames(pcaRes$c1)[order(pcaRes$c1$CS2)],14)
               )
names.features = unique(names.features)
gradient.color = c("#3C5488FF","#4DBBD5FF","#E64B35FF")

var = get_pca_var(pcaRes)
  #another way of picking names, based on contributions to axes
  var$contrib = var$contrib[order(var$contrib[,1]^2,var$contrib[,2]^2, var$contrib[,3]^2, decreasing = TRUE),]
  #names.features = row.names(var$contrib[1:50,])
  mm = corrplot::corrplot(var$contrib[1:50,], is.corr=FALSE, cl.align.text = "l")
  contrib = fviz_contrib(pcares, choice = "var", axes = 1:3, top = 100,ggtheme = theme_pubr(base_size = 6))
print(contrib)

#select.var= list(name=names),
var.plot = fviz_pca_var(pcaRes,  
             geom=c("text","arrow"),
             select.var= list(name=c(names.features, chem.special)), 
             col.var="contrib",
             labelsize = 4,
             #gradient.cols = "npg",
             gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"),
             repel = TRUE) + theme_pubr(margin = FALSE, base_size = base.size) #plot features(loadings)
print(var.plot)

names.biplot = c(head(rownames(pcaRes$c1)[order(pcaRes$c1$CS1)],4),tail(rownames(pcaRes$c1)[order(pcaRes$c1$CS1)],8),
               head(rownames(pcaRes$c1)[order(pcaRes$c1$CS2)],3),tail(rownames(pcaRes$c1)[order(pcaRes$c1$CS2)],6)
               )

#biplot
bi.plot = fviz_pca_biplot(pcaRes, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = chemi.environ$season[6:20], 
                #col.ind = fourpeople.select$ownership,
                select.var= list(name = c(names.biplot,chem.special)),
                pointshape = 21, pointsize = 6,
                palette = "npg",
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.4 , 
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = gradient.color,
                label.size = 5,
                addEllipses = FALSE,
                #ellipse.type = "euclid",
                ellipse.level = 0.80,
                ellipse.alpha = 0.4,
                invisible = "quali",
                #mean.point = FALSE
                )+
  labs(fill = "Season", color = "Contribution", alpha = "Contribution") +
   #coord_fixed(sqrt(pcaRes$eig[2] / pcaRes$eig[1]))  +
  coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio))  + 
 #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4)  + 
    geom_text_repel(aes(label = as.character(chemi.environ$date.start[6:20])), size=4, nudge_y=1)  + 
 geom_segment(aes(xend=c(tail(pcaRes$li$Axis1, n=-1), NA), yend=c(tail(pcaRes$li$Axis2, n=-1), NA)),
      arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"),  linetype=1, size=1, alpha=0.7,color = c(rep("#3C5488FF",6), rep("#E64B35FF",9)))+
  theme_pubr(margin = FALSE, base_size = base.size) 
#use geom_segment to ensure there is an arrow for every segment
print(bi.plot)

bi.plot2 = fviz_pca_biplot(pcaRes, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = chemi.environ$location[6:20], 
                #col.ind = fourpeople.select$ownership,
                select.var= list(name = c(names.biplot,chem.special)),
                pointshape = 21, pointsize = 6,
                palette = "npg",
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.4 , 
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = gradient.color,
                label.size = 5,
                #addEllipses = TRUE,
                #ellipse.type = "euclid",
                #ellipse.level = 0.60,
                invisible = "quali",
                #mean.point = FALSE
                )+
  labs(fill = "Location", color = "Contribution", alpha = "Contribution") +
 #coord_fixed(sqrt(pcaRes$eig[2] / pcaRes$eig[1]))  +
  coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio))   +
 #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4)  + 
    geom_text_repel(aes(label = as.character(chemi.environ$date.start[6:20])), size=4, nudge_y = 1)  + 
 geom_segment(aes(xend=c(tail(pcaRes$li$Axis1, n=-1), NA), yend=c(tail(pcaRes$li$Axis2, n=-1), NA)),color="#3C5488FF",
      arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"),  linetype=1, size=1, alpha=0.7)+
   scale_fill_d3(palette = "category20") +
  theme_pubr(margin = FALSE, base_size = base.size) 
#use geom_segment to ensure there is an arrow for every segment
print(bi.plot2)

bi.plot.clear = fviz_pca_biplot(pcaRes, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = chemi.environ$season[6:20], 
                #col.ind = fourpeople.select$ownership,
                select.var= list(name = c(names.biplot,chem.special)),
                pointshape = 21, pointsize = 6,
                palette = "npg",
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.4 , 
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = gradient.color,
                label.size = 5,
                addEllipses = TRUE,
                #ellipse.type = "euclid",
                ellipse.level = 0.80,
                ellipse.alpha = 0.3,
                invisible = "quali",
                #mean.point = FALSE
                )+
  labs(fill = "Season", color = "Contribution", alpha = "Contribution") +
  #coord_fixed(sqrt(pcaRes$eig[2] / pcaRes$eig[1]))  +
  coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio))   +
 geom_segment(aes(xend=c(tail(pcaRes$li$Axis1, n=-1), NA), yend=c(tail(pcaRes$li$Axis2, n=-1), NA)),
      arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"),  linetype=1, size=1, alpha=0.7,color = c(rep("#4DBBD5FF",6), rep("#E64B35FF",9)))+
  theme_pubr(margin = FALSE, base_size = base.size) 
#use geom_segment to ensure there is an arrow for every segment
print(bi.plot.clear)

bi.plot2.clear = fviz_pca_biplot(pcaRes, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = chemi.environ$location[6:20], 
                #col.ind = fourpeople.select$ownership,
                select.var= list(name = c(names.biplot,chem.special)),
                pointshape = 21, pointsize = 6,
                palette = "npg",
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.4 , 
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = gradient.color,
                label.size = 5,
                 addEllipses = TRUE,
                #ellipse.type = "euclid",
                ellipse.level = 0.80,
                ellipse.alpha = 0.3,
                invisible = "quali",
                #mean.point = FALSE
                )+
  labs(fill = "Location", color = "Contribution", alpha = "Contribution")+
  #coord_fixed(sqrt(pcaRes$eig[2] / pcaRes$eig[1]))  +
  coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio))  +
   scale_fill_npg() +
  geom_segment(aes(xend=c(tail(pcaRes$li$Axis1, n=-1), NA), yend=c(tail(pcaRes$li$Axis2, n=-1), NA)),color="#3C5488FF",
      arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"),  linetype=1, size=1, alpha=0.7)+
  theme_pubr(margin = FALSE, base_size = base.size) 
#use geom_segment to ensure there is an arrow for every segment
print(bi.plot2.clear)

bi.plot3.clear = fviz_pca_biplot(pcaRes, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = chemi.environ$location[6:20], 
                #col.ind = fourpeople.select$ownership,
                select.var= list(name = c(names.biplot,chem.special)),
                pointshape = 21, pointsize = 6,
                palette = "npg",
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.4 , 
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = gradient.color,
                label.size = 5,
                addEllipses = TRUE,
                #ellipse.type = "euclid",
                ellipse.level = 0.80,
                ellipse.alpha = 0.3,
                invisible = "quali",
                alpha.ind = 0.8
                #mean.point = FALSE
                )+
  labs(fill = "Location", color = "Contribution", alpha = "Contribution")+
  #coord_fixed(sqrt(pcaRes$eig[2] / pcaRes$eig[1]))  +
  coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio))  +
   scale_fill_npg() +
  theme_pubr(margin = FALSE, base_size = base.size) 
#use geom_segment to ensure there is an arrow for every segment
print(bi.plot3.clear)

bi.plot4.clear = fviz_pca_biplot(pcaRes, 
                # Individuals
                geom.ind = "point",
                geom.var = c("arrow", "text"),
                fill.ind = chemi.environ$season[6:20], 
                #col.ind = fourpeople.select$ownership,
                select.var= list(name = c(names.biplot,chem.special)),
                pointshape = 21, pointsize = 6,
                palette = "npg",
                #addEllipses = TRUE,
                # Variables
                alpha.var =0.4 , 
                repel = TRUE,
                col.var = "contrib",
                gradient.cols = gradient.color,
                label.size = 5,
                addEllipses = TRUE,
                #ellipse.type = "euclid",
                ellipse.level = 0.80,
                ellipse.alpha = 0.3,
                invisible = "quali",
                #mean.point = FALSE
                )+
  labs(fill = "Season", color = "Contribution", alpha = "Contribution")+
  #coord_fixed(sqrt(pcaRes$eig[2] / pcaRes$eig[1]))  +
  coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio))  +
  theme_pubr(margin = FALSE, base_size = base.size) 
#use geom_segment to ensure there is an arrow for every segment
print(bi.plot4.clear)

plots = list(scree = scree, contrib = contrib, ind.1=ind.1, ind.1.clear = ind.1.clear, ind.1.path = ind.1.path, ind.2=ind.2, ind.2.clear = ind.2.clear, ind.3=ind.3, ind.4=ind.4, var.plot=var.plot, bi.plot=bi.plot, bi.plot2=bi.plot2, bi.plot.clear=bi.plot.clear, bi.plot2.clear = bi.plot2.clear, bi.plot3.clear = bi.plot3.clear, bi.plot4.clear=bi.plot4.clear)
return(plots)
}
#all features
#assign weights based on number of features between biotics and abiotics
col.weight = c(rep(sqrt(dim(chemi.profile.filter)[1]/dim(chem.species.profile.filter)[1]), dim(chem.species.profile.filter)[1]), rep(1, dim(chemi.profile.filter)[1]))

pcaRes = dudi.pca(combined.original[6:20,], scale = TRUE,scannf = FALSE, nf = 3, col.w = col.weight)
#ignore weights
pcaRes = dudi.pca(combined.original[6:20,], scale = TRUE,scannf = FALSE, nf = 3)

#cca features
pcaRes = dudi.pca(combined[6:20,], scale = TRUE,scannf = FALSE, nf = 3)

#DNA only
pcaRes = dudi.pca(t(chem.species.profile.filter)[6:20,], scale = FALSE,scannf = FALSE, nf = 3)

#chemical only
pcaRes = dudi.pca(t(chemi.profile.filter)[6:20,], scale = FALSE,scannf = FALSE, nf =3)


chem.plots = chempca(pcaRes, eclispe = 0.68, ratio = 6, base.size=14)


```
##c-means clustering of chemical stuff
```{r c-means cluster of chemicals}

#clustering based on season
#chemi_complete = cbind.data.frame(t.data.frame(chemi.profile[, 6:20]), season = chemi.environ[6:20,]$season)
#chemi.season = aggregate(. ~ season, chemi_complete, mean)
#chemi.season = chemi.season[, -1]

#clustering all
chemi.season = t.data.frame(chemi.profile[, 6:20])
chemi.season = chemi.season[, colSums(chemi.season)>0]
chemi.var = apply(chemi.season, 2, var)
var.threshold = quantile(chemi.var, 0.10) #quantile 10% variance filter
chemi.season = chemi.season[, apply(chemi.season,2,var)>=var.threshold] #variance filter
weight = c(apply(chemi.season, 2, var))
chemi.season.scale = scale(chemi.season)
rownames(chemi.season.scale) = chemi.environ[6:20,]$location
wtf = t(chemi.season.scale)

#determining the number of clusters
# Elbow method
fviz_nbclust(wtf, kmeans, method = "wss") +
    geom_vline(xintercept = 6, linetype = 2)+ #this is hardcoded
  labs(subtitle = "Elbow method")
# Silhouette method
fviz_nbclust(wtf, kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette method")
# Gap statistic
# nboot = 50 to keep the function speedy. 
# recommended value: nboot= 500 for your analysis.
# Use verbose = FALSE to hide computing progression.
set.seed(123)
fviz_nbclust(wtf, kmeans, nstart = 25,  method = "gap_stat", nboot = 100)+
  labs(subtitle = "Gap statistic method")

set.seed(33) #for 5 clusters
cm = cmeans(wtf, center=5, iter.max=500)
table(cm$cluster)
#cm$cluster = factor(cm$cluster, levels=c(1:length(table(cm$cluster))))
fviz_cluster(list(data = wtf, cluster=cm$cluster), 
             geom = c("point"),
             ellipse = TRUE,
             ellipse.alpha = 0.6, #used to be 0.6 if only points are plotted.
             ellipse.type = "norm",
             ellipse.level = 0.68,
             palette = "npg",
             repel = TRUE) + theme_pubr(base_size = 18) + 
  scale_color_manual(values=c("1"= "#3C5488FF","2"="#4DBBD5FF","5"="#E64B35FF","4"="#00A087FF", "3"="#F39B7FFF"))+ 
  scale_fill_manual(values=c("1"= "#3C5488FF","2"="#4DBBD5FF","5"="#E64B35FF","4"="#00A087FF", "3"="#F39B7FFF"))
  
colnames(wtf) = c(1:15)
tempp = cbind.data.frame(wtf, weight=weight, cluster=cm$cluster, membership=apply(cm$membership, 1, max), taxon = row.names(wtf))
df.season.scale.group = tempp[tempp$membership>0.65,]
table(df.season.scale.group$cluster)
df.season.melt = melt(df.season.scale.group, id.vars = c("cluster","membership", "taxon","weight"))

#df.season.scale.group = df.season.scale[, cm$cluster==3 & cm$membership[,3]>=0.7]
#df.season.melt = melt(df.season.scale.group)
#colnames(df.season.melt) = c("season", "taxon", "value")

df.season.melt$variable = factor(df.season.melt$variable)
df.season.melt$cluster = factor(df.season.melt$cluster)

#+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))
#green, blue, red, dark blue
ggplot(data=df.season.melt, aes(x=variable, y=value, group=taxon, color=cluster, alpha=membership)) +
  #geom_line(stat="smooth",method = "loess", alpha=0.7)+
  geom_line(size=1)+
  geom_hline(yintercept = 0, linetype = 2)+
  scale_x_discrete(label = chemi.environ[6:20,]$location)+
  theme_pubr(base_size = 16, x.text.angle = 45) + labs(y="Scaled Values") + guides(alpha=FALSE, size=FALSE) + 
  scale_color_manual(values=c("1"= "#3C5488FF","2"="#4DBBD5FF","5"="#E64B35FF","4"="#00A087FF", "3"="#F39B7FFF"))+ 
  #scale_color_npg() +
  facet_wrap(~cluster, scales="free_y", ncol=5, strip.position = "top") + 
  theme(strip.background = element_blank(), strip.placement = "outside")

#==============================================================
#plotting chemicals with patterns similar to selected chemicals
chemi.season.names = colnames(chemi.res$r)[abs(chemi.res$r["PM1013", ])>=0.85]
chemi.season.names = colnames(chemi.res$r)[abs(chemi.res$r["PM0006", ])>=0.85]

#plotting chemicals with patterns similar to 3175 and 3177
chemi.season.names = colnames(chemi.res$r)[abs(chemi.res$r["PM3175", ])>=0.85|abs(chemi.res$r["PM3177", ])>=0.80]
df.season.scale.group = tempp[chemi.season.names,]
table(df.season.scale.group$cluster)
df.season.melt = melt(df.season.scale.group, id.vars = c("cluster","membership", "taxon","weight"))
df.season.melt$variable = factor(df.season.melt$variable)
df.season.melt$cluster = factor(df.season.melt$cluster)

ggplot(data=df.season.melt, aes(x=variable, y=value, color=cluster, group = taxon, alpha = membership)) +
  #geom_line(stat="smooth",method = "loess", size=1.5)+
  geom_line(size=1)+
  geom_hline(yintercept = 0, linetype = 2)+
  #geom_smooth(alpha=0.5, size=0.5, se = FALSE)+ 
  scale_x_discrete(label = chemi.environ[6:20,]$location)+
  theme_pubr(base_size = 16, x.text.angle = 45) + labs(y="Scaled Values") + guides(alpha=FALSE, size=FALSE) + scale_color_manual(values=c("1"= "#3C5488FF","2"="#4DBBD5FF","5"="#E64B35FF","4"="#00A087FF", "3"="#F39B7FFF"))


#examing cluster # of compounds of interets
chem.special = c("PM1888", "PM0594", "PM1527", "PM1013", "PM0006", "PM1180", "PM3175", "PM3177", "NM1772",
                 "PM0664", "NM0522", "NM1897", "PM1081", "NM2866", "PM1038", "PM3170", "PM3173")
for (item in chem.special) {
  print(cm$cluster[item])
}

```


##MDS section
```{r MDS of bray-curtis distance}
#MDS for chemi.species.profile
chemi.dist = vegdist(t(asinh(chem.species.profile)), na.rm = TRUE, method= "bray")
chemi.mds = cmdscale(chemi.dist,eig= TRUE)
out=data.frame(k=1:8,eig=chemi.mds$eig[1:8])
ggplot(data=out, aes(x=k, y=eig/sum(chemi.mds$eig)*100)) + geom_bar(stat="identity",width=0.5,color="orange",fill="pink") + labs( x= "Eigenfactors", y = "Percentage of variations explained") + theme_pubr()

#MDS
chemi.dist = vegan::vegdist(t(log(chemi.profile+1, base = 10)), na.rm = TRUE, method= "bray")
chemi.mds = cmdscale(chemi.dist,eig= TRUE)
out=data.frame(k=1:8,eig=chemi.mds$eig[1:8])
ggplot(data=out, aes(x=k, y=eig/sum(chemi.mds$eig)*100)) + geom_bar(stat="identity",width=0.5,color="orange",fill="pink") + labs( x= "Eigenfactors", y = "Percentage of variations explained") + theme_pubr()

MDS = data.frame(PCo1 = chemi.mds$points[,1], PCo2 = chemi.mds$points[,2])
#labs=rownames(chemi.mds$points))
ggplot(data = MDS, aes(x=PCo1, y=PCo2,color = chemi.environ$season, label = chemi.environ$comments ) ) + geom_point(size=3) + geom_text_repel() + theme_pubr()

#plotting mz distributions of PM and NM features
#1968 PM, 1331 NM
chemi.meta = chemi.meta[2:nrow(chemi.meta),]
chemi.meta = data.frame(chemi.meta, mode = c(rep("Positive", 1968), rep("Negative", 1331)))

ggdensity(chemi.meta, x = "mz",
   add = "median", 
   #add.params=list(color="red"),
   rug = TRUE,
   color = "mode", 
   fill = "mode",
   palette = c("#00AFBB", "#E7B800", "#3C5488FF"),
   ) + labs(x="m/z ratio") + theme_pubr(base_size = 16)
wilcox.test(chemi.meta$mz[1:1968], chemi.meta$mz[1969:length(chemi.meta$mz)])
```


##cheminfo-direct correlation test
```{r cheminfo-direct correlation test}
chem.special = c("PM1888", "PM0594", "PM1527", "PM1013", "PM0006", "PM1180", "PM3175", "PM3177", "NM1772",
                 "PM0664", "NM0522", "NM1897", "PM1081", "NM2866", "PM1038", "PM3170", "PM3173")
#pyridine.like = row.names(chemi.res$r)[abs(chemi.res$r["PM0006", ]) > 0.7]
#labels = chemiprofile$MetabID[match(row.names(chemi.res$r)[abs(chemi.res$r["PM0006", ]) > 0.7], chemiprofile$Primary.ID)]
#write.table(data.frame(ID = pyridine.like, label = labels, cor = chemi.res$r["PM0006", abs(chemi.res$r["PM0006", ]) > 0.7]), file = "~/Documents/Bioinfo/localanalysis/pyridine-like.txt", col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "|")

chemi.interest = chemi.profile.filter[chem.special, 6:20]
chemi.all = chemi.profile.filter[, 6:20] #all chemicals
chemi.combined.small = rbind(chemi.interest, chem.species.profile.filter[, 6:20])
chemi.combined.all = rbind.data.frame(chemi.all, chem.species.profile.filter[, 6:20])
#cbind.data.frame is life, there also exists t.data.frame
chemi.combined.small.data = cbind.data.frame(t(chemi.combined.small), Location = as.character(chemi.environ$location[6:20]), Date=as.character(chemi.environ$date.start[6:20]), Season = chemi.environ$season[6:20])
chemi.combined.all.data = cbind.data.frame(t(chemi.combined.all), Location = as.character(chemi.environ$location[6:20]), Season = chemi.environ$season[6:20])
#chemi.combined.small.data$Locale[13] = "Davis-CA"

chemi.res = rcorr(t(chemi.combined.all), type="pearson") #all samples

chemi.res2 = rcorr(t(chemi.combined.small), type="pearson") 

#attempting to adjust p value, up to debate if this fits exploratory analysis
if(FALSE){
#adjusting only features 
temp.p = chemi.res2$P
p.adj.matrix = apply(temp.p, 1, function(x){ p.adjust(x, method="fdr")})


#adjusting all p values
temp.p = c(chemi.res2$P)
temp.p[is.na(temp.p)] = 0
temp.p.adj = p.adjust(temp.p, method = "fdr")
p.adj.matrix = matrix(temp.p.adj, nrow = ncol(chemi.res2$P[1:13,]), ncol = ncol(chemi.res2$P[1:13,]), dimnames = list(rownames = colnames(chemi.res2$P[1:13,]), colnames = colnames(chemi.res2$P[1:13,]))) 
}
#not adjusting
#p.adj.matrix = chemi.res2$P

#super slow warning
if (FALSE){
  
m = corrplot::corrplot(overallres$r,  p.mat = overallres$P, sig.level = 0.05, insig = "blank", tl.cex=0.6, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1))
}

m = corrplot::corrplot(chemi.res2$r, p.mat = p.adj.matrix, sig.level = 0.05, order = "hclust", insig = "blank", tl.cex=0.3, type="lower", method="square")
m = corrplot::corrplot(chemi.res2$r[1:15,], p.mat = p.adj.matrix[1:15,], sig.level = 0.05, insig = "blank", tl.cex=0.4, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1))

m = corrplot::corrplot(chemi.res2$r[1:17,], p.mat = p.adj.matrix[1:17,], sig.level = 0.05, insig = "blank", tl.cex=0.4, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1))

#this is a good selectoin procedure, anything correlating better than 0.6 with any of the chemical get selected
interest.df = chemi.res2$r[1:15, colSums(abs(chemi.res2$r[1:15,]) >= 0.6, na.rm = TRUE)> 0]
interest.df = chemi.res2$r[1:15, colSums(abs(chemi.res2$r[1:15,]) >= 0.7, na.rm = TRUE)> 0]
corrplot::corrplot(chemi.res2$r[colnames(interest.df),colnames(interest.df)], p.mat = p.adj.matrix[colnames(interest.df), colnames(interest.df)], sig.level = 0.05, insig = "blank",  tl.cex=0.5, method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1))

if (FALSE){ #for the additional seasonal chemicals
interest.df = chemi.res2$r[1:21, colSums(abs(chemi.res2$r[1:21,]) >= 0.6, na.rm = TRUE)> 0]
interest.df = chemi.res2$r[1:21, colSums(abs(chemi.res2$r[1:21,]) >= 0.7, na.rm = TRUE)> 0]
corrplot::corrplot(chemi.res2$r[colnames(interest.df),colnames(interest.df)], p.mat = p.adj.matrix[colnames(interest.df), colnames(interest.df)], sig.level = 0.05, insig = "blank",  tl.cex=0.5, method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1))
}

#checking if any chemicals are correlated with other chemicals/biologicals. Seems that there are TONS of chemical features that correlate with seasonal patterns in our dataset
colnames(chemi.res$r)[abs(chemi.res$r["PM3175", ])>=0.8|abs(chemi.res$r["PM3177", ])>=0.8]

#flower scent
#PM1038	Lilac aldehyde, (2R,2'R,5'R)-

#body scent
chemi.res$r[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")]
m = corrplot::corrplot(chemi.res$r[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")], p.mat = chemi.res$P[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")], sig.level = 0.05, insig = "blank", tl.cex=0.8, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,10,0.1,0.1))
m = corrplot::corrplot(chemi.res$r[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")], p.mat = chemi.res$P[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")], sig.level = 0.05, insig = "blank", tl.cex=0.8, type="upper", method = "number", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,10,0.1,0.1))
#mixed plot for values AND correlations.
m = corrplot::corrplot.mixed(chemi.res$r[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")], p.mat = chemi.res$P[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")], sig.level = 0.05, insig = "blank", tl.pos = "d",  tl.cex=0.8,lower = "number", upper = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(1,1,1,1))


# npg style
#+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))
#green, blue, red, dark blue

a = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM3177), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("PM3177"), limits=c(-2.5,2.5)) +
  theme_pubr(x.text.angle= 45) + rremove("xlab") + rremove("x.text") + scale_fill_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF"))

b = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM3175), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
   scale_y_continuous(name = paste0("PM3175"), limits=c(-2.5,2.5)) + scale_x_discrete(labels = chemi.combined.small.data$Location) +
    theme_pubr(x.text.angle= 45) + scale_fill_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF"))

b.2= ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM3175), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
   scale_y_continuous(name = paste0("M3175"), limits=c(-2,2)) +
    theme_pubr(x.text.angle= 45) + rremove("xlab") + rremove("x.text") + scale_fill_npg()

ggarrange(a, b, nrow =2, common.legend=TRUE, heights = c(1,1.7) )


c = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM1180), fill=Location )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("Geosmin"), limits=c(-2.5,2.5)) +
  theme_pubr(base_size = 14, x.text.angle= 45) + rremove("xlab") + rremove("x.text") + scale_fill_npg()

d = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM1888), fill=Location )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("Omethoate"), limits=c(-2.5,2.5)) +
  theme_pubr(base_size = 14,x.text.angle= 45)  + rremove("xlab") + rremove("x.text") + scale_fill_npg()

e =  ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM1013), fill=Location )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("Phthalate"), limits=c(-2.5,2.5)) +
  scale_x_discrete(labels = chemi.combined.small.data$Location) +
  theme_pubr(base_size = 14,x.text.angle= 45) + scale_fill_npg()

e.2 =  ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM1013), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("M1013"), limits=c(-3.5,3.5)) +
  theme_pubr(base_size = 14,x.text.angle= 45) + scale_fill_npg()

can = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Canis), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("Canis"), limits=c(-3.5,3.5)) +
  scale_x_discrete(labels = chemi.combined.small.data$Locale) +
  theme_pubr(x.text.angle= 45) + scale_fill_npg()


ggarrange(c,can, nrow =2, common.legend=TRUE, heights = c(1,1.7))

ggarrange(c, d, e, nrow =3, common.legend=TRUE, heights = c(1,1,1.7))

f = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM1527), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("DEET"), limits=c(-2.5,2.5)) +
  theme_pubr(base_size = 16, x.text.angle= 45) + rremove("xlab") + rremove("x.text") + scale_fill_npg()
  
g = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM0594), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("DEG"), limits=c(-2.5,2.5)) +
  theme_pubr(base_size = 16, x.text.angle= 45) + rremove("xlab") + rremove("x.text") + scale_fill_npg()

g.2 = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM0594), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("DEG"), limits=c(-2.5,2.5)) +
  scale_x_discrete(labels = chemi.combined.small.data$Locale) +
  theme_pubr(base_size = 16,x.text.angle= 45) + scale_fill_npg()

ggarrange(f,g.2, nrow =2, common.legend=TRUE, heights = c(1,1.7))
ggarrange(f,g,e.2, nrow =3, common.legend=TRUE, heights = c(1,1,2))


i = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Rhizopogon), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("Rhizopogon"), limits=c(-2.5,2.5)) +
  theme_pubr(x.text.angle= 45) + scale_fill_npg()

ggarrange(a, i, nrow =2, common.legend=TRUE, heights = c(1,1.7))


k = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Gelatoporia), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("Gelatoporia"), limits=c(-2.0,2.0)) +
    scale_x_discrete(labels = chemi.combined.small.data$Locale) +
  theme_pubr(x.text.angle= 45) + scale_fill_npg()
ggarrange(a, b.2, k, nrow =3, common.legend=TRUE, heights = c(1,1,2))

k = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Spirosoma), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("Spirosoma"), limits=c(-2.0,2.0)) +
    scale_x_discrete(labels = chemi.combined.small.data$Locale) +
  theme_pubr(x.text.angle= 45) + scale_fill_npg()
ggarrange(a, b.2, k, nrow =3, common.legend=TRUE, heights = c(1,1,2))


k.2 = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Castanea), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("Castanea"), limits=c(-2.0,2.0)) +
    scale_x_discrete(labels = chemi.combined.small.data$Locale) +
  theme_pubr(x.text.angle= 45) + rremove("xlab") + rremove("x.text")   +scale_fill_npg()

k.2 = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Pedobacter), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("Pedobacter"), limits=c(-2.0,2.0)) +
    scale_x_discrete(labels = chemi.combined.small.data$Locale) +
  theme_pubr(x.text.angle= 45) + rremove("xlab") + rremove("x.text")   +scale_fill_npg()
#seasonal chemicals and abiotics
ggarrange(a, b.2, k.2, k, nrow =4, common.legend=TRUE, heights = c(1,1,1,1.7))

l = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Dyadobacter), fill=Season )) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0("Dyadobacter"), limits=c(-2.0,2.0)) +
  scale_x_discrete(labels = chemi.combined.small.data$Locale) +
  theme_pubr(x.text.angle= 45) + scale_fill_npg()
ggarrange(a, b.2, l, nrow =3, common.legend=TRUE, heights = c(1,1,2))


#stacking plots together, stored in vector, labels are chemical names, biological names just stay the same
corplot.stacker = function(vector, labels = c(), chemical = "PM0006", cheminame = "Pyridine",  limit=3, base.size=16){
  plot.list=list()
for (i in 1:length(vector))
  local({
    item=vector[i]
    if(length(labels)){
      name = labels[i]
    } else { name = item }
    p1 = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(get(item)), fill=Location) ,environment = environment()) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0(name), limits=c(-limit,limit)) +
  theme_pubr(x.text.angle= 45) + 
    theme_pubr(base_size = base.size) + 
  rremove("xlab") + rremove("x.text") +
  scale_fill_npg()

plot.list[[item]] <<- p1 #super assignment, override scope
print(plot.list[[item]])
})

b.plot = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(get(chemical)), fill=Location) ,environment = environment()) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0(cheminame), limits=c(-limit,limit)) +
     scale_x_discrete(labels = chemi.combined.small.data$Location) +
  theme_pubr(base_size = base.size, x.text.angle= 45) + 
  rremove("xlab") +
  scale_fill_npg()  
plot.list[[cheminame]] = b.plot
  
  
  return(ggarrange(plotlist = plot.list, nrow = length(plot.list), common.legend = TRUE, heights = c(rep(1, length(plot.list)-1),1.7)))
}

plots.stack = corplot.stacker(colnames(chemi.res2$r)[abs(chemi.res2$r["PM0006", ])>0.7][-1], base.size = 16)

plots.stack.chems = corplot.stacker(vector = colnames(chemi.res2$r)[abs(chemi.res2$r["PM1888", ])>0.7][-1], 
                                    labels = c("Phthalate", "Geosmin", "Caprylic acid"), chemical = "PM1888", cheminame = "Omethoate")

plots.stack.show = corplot.stacker(vector = c("PM0594", "NM0522", "PM1527"), labels = c("DEG", "Caproic acid", "DEET"),
                                   chemical = "NM1772", cheminame = "Triclosan")

plots.stack.test = corplot.stacker(vector = c("PM3170", "PM3173", "PM3175"), labels =  c("PM3170", "PM3173", "PM3175"),
                                   chemical = "PM3177", cheminame = "PM3177")


plots.stack2 = corplot.stacker(c("Peniophora", "Sanghuangporus"), chemical = "PM0594", cheminame="DEG")

plots.stack2 = corplot.stacker(c("Trametes", "Peniophora", "Sanghuangporus"), chemical = "PM0664", cheminame="Caproic acid")                                  

#1 vs 1
corplot = function(a, b, cheminame = a, cheminame2 = b, limit=3){
a.plot = ggplot(chemi.combined.all, aes(x=Date, y=scale(get(a)), fill=Location) ,environment = environment()) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0(cheminame), limits=c(-limit,limit)) +
   scale_x_discrete(labels = chemi.combined.small.data$Location) +
  theme_pubr(margin = FALSE, base_size = 14, x.text.angle= 45) + 
  rremove("xlab") + rremove("x.text") +
  scale_fill_npg()

b.plot = ggplot(chemi.combined.all, aes(x=Date, y=scale(get(b)), fill=Location) ,environment = environment()) + geom_bar(stat="identity") + 
   geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(name = paste0(cheminame2), limits=c(-limit,limit)) +
     scale_x_discrete(labels = chemi.combined.small.data$Location) +
  theme_pubr(margin = FALSE, base_size = 14, x.text.angle= 45) +
  scale_fill_npg() + rremove("xlab")

stack.plot = ggarrange(a.plot, b.plot, nrow =2, common.legend=TRUE, heights = c(1,1.7))
return(stack.plot)
}

corplot("PM0006", "Gymnopus")
corplot("PM0594", cheminame = "DEG", "Peniophora", limit=2)
corplot("PM1038", cheminame = "Lilaic_acid", "Sanghuangporus", limit=3)

corplot("PM1180", cheminame = "Geosmin", b = "NM1897", cheminame2 = "Caprylic acid", limit=3)
corplot("PM0594", cheminame = "DEG", b = "PM0664", cheminame2 = "Caproic acid", limit=3)

ggplot(data.frame(chemi.combined.small.data), aes(x=as.numeric(PM0594), y=as.numeric(PM1527))) + geom_point() + geom_smooth(method = "lm") +    
  stat_poly_eq(formula = y~x,
                #eq.with.lhs = "italic(y)~`=`~",
                aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
                parse = TRUE) + theme_pubr(base_size = 14, margin = FALSE)

ggplot(data.frame(t(chemi.combined.small.data)), aes(x=as.numeric(M1888), y=as.numeric(M1180))) + geom_point() + geom_smooth(method = "lm") +    
  stat_poly_eq(formula = y~x,
                #eq.with.lhs = "italic(y)~`=`~",
                aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
                parse = TRUE)

```
##chemi features direct comparisons
```{r chemi significant features between "campus" and others}
chemi.combined.all.data$group = ifelse(chemi.combined.all.data$Location %in% c("Campus","SF-CA"), "Campus", "Non-campus")
chemi.melt = melt(chemi.combined.all.data, id.vars = c("Location", "Season", "group"))
chemi.location.results = compare_means(formula = value ~ group, data = chemi.melt, group.by = "variable", p.adjust.method = "fdr")

chemi.season.results = compare_means(formula = value ~ Season, data = chemi.melt, group.by = "variable", p.adjust.method = "fdr")
```


##Correlation extraction from large matrix
```{r correlation extraction from large matrix}
#a function to extract correlation pairs that have values higher than a certain threshold
corextract = function(rcorr, threshold = 0.8, alpha = 0.05){
  #rcorr$r  #correlation matrix
  #rcorr$P  #p value matrix
  nameskeep = c() #store all names that satisfy the criteria
  for (x in rownames(rcorr$r)){
    for (y in rownames(rcorr$r)){
      #print(rcorr$r[x,y])
      #print(rcorr$P[x,y])
      if (abs(rcorr$r[x, y]) >= threshold & rcorr$P[x, y] <= alpha & !is.na(rcorr$P[x, y])) {
        nameskeep = c(nameskeep, x, y)
      }
    }
  }
  nameskeep = unique(nameskeep)
  return(list(r = rcorr$r[nameskeep, nameskeep], P = rcorr$P[nameskeep, nameskeep]))
}
chemi.select = corextract(chemi.res)
chemi.select.2 = corextract(chemi.res2)

kk =  corrplot::corrplot(chemi.select.2$r, p.mat = chemi.select.2$P, sig.level = 0.05, insig = "blank", tl.cex=0.3, type="lower", method="color")  
nn = corrplot::corrplot(chemi.select$r, p.mat = chemi.select$P, sig.level = 0.05, insig = "blank", tl.cex=0.3, type="lower", method="color")  

```

#World map plot
```{r worldmap plot}
library(ggmap)
#setting up world map of different detail level - watercolor
#mymap.big = get_stamenmap(bbox = c(-175, -50, 160, 71), zoom = 5, maptype= "watercolor")
#qmap.big = ggmap(mymap.big)

mymap.small = get_stamenmap(bbox = c(-175, -50, 160, 71), zoom = 4, maptype= "watercolor")
qmap.watercolor = ggmap(mymap.small)

#setting up world map of different detail level -
mymap.small.lite = get_stamenmap(bbox = c(-175, -50, 160, 71), zoom = 3, maptype= "toner-lite")
qmap.lite = ggmap(mymap.small.lite)

mymap.small.toner = get_stamenmap(bbox = c(-175, -50, 160, 71), zoom = 3, maptype= "toner")
qmap.toner = ggmap(mymap.small.toner)

#setting up world map of terrain
mymap.small.terrain = get_stamenmap(bbox = c(-175, -50, 160, 71), zoom = 3, maptype= "terrain")
qmap.terrain = ggmap(mymap.small.terrain)


#setting up NA map
geocode("USA")
na.map.terrain = get_stamenmap(bbox = c(-130, 23, -65, 53), maptype= "terrain", zoom = 4, force =TRUE)
namap.terrain = ggmap(na.map.terrain)

na.map.lite = get_stamenmap(bbox = c(-130, 23, -65, 53), maptype= "toner-lite", zoom = 4, force =TRUE)
namap.lite = ggmap(na.map.lite)

na.map.wc = get_stamenmap(bbox = c(-130, 23, -65, 53), maptype= "watercolor", zoom = 6, force =TRUE)
namap.wc = ggmap(na.map.wc)
#setting up bayarea map
stan = geocode("Stanford University")
stan = as.numeric(stan)
bayarea.map.terrain = get_stamenmap(bbox = c(stan[1]-0.4, stan[2]-0.3, stan[1]+0.4, stan[2]+0.5),  maptype= "terrain")
bmap.terrain = ggmap(bayarea.map.terrain)

bayarea.map.lite = get_stamenmap(bbox = c(stan[1]-0.4, stan[2]-0.3, stan[1]+0.4, stan[2]+0.5),  maptype= "toner-lite", force =TRUE)
bmap.lite = ggmap(bayarea.map.lite)

bayarea.map.wc = get_stamenmap(bbox = c(stan[1]-0.4, stan[2]-0.3, stan[1]+0.4, stan[2]+0.5),  maptype= "watercolor", force =TRUE, zoom=12)
bmap.wc = ggmap(bayarea.map.wc)
#ggsave("zoom4worldmap.pdf", qmap.small, width= 16, height = 12) 

p1.place = read.csv("~/Documents/Bioinfo/places/backup/patient1places.csv", header =TRUE, stringsAsFactors = FALSE)
p1.place = as.tibble(p1.place[, 1:7])
p3.place = read.csv("~/Documents/Bioinfo/places/backup/P3places.csv", header = TRUE, stringsAsFactors = FALSE)
p3.place = as.tibble(p3.place[, 1:7])
mike.place = read.csv("~/Documents/Bioinfo/places/backup/Mikeplaces.csv", header = TRUE, stringsAsFactors = FALSE)
mike.place = as.tibble(mike.place[, 1:7])

#environ.mike
#environ.no3
#environ.p1
#fitering place data according to filter data
datecompare = function(environ, place){
  select.list = c()
  #environ = environ.mike
  #place = mike.place
  for (i in 1:dim(environ)[1]){
    select = (as.Date(place$Date, "%m/%d/%y") >= as.Date(environ[i,]$date.start)) & (as.Date(place$Date, "%m/%d/%y") <= as.Date(environ[i,]$date.end))
  select.list = c(select.list, row.names(place)[select])
  #print(row.names(place)[select])
  }
  select.list = as.numeric(unique(select.list))
  #print(select.list)
  return( place[select.list, ])
}
mike.place.filtered = datecompare(environ.mike, mike.place)
p1.place.filtered = datecompare(environ.p1, p1.place)
p3.place.filtered = datecompare(environ.no3, p3.place)

# "#8491B4FF" grey, "#00A087FF" greenish.
#  scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))
#green, blue, red, dark blue
#shape 19 are dots, shape 1 are circles


#old testing chunk
if(FALSE){
qmap.small + geom_point(data = mike.place.filtered, aes(x=Longitude, y=Latitude), alpha = 0.1, color = "#00A087FF", shape = 1, size=3) + 
  geom_point(data = p1.place.filtered, aes(x=Longitude, y=Latitude), alpha = 0.1, color = "#E64B35FF", shape = 1, size=3) +
  geom_point(data = p3.place.filtered, aes(x=Longitude, y=Latitude), alpha = 0.1, color = "#3C5488FF", shape = 1, size=3)  + labs(x="Longitude", y="Latitude")
}
#ggsave("worldmap_watercolor.pdf", width=12, height=8)

#setting up regional maps for USA and Europe

#mapobj is a time-consuming step, should be done beforehand
plotmap = function(mapobj, point.shape = 19, point.size = 1.5, point.alpha=0.1, file.name = "map_output.pdf"){
  mapobj + geom_point(data = mike.place.filtered, aes(x=Longitude, y=Latitude), alpha = point.alpha, color = "#00A087FF", shape = point.shape, size=point.size) + 
  geom_point(data = p1.place.filtered, aes(x=Longitude, y=Latitude), alpha = point.alpha, color = "#E64B35FF", shape = point.shape, size=point.size) +
  geom_point(data = p3.place.filtered, aes(x=Longitude, y=Latitude), alpha = point.alpha, color = "#3C5488FF", shape = point.shape, size=point.size)  + labs(x="Longitude", y="Latitude")
  ggsave(paste0("~/Documents/Bioinfo/DNAformal/", file.name), width=8, height=6)
}

plotmap(qmap.lite, point.alpha = 0.1, point.shape = 19, file.name = "world.lite.pdf")

plotmap(qmap.terrain, point.shape = 1, point.alpha = 0.4, file.name = "world.terrain.pdf")

plotmap(bmap.terrain, point.alpha = 0.4, file.name = "bayarea.terrain.pdf", point.shape = 1)

plotmap(bmap.lite, point.alpha = 0.4, file.name = "bayarea.lite.pdf", point.shape = 19)

plotmap(bmap.wc, point.alpha = 0.4, file.name = "bayarea.wc.pdf", point.shape=19)

plotmap(namap.lite, point.alpha = 0.4, file.name = "na.lite.pdf", point.shape =19)

plotmap(namap.terrain, point.alpha = 0.4, file.name = "na.terrain.pdf", point.shape =1)

```
##MEM plotting and analyses
```{r MEM variables plotting and investigation}
#please load basic map from last chunck of code "world map plot"
# expo.xy was defined during sptail analysis
#MEM.select was defined in spatial variable construction
plot.MEM.select = MEM.select
colnames(plot.MEM.select) = c("MEM1", "MEM69", "MEM83", "MEM89", "MEM91", "MEM94")

plotmap.MEM = function(mapobj, point.shape = 21, point.alpha=0.4, file.name = "map_output.pdf"){
  for (item in colnames(plot.MEM.select)){
    item = item
    #geom_beeswarm could scatter multiples points on the same spot
  object704 = mapobj + geom_point(data = cbind.data.frame(expo.xy, plot.MEM.select), aes_string(x="longitude", y="latitude", fill=item), alpha = point.alpha, shape = point.shape, size=4) + 
    scale_fill_gradient2(low = muted("red"), mid = "white",
  high = muted("blue"), midpoint = 0, space = "Lab",
  na.value = "grey50", guide = "colourbar")+
   labs(x="Longitude", y="Latitude")
  
  print(object704)
  }
  #ggsave(paste0("~/Documents/Bioinfo/DNAformal/", file.name), width=8, height=6)
  return(object704)
}

object704 = plotmap.MEM(qmap.watercolor)

MEM.res = rcorr(plot.MEM.select, t(master.asinh.cpm.single), type="pearson")

#takes output from rcorr function, adjusting p value matrix accroding to fdr principles
matrixPadjust = function(res, select = c(1:nrow(res$P))){
  print(select)
  temp.p = c(res$P)
  temp.p[is.na(temp.p)] = 0
  temp.p.adj = p.adjust(temp.p, method = "fdr")
  p.adj.matrix = matrix(temp.p.adj, nrow = ncol(res$P), ncol=ncol(res$P), dimnames = list(rownames = colnames(res$P), colnames = colnames(res$P)))
  return(p.adj.matrix)
}
adjusted.res2.p = matrixPadjust(MEM.res)
#pls package also has a function called corrplot
corrplot::corrplot(MEM.res$r, order = "hclust", p.mat = adjusted.res2.p, sig.level = 0.01, insig = "blank", diag = FALSE, tl.cex=0.5, addrect = 12, method="color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black")


for (i in 1:6){
print(MEM.res$r[i,][abs(MEM.res$r[i,]) > 0.4])
print(adjusted.res2.p[i,][abs(MEM.res$r[i,]) > 0.4])
}

#coding difference values into geo regions
plot(plot.MEM.select[, "MEM94"], master.cpm.single["Ophiognomonia", ])

MEM94.cate <- cut(plot.MEM.select[, "MEM94"], 
                       breaks = c(-Inf, 0, 0.04, 0.06, Inf), 
                       labels = c("Campus", "Non-US",  "US" , "P2-location"), 
                       right = FALSE)
plot(MEM94.cate, master.asinh.cpm.single["Ophiognomonia", ])

table(MEM94.cate)
#check categories assignment
ggplot(cbind.data.frame(expo.xy, cate=MEM94.cate), aes(x = longitude, y=latitude, color = cate)) + geom_beeswarm(alpha=0.4) + theme_pubr() + scale_color_nejm()

ggplot(cbind.data.frame(category = MEM94.cate, Ophiognomonia=master.asinh.cpm.single["Ophiognomonia", ]), aes(x=category, y=Ophiognomonia, fill= category)) + geom_boxplot() + theme_pubr(base_size = 16) + scale_color_nejm()


plotmap.MEM94 = function(mapobj, point.shape = 21, point.alpha=0.4, file.name = "map_output.pdf"){
    #geom_beeswarm could scatter multiples points on the same spot
  object704 = mapobj + geom_beeswarm(data = cbind.data.frame(expo.xy, Ophiognomonia = master.asinh.cpm.single["Ophiognomonia", ], MEM94 = plot.MEM.select[, "MEM94"]), aes_string(x="longitude", y="latitude", fill="Ophiognomonia"), alpha = point.alpha, shape = point.shape, size=4) + 
    scale_fill_gradient2(low = muted("red"), mid = "white",
  high = muted("blue"), midpoint = 0, space = "Lab",
  na.value = "grey50", guide = "colourbar")+
   labs(x="Longitude", y="Latitude")
  
  print(object704)
  #ggsave(paste0("~/Documents/Bioinfo/DNAformal/", file.name), width=8, height=6)
  return(object704)
}

#object269 = plotmap.MEM94(bmap.wc)
#MEM83
#positive results from pearson correlations
MEM83.relevant = c("Rubritalea", "Anaerobacillus", "Flagellimonas", "Psychrilyobacter", "Ophiognomonia")
apply(master.asinh.cpm.single[MEM83.relevant,], 1, function(x){sum(x>0)})
for (item in MEM83.relevant[-5]){
plot(plot.MEM.select[, "MEM83"], master.asinh.cpm.single[item, ], ylab=item)
}
```

#Occasional exposure
```{r Occasional exposure}
#occasional exposure requires case by case validation, as such, one should be very careful when using species level results
species.list = c("Streptococcus pneumoniae", "Klebsiella pneumoniae", "Bacillus anthracis", "Dolosigranulum pigrum", "Acinetobacter baumannii", "Staphylococcus aureus", "Haemophilus influenzae", "Haemophilus parainfluenzae", "Peptoclostridium difficile", "Enterobacter cloacae",  "Clostridium perfringens","Gemella haemolysans")
#https://www.moldbacteria.com/mold-types.html
molds.list = c("Penicillium capsulatum", "Cladosporium sphaerospermum","Aureobasidium pullulans","Aspergillus fumigatus", "Aspergillus niger", "Stachybotrys chartarum", "Alternaria alternata",  "Fusarium fujikuroi", "Trichoderma asperellum")

claim.list = c("Phytophthora lateralis", "Metaseiulus occidentalis", "Pediculus humanus", "Dermatophagoides farinae","Tetranychus urticae",  "Rhagoletis zephyria", "Sphyracephala brevicornis", "Condylostylus patibulatus","Aedes albopictus", "Apis mellifera", "Blattella germanica",  "Adineta vaga")
#"Acyrthosiphon pisum"
#"Streptococcus suis",
#name checking for plot
#row.names(master.cpm.single)[grepl("Pseudomonas", row.names(master.cpm.single), ignore.case=TRUE)]
row.names(master.cpm.single)[grepl("aspergillus", row.names(master.cpm.single), ignore.case=TRUE)]

if(FALSE){
check_outlier <- function(v, coef=1.5){
  quantiles <- quantile(v,probs=c(0.25,0.75))
  IQR <- quantiles[2]-quantiles[1]
  res <- v < (quantiles[1]-coef*IQR)|v > (quantiles[2]+coef*IQR)
  return(res)
}
}
#for occaions exposures, use this function
check_outlier = function(v, coef = 0.1){
  threshold = max(v) * coef
  res = (v >= threshold)
  return(res)}


generalexpo = function(v = c(), profile, base.size=14){
  #profile = scores_single_complete
  #v = species.list
  .e = environment()
  expo.master.cpm = master.cpm.single[, profile$samplenames]
  expo.combined = cbind.data.frame(profile, t(expo.master.cpm))
  plot.list = list()
  for (item in v)
   local({
        item = item
        label = rep("", dim(profile)[1])
label[check_outlier(expo.combined[,item])] = as.character(profile$location[check_outlier(expo.combined[,item])])
        p = ggplot(expo.combined, aes(x = date.mid, y = get(item),  label=label, fill=aownership, size = get(item)))
    plot = p + geom_point(aes(color = season, fill=aownership),  alpha=0.6, shape = 21) + geom_text_repel(show.legend = F)+
      theme_pubr(base_size = base.size, margin = FALSE) + 
      scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) + 
      scale_fill_d3(palette = "category20") + 
      scale_size(range=c(4,7)) +  #adjusting the default range to make dots and labels more visible
      theme(axis.text.x = element_text(size = 12)) +
      #scale_shape_manual(values= c(21,22,23,24)) +
      labs(x = "", y = item, fill = "Individual") + guides(size = FALSE)
    #print(plot)
    plot.list[[item]] <<- plot #super assignment, override scope
    print(plot.list[[item]])
})
  return(plot.list)
}
  
generalexpo.list = generalexpo(species.list, scores_single_complete, base.size = 16)
stack = ggarrange(plotlist =  generalexpo.list, ncol = 3, nrow=4, legend="none", align = "v")

#for molds
molds.figures = generalexpo(molds.list, scores_single_complete, base.size =16)
stack.molds = ggarrange(plotlist = molds.figures, ncol = 3, nrow=3, legend="none", align = "v")

#for claims on animals
arthro.figures = generalexpo(claim.list, scores_single_complete, base.size=16)
stack.arthro = ggarrange(plotlist = arthro.figures, ncol = 3, nrow=4, legend="none", align = "v")

honey_bee_samples = c(colnames(master.asinh.cpm.single)[master.asinh.cpm.single["Apis", ] > 0])
scores_single_complete$season[which(scores_single_complete$samplenames %in% honey_bee_samples)]

```


#popgene
```{r popgene}
virus.stats = read.table(file="~/Documents/Bioinfo/popgene/topviruses/topviruses.analysis", sep = "\t", stringsAsFactors = FALSE, fill=TRUE, header=TRUE)
virus.stats = cbind.data.frame(virus.stats, name=virus.stats.temp$V2)
virus.stats = virus.stats[order(virus.stats$name),]
virus.stats.filter = virus.stats[virus.stats$Effective_length/virus.stats$Total_length >= 0.4,]
virus.stats.filter = virus.stats.filter[order(virus.stats.filter$SNP),]
plot(x=virus.stats.filter$SNP,y=virus.stats.filter$Raw_coverage)
plot(x=virus.stats.filter$SNP, y=virus.stats.filter$PI)

#overview plots
ggplot(virus.stats.filter, aes(x = SNP, y = PI, color = name)) + geom_point(size = 4, alpha=0.6) + geom_smooth(aes(group=1),method="lm", se=FALSE)  + 
  stat_poly_eq(formula = y~x,
                #eq.with.lhs = "italic(y)~`=`~",
                aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~"), group=1), 
                parse = TRUE) +
  scale_color_d3("category20") +
  theme_pubr(base_size = 14, legend = "right", border=TRUE, margin =FALSE) 


ggplot(virus.stats.filter, aes(x = SNP, y = Raw_coverage, color = name)) + geom_point(size = 4, alpha=0.6)  + 
  scale_color_d3("category20") + ylab("Coverage_depth")+
  theme_pubr(base_size = 14, legend = "right", border=TRUE, margin = FALSE)


ggplot(virus.stats.filter, aes(x = SNP, y = Effective_length/Total_length, color = name)) + geom_point(size = 4, alpha=0.6)   + 
  scale_color_d3("category20") + ylab("Coverage_width") +
  theme_pubr(base_size = 14, legend = "right", border=TRUE, margin = FALSE)

snp = ggplot(virus.stats.filter, aes(x = name, y = SNP, color = name)) + geom_point(size = 4, alpha=0.6)   + 
  scale_color_d3("category20") + labs(y="SNP/Kbp", x="Species") +
  theme_pubr(base_size = 10, legend = "none", border=TRUE, margin = FALSE, x.text.angle = 45)

pi  = ggplot(virus.stats.filter, aes(x = name, y = PI, color = name)) + geom_point(size = 4, alpha=0.6)   + 
  scale_color_d3("category20") + labs(y="PI", x="Species")  +
  theme_pubr(base_size = 10, legend = "none", border=TRUE, margin = FALSE, x.text.angle = 60) + rremove("xlab") + rremove("x.text") 

#stacked
ggarrange(plotlist = list(pi, snp), nrow=2, heights= c(1,1.8))

#getting prevalence information for each species, now virus
prevalence = c()
for (item in virus.stats.filter$name){
  prevalence = c(prevalence, assign(item,sum(master.cpm.single[item, ]>0)))
}
              
```
##popgeneoverall
```{r overall population genetics}
virus.stats = read.table(file="~/Documents/Popgenetics/topviruses.final.analysis", sep = "|", stringsAsFactors = FALSE, header=TRUE)
fungi.stats =read.table(file="~/Documents/Popgenetics/topfungi.final.analysis", sep = "|", stringsAsFactors = FALSE, header=TRUE)
bacteria.stats = read.table(file="~/Documents/Popgenetics/topbacteria.final.analysis", sep = "|", stringsAsFactors = FALSE, header=TRUE)
misc.stats = read.table(file="~/Documents/Popgenetics/topmisc.final.analysis", sep = "|", stringsAsFactors = FALSE, header=TRUE)
virus.stats = cbind(virus.stats, type = "Viruses")
fungi.stats = cbind(fungi.stats, type = "Fungi")
bacteria.stats = cbind(bacteria.stats, type = "Bacteria")
misc.stats = cbind(misc.stats, type = "Others")
virus.stats = virus.stats[order(virus.stats$COV),]
fungi.stats = fungi.stats[order(fungi.stats$COV),]
#bacteria.stats = bacteria.stats[order(bacteria.stats$Species, bacteria.stats$COV),]
bacteria.stats = bacteria.stats[order(bacteria.stats$COV),]
misc.stats = misc.stats[order(misc.stats$COV),]

all.stats = rbind.data.frame(misc.stats, fungi.stats, bacteria.stats, virus.stats)
all.stats$Species = factor(all.stats$Species, levels = c(all.stats$Species))
all.stats$type = factor(all.stats$type, levels = c("Bacteria", "Fungi", "Viruses", "Others"))

#> colnames(all.stats)
#[1] "Species" "SNV"     "PI"      "COV"     "ESIZE"   "SIZE"    "type"   
# c("Bacteria", "Fungi", "Viridiplantae", "Metazoa","Alveolata", "Archaea")
#taxoncolor = c("#F8766D", "#B79F00", "#00BA38", "#00BFC4", "#619CFF", "#F564E3")

point.size = 7
basesize = 34
#6 was fine
COV = ggplot(all.stats, aes(x = Species, y= log(COV, base = 10), fill= type)) + geom_point(shape = 21, alpha=0.8, size =point.size) + scale_x_discrete(all.stats$Species) + theme_pubr(base_size = basesize, x.text.angle = 60) + labs(fill = "Kingdom/Subkingdom", y = "Coverage") + theme(axis.text.x = element_text(size = 10)) + scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + geom_hline(yintercept = 2, linetype = 2, color = "#737373") + geom_hline(yintercept = 2.7, linetype = 2, color = "#737373") +rremove("xlab")

SNV = ggplot(all.stats, aes(x = Species, y= log(SNV, base = 10), fill= type, group=type)) + geom_point(shape = 21, alpha=0.8, size = point.size) + scale_x_discrete(all.stats$Species) + theme_pubr(base_size = basesize) + labs(fill = "Kingdom/Subkingdom", y = "SNPs/Kbp") + rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + scale_color_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + geom_hline(yintercept = 2, linetype = 2, color = "#737373")  + geom_hline(yintercept = 1, linetype = 2, color = "#737373") + 
  geom_smooth(data = all.stats[-c(1,2),], aes(color=type), method = "lm" ,show.legend = FALSE, se = FALSE) + 
  geom_smooth(aes(group=1), color = "#737373", alpha = 0.6, method = "lm", show.legend=FALSE, se=FALSE)

PI = ggplot(all.stats, aes(x = Species, y= log(PI, base = 10), fill= type)) + geom_point(shape = 21, alpha=0.8, size =point.size) + scale_x_discrete(all.stats$Species) + theme_pubr(base_size = basesize) + labs(fill = "Kingdom/Subkingdom", y = expression(pi)) +
  rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) +geom_smooth(aes(group=1), color = "#737373", alpha = 0.6,method = "lm", show.legend=FALSE, se=FALSE)

ESIZE = ggplot(all.stats, aes(x = Species, y= log(ESIZE, base = 10), fill= type)) + geom_point(shape = 21, alpha=0.8, size =point.size ) + scale_x_discrete(all.stats$Species) + theme_pubr(base_size = basesize) + labs(fill = "Kingdom/Subkingdom", y = "Effective Size") + rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + geom_hline(yintercept = 6, linetype = 2, color = "#737373") + geom_hline(yintercept = 3, linetype = 2, color = "#737373")

SIZE = ggplot(all.stats, aes(x = Species, y= log(SIZE, base = 10), fill= type)) + geom_point(shape = 21, alpha=0.8, size =point.size ) + scale_x_discrete(all.stats$Species) + theme_pubr(base_size = basesize) + labs(fill = "Kingdom/Subkingdom", y = "Reference Size") + rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + geom_hline(yintercept = 6, linetype = 2, color = "#737373") + geom_hline(yintercept = 3, linetype = 2, color = "#737373")

ggarrange(SNV, PI, SIZE, COV, nrow = 4, heights = c(1,1,1,2), common.legend = TRUE, align = "v")

#save as 16*40


#supplmentary plots
#SNV vs PI
ggplot(all.stats, aes(x=SNV, y=PI, color=type)) + geom_point() + geom_smooth(method = "lm") +    
  stat_poly_eq(formula = y~x,
                #eq.with.lhs = "italic(y)~`=`~",
                aes(label = paste(..rr.label.., sep = "~~~")), 
                parse = TRUE) + theme_pubr(base_size = 14, margin = FALSE) + stat_fit_glance(method = 'lm',
                  method.args = list(formula = y~x),
                  geom = 'text',
                  aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")),
                  label.x.npc = 'right', label.y.npc = 0.2, size = 3) + labs(x = "SNPs/kbp", y = expression(pi), color ="Domain")

#COV vs SNV all
ggplot(all.stats, aes(x=log(COV, base=10), y=log(SNV, base=10), color=type, group = 1)) + 
              geom_point() + 
              geom_smooth(aes(group=1), method = "lm") +    
    stat_poly_eq(formula = y~x,
                #eq.with.lhs = "italic(y)~`=`~",
                aes(label = paste(..rr.label.., sep = "~~~")), 
                parse = TRUE) + 
  theme_pubr(base_size = 14, margin = FALSE) + 
  labs(x = "log(Coverage)", y = "log(SNPs/kbp)", color ="Domain") + 
  stat_fit_glance(method = 'lm',
                  method.args = list(formula = y~x),
                  geom = 'text',
                  aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")),
                  label.x.npc = 'right', label.y.npc = 0.2, size = 3)


#COV vs SNV faceted
ggplot(all.stats, aes(x=log(COV, base=10), y=log(SNV, base=10), color=type)) + geom_point() + geom_smooth(method = "lm") +    
  stat_poly_eq(formula = y~x,
                #eq.with.lhs = "italic(y)~`=`~",
                aes(label = paste(..rr.label.., sep = "~~~")), 
                parse = TRUE) + 
  theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Coverage)", y = "log(SNPs/kbp)", color ="Domain") + 
  stat_fit_glance(method = 'lm',
                  method.args = list(formula = y~x),
                  geom = 'text',
                  aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")),
                  label.x.npc = 'right', label.y.npc = 0.2, size = 3) +
facet_wrap(~type, scales="free", ncol=2)

#ESIZE vs SNV
ggplot(all.stats, aes(x=log(ESIZE, base=10), y=log(SNV, base=10))) + geom_point(aes(color=type)) + geom_smooth(method = "lm") +    
  stat_poly_eq(formula = y~x,
                #eq.with.lhs = "italic(y)~`=`~",
                aes(label = paste(..rr.label.., sep = "~~~")), 
                parse = TRUE) + 
  theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Effective genome size)", y = "log(SNPs/kbp)", color ="Domain")+
  stat_fit_glance(method = 'lm',
                  method.args = list(formula = y~x),
                  geom = 'text',
                  aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")),
                  label.x.npc = 'left', label.y.npc = 0.9, size = 4) #+
#facet_wrap(~type, scales="free", ncol=2)

#SIZE vs SNV
ggplot(all.stats, aes(x=log(SIZE, base=10), y=log(SNV, base=10))) + geom_point(aes(color=type)) + geom_smooth(method = "lm") +    
  stat_poly_eq(formula = y~x,
                #eq.with.lhs = "italic(y)~`=`~",
                aes(label = paste(..rr.label.., sep = "~~~")), 
                parse = TRUE) + 
  theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Reference genome size)", y = "log(SNPs/kbp)", color ="Domain")+
  stat_fit_glance(method = 'lm',
                  method.args = list(formula = y~x),
                  geom = 'text',
                  aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")),
                  label.x.npc = 'left', label.y.npc = 0.9, size = 4) #+


#ESIZE vs PI
ggplot(all.stats, aes(x=log(ESIZE, base=10), y=log(PI))) + geom_point(aes(color=type)) + geom_smooth(method = "lm") +    
  stat_poly_eq(formula = y~x,
                #eq.with.lhs = "italic(y)~`=`~",
                aes(label = paste(..rr.label.., sep = "~~~")), 
                parse = TRUE) + 
  theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Effective genome size)", y = expression(log(pi)), color ="Domain")+
  stat_fit_glance(method = 'lm',
                  method.args = list(formula = y~x),
                  geom = 'text',
                  aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")),
                  label.x.npc = 'left', label.y.npc = 0.9, size = 4) #+
#ESIZE vs SIZE
ggplot(all.stats, aes(x=log(ESIZE, base=10), y=log(SIZE, base=10))) + geom_point(aes(color=type)) + geom_smooth(method = "lm") +    
  stat_poly_eq(formula = y~x,
                #eq.with.lhs = "italic(y)~`=`~",
                aes(label = paste(..rr.label.., sep = "~~~")), 
                parse = TRUE) + 
  theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Effective genome size)", y = "log(Reference genome size)", color ="Domain")+
  stat_fit_glance(method = 'lm',
                  method.args = list(formula = y~x),
                  geom = 'text',
                  aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")),
                  label.x.npc = 'left', label.y.npc = 0.85, size = 4) #+
#coverage vs effective size
ggplot(all.stats, aes(x=log(ESIZE, base=10), y=log(COV, base=10))) + geom_point(aes(color=type)) + geom_smooth(method = "lm") +    
  stat_poly_eq(formula = y~x,
                #eq.with.lhs = "italic(y)~`=`~",
                aes(label = paste(..rr.label.., sep = "~~~")), 
                parse = TRUE) + 
  theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Effective genome size)", y = "log(Coverage)", color ="Domain")+
  stat_fit_glance(method = 'lm',
                  method.args = list(formula = y~x),
                  geom = 'text',
                  aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")),
                  label.x.npc = 'left', label.y.npc = 0.85, size = 4) #+

#coverage vs reference size  looks like reference size correlates better than effective genome size
ggplot(all.stats, aes(x=log(SIZE, base=10), y=log(COV, base=10))) + geom_point(aes(color=type)) + geom_smooth(method = "lm") +    
  stat_poly_eq(formula = y~x,
                #eq.with.lhs = "italic(y)~`=`~",
                aes(label = paste(..rr.label.., sep = "~~~")), 
                parse = TRUE) + 
  theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Reference genome size)", y = "log(Coverage)", color ="Domain")+
  stat_fit_glance(method = 'lm',
                  method.args = list(formula = y~x),
                  geom = 'text',
                  aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")),
                  label.x.npc = 'left', label.y.npc = 0.85, size = 4) #+


#REGRESSION TEST
model1 = lm(log(SNV) ~ log(SIZE) + log(COV), data = all.stats)
model2 = lm(log(SNV) ~ log(COV) + log(SIZE), data = all.stats)

#bacteria-plasmids
bac.stats = all.stats[all.stats$type == "Bacteria", ]
bac.stats.special = bac.stats[grep("Klebsiella pneumoniae|Nostoc punctiforme|Salmonella enterica|Staphylococcus haemolyticus", bac.stats$Species),]
spec.order = c(7,1,8,10,12,15,11, 16,14,13, 3,4, 9, 2, 6,5)
bac.stats.special = bac.stats.special[spec.order,]
bac.stats.special$genus = c(rep("Klebsiella", 6), rep("Staphylococcus", 4), rep("Salmonella", 2), rep("Nostoc", 4))
bac.stats.special$form = c("genome", rep("plasmid", 5), "genome", rep("plasmid", 3), "genome", "plasmid", "genome", rep("plasmid", 3))
bac.stats.special$Species = factor(bac.stats.special$Species, levels=c(as.character(bac.stats.special$Species)))

point.size = 9
#6 was fine
COV = ggplot(bac.stats.special, aes(x = Species, y= log(COV, base = 10), fill= genus, shape = form)) + geom_point( alpha=0.8, size =point.size) + scale_x_discrete(all.stats$Species) + theme_pubr(base_size = 28, x.text.angle = 60) + labs(fill = "Strain", y = "Coverage") + theme(axis.text.x = element_text(size = 10)) + scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + scale_shape_manual(values=c(21,23))

SNV = ggplot(bac.stats.special, aes(x = Species, y= log(SNV, base = 10), fill= genus, shape = form)) + geom_point(alpha=0.8, size =point.size) + theme_pubr(base_size = 28) + labs(fill = "Strain", y = "SNP/Kbp") + rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3"))  + scale_shape_manual(values=c(21,23))#+ geom_hline(yintercept = 2, linetype = 2, color = "#737373")  + geom_hline(yintercept = 1, linetype = 2, color = "#737373") 
  
PI = ggplot(bac.stats.special, aes(x = Species, y= log(PI, base = 10), fill= genus, shape = form)) + geom_point( alpha=0.8, size =point.size) + theme_pubr(base_size = 28) + labs(fill = "Strain", y = expression(pi)) +
  rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + scale_shape_manual(values=c(21,23))

ESIZE = ggplot(bac.stats.special, aes(x = Species, y= log(ESIZE, base = 10), fill= genus, shape = form)) + geom_point(alpha=0.8, size =point.size )  + theme_pubr(base_size = 28) + labs(fill = "Kingdom/Subkingdom", y = "Effective Size") + rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + scale_shape_manual(values=c(21,23))

ggarrange(SNV, PI, ESIZE, COV, nrow = 4, heights = c(1,1,1,2), common.legend = TRUE, align = "v")

bac.fit = lm(SNV ~ COV + ESIZE, data = bac.stats.special)


```

#generating color strips annotation for iTOL tree
```{r color strips annotation for iTOL tree}
virus.host = read.csv("~/Documents/Bioinfo/analysis/For_Chao_Virus-Host copy.csv", header = TRUE, stringsAsFactors = FALSE)
viruses.id = read.table("~/Documents/Bioinfo/analysis/viruses.id.txt", col.names = "NAME", stringsAsFactors = FALSE)
manual.check = viruses.id$NAME[!viruses.id$NAME %in% virus.host$Name]
# c("Bacteria", "Fungi", "Viridiplantae", "Metazoa","Alveolata", "Archaea")
taxoncolor = c("#F8766D", "#B79F00", "#00BA38", "#00BFC4", "#619CFF", "#F564E3")
#Arthropods 0065C4 
#Helminth B5F000
#other eukaryotes 8E00EB
#Protoza EB00E4
#Homo sapiens EB272A
virus.host$color = virus.host$Kingdom
#getting custom colors
temp.colors = brewer.pal(n = 9, name = "Set1")
temp.colors[c(2,4,9,7,6,3)] = taxoncolor
virus.host$color = mapvalues(virus.host$color, from = sort(unique(virus.host$Kingdom)), to = temp.colors)

output.virus.df = data.frame(name=virus.host$Name, color=virus.host$color, color2=paste0("COL",virus.host$color))
#write.table(output.virus.df, "~/Documents/Bioinfo/analysis/virus.colors.txt", row.names = FALSE, quote = FALSE, sep = " ")
#checking colors and labels
ggplot(data.frame(a=1:9, b=1:9, color=temp.colors, label=sort(unique(virus.host$Kingdom))), aes(x=a, y=b, color=label, label=label)) + geom_point(size =8) + scale_color_manual(values=temp.colors)
```
##generating iTOL tree with relative abundance at species level
```{r all species tree}
#assuming data at species level is loaded
write.table(x=row.names(master.asinh.cpm.single)[rowSums(master.asinh.cpm.single)>0], file="~/Documents/Bioinfo/analysis/allspecies.txt", row.names=FALSE, quote = FALSE)
allspecies.tax = read.table("~/Documents/Bioinfo/analysis/allspecies.tax2.txt", sep="|", header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE)
for (i in 1:nrow(allspecies.tax)){ #replace all species names with preferred names so phytol can be used
  if (allspecies.tax$preferred.name[i] != ""){
    print(allspecies.tax$preferred.name[i])
    allspecies.tax$name[i] = allspecies.tax$preferred.name[i]
  }}
#write.table(allspecies.tax$name, "~/Documents/Bioinfo/analysis/allspecies.tax.fix.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)

#setting up asinh.cpm counts 
select = rownames(master.asinh.cpm.single)[rowSums(master.asinh.cpm.single)>0]
relative.counts = rowSums(master.asinh.cpm.single)[select[-1]]
#ureplace space with underscore to be consistent with phytol output.
newnames<-gsub(" ", "_", allspecies.tax$name, fixed=TRUE)
relative.counts.df= data.frame(name = newnames, abundance = relative.counts)
relative.counts.df$name[2492] = "Ostreococcus_'lucimarinus'"
#write.table(relative.counts.df, "~/Documents/Bioinfo/analysis/allspecies.relativecounts.txt", row.names = FALSE, quote = FALSE, sep = " ")

#total abundance plots for fun
ggplot(relative.counts.df, aes(x=abundance)) + geom_density() + labs(x="Normalized CPM") + theme_pubr() 
```
# Getting new interactions from database global interactions
```{r interactions extraction}
library(rglobi)
#assuming species name here, genus would work too
output.df = data.frame(source = character(), target = character(), interactiontype = character(), sourcetaxon= character(), targettaxon=character())
for (name in c("Homo sapiens", row.names(master.cpm.single))){
  #print(name)
  temp.result = get_interactions_by_taxa(sourcetaxon = name)
  output.df = rbind.data.frame(output.df, data.frame(source = temp.result$source_taxon_name, target = temp.result$target_taxon_name, interactiontype = temp.result$interaction_type, sourcetaxon=temp.result$source_taxon_path, targettaxon = temp.result$target_taxon_path))
  output.df = output.df[!duplicated(output.df),]
}

write.table(output.df, "~/Documents/bioinfo/localanalysis/interactions.all.csv", row.names = FALSE, col.names = TRUE, quote=FALSE, sep =",")
output.match.df = output.df[output.df$source %in% c("Homo sapiens", row.names(master.cpm.single)) & output.df$target %in% c("Homo sapiens", row.names(master.cpm.single)), ]
write.table(output.match.df, "~/Documents/bioinfo/localanalysis/interactions.match.csv", row.names = FALSE, col.names = TRUE, quote=FALSE, sep =",")
length(unique(output.df$source))
#some interactions are just called interactswith, which is not good enough
#limit to cases where source is like "cargo", while target is like "carrier" for integration
output.match.defined.df = output.match.df[output.match.df$interactiontype != "interactsWith" & output.match.df$interactiontype %in% c("hasHost", "parasiteOf", "pathogenOf", "symbiontOf", "pollinates"),]
write.table(output.match.defined.df, "~/Documents/bioinfo/localanalysis/interactions.match.defined.csv", row.names = FALSE, quote=FALSE, sep=",")
output.match.defined.df$sourceclass = output.match.defined.df$sourcetaxon
output.match.defined.df$targetclass = output.match.defined.df$targettaxon
taxonfinder = function(v){
  name.list = c()
  for (name in v){
      if (grepl("Fungi|Ascomycota|Basidiomycota|Stereum|Laetiporus|Phoma herbarum", name)){
      tempclass = "Fungi"
      } else if (grepl("Plantae", ignore.case = TRUE,name)){
      tempclass = "Higher Plants"
      } else if (grepl("Bacteria|Bacillus|Vibrio",ignore.case = TRUE, name)){
      tempclass = "Bacteria"
      } else if (grepl("Arthropoda",ignore.case = TRUE, name)){
      tempclass = "Arthropods"
      } else if (grepl("Nematoda",ignore.case = TRUE, name)){
      tempclass = "Helminth"
      } else if (grepl("Homo sapiens",ignore.case = TRUE,name)){
      tempclass = "Homo sapiens"
      } else if (grepl("Chordata|Mollusca", ignore.case = TRUE,name)){
        #print("TRUE")
      tempclass = "Animals"
      } else if (grepl("Protozoa",ignore.case = TRUE, name)){
      tempclass = "Protozoa"
      } else if (grepl("Oomycota|Alveolata|Leishmania|Schistosoma",ignore.case = TRUE, name)){
      tempclass = "Other eukaryotes"
      } else {
      tempclass = name}
      name.list = c(name.list, tempclass)
  }
  return(name.list)
}
sourceclass = taxonfinder(output.match.defined.df$sourceclass)
table(sourceclass)
targetclass = taxonfinder(output.match.defined.df$targetclass)
table(targetclass)
output.match.defined.df$sourceclass = sourceclass
output.match.defined.df$targetclass =targetclass

#reoder
output.final.df = output.match.defined.df[,c(1,6,2,7,3)]

#filter things that have the same target and source
output.final.df = output.final.df[!output.final.df$source == output.final.df$target,]
#fix werid  interations
output.final.df[output.final.df$source == "Pinus taeda", c(1,2,3,4)] = output.final.df[output.final.df$source == "Pinus taeda", c(3,4,1,2)]
output.final.df[output.final.df$source == "Pinus lambertiana", c(1,2,3,4)] = output.final.df[output.final.df$source == "Pinus lambertiana", c(3,4,1,2)]
output.final.df[output.final.df$source == "Pseudotsuga menziesii", c(1,2,3,4)] = output.final.df[output.final.df$source == "Pseudotsuga menziesii", c(3,4,1,2)]
output.match.df$interactiontype %in% c("hasHost", "parasiteOf", "pathogenOf", "symbiontOf", "pollinates"),]
output.final.df = output.final.df[!duplicated(output.final.df),]
#this is the maxmial number of edges(nodes) one could have from total datasets, so for other situations, one only need to filter through this one
write.csv(output.final.df, "~/Documents/bioinfo/localanalysis/interactions.edges.csv", row.names = FALSE, quote=FALSE)
#getting nodes
nodes.names = rbind.data.frame(data.frame(id = output.final.df$source, class = output.final.df$sourceclass), data.frame(id = output.final.df$target, class = output.final.df$targetclass))
nodes.final.names = nodes.names[!duplicated(nodes.names),]
write.csv(nodes.final.names, "~/Documents/bioinfo/localanalysis/interactions.nodes.csv", row.names = FALSE, quote=FALSE)

write.table(output.match.defined.df, "~/Documents/bioinfo/localanalysis/interactions.defined.csv", row.names = FALSE, col.names = TRUE, quote=FALSE, sep =",")


#integrate the cargo-carrier network
cc.df =  read.table("~/Documents/Bioinfo/species-location-interaction_data/SpeciesInteractions_EID2.txt", header = TRUE, stringsAsFactors = FALSE, sep="\t")
#this filters interactions that both counterparts can be FOUND in the list of names, which is all species
cc.match.df = cc.df[cc.df$Cargo %in% tolower(c("Homo sapiens", row.names(master.cpm.single))) & cc.df$Carrier %in% tolower(c("Homo sapiens", row.names(master.cpm.single))),]

#this filter remove edges that are already in the global interaction databases, turns out only a few interactions are left, which means the global interactions are a superior database
cc.final.df = cc.match.df[!((cc.match.df$Cargo %in% tolower(output.final.df$source) & cc.match.df$Carrier %in% tolower(output.final.df$target)) | (cc.match.df$Carrier %in% tolower(output.final.df$source) & cc.match.df$Cargo %in% tolower(output.final.df$target))),]
write.csv(cc.final.df[,c(1,2,3,4)], "~/Documents/Bioinfo/localanalysis/supplementary.edges.csv", quote=FALSE, row.names=FALSE)

#two results from two databases were integrated to generate the newinteractins.finaldb.csv file!!
```
## Getting nodes and edges file from any profile using new database.
```{r new exposome cloud}
#generated from previous chunck
new.db = read.csv("~/Documents/Bioinfo/localanalysis/newinteractions.finaldb.csv", header = TRUE,  
stringsAsFactors = FALSE)

#This funciton ASSUMES master.cpm.single is generated at SPECIES level!!!!, the database is only compatible with species level names
getcloud = function(profile, prefix="interactions"){
  #profile = scores_single_complete
  temp.df = master.cpm.single[, profile$samplenames]
  names = c("Homo sapiens", row.names(temp.df)[rowSums(temp.df)>10])
  filtered.df = new.db[new.db$source %in% names & new.db$target %in% names,]
  #write.csv(filtered.df[, c(1,2,3,4)],file=paste0("~/Documents/Bioinfo/localanalysis/", prefix, "-edges.csv"), quote=FALSE, row.names = FALSE)
  print(paste0(nrow(filtered.df), " edges"))
  #build nodes table
  names.df = rbind.data.frame(data.frame(id = filtered.df$source, label = filtered.df$source, class=filtered.df$sourceclass),
                              data.frame(id = filtered.df$target, label = filtered.df$target, class = filtered.df$targetclass))
  names.df = names.df[!duplicated(names.df),]
  #write.csv(names.df, file=paste0("~/Documents/Bioinfo/localanalysis/", prefix, "-nodes.csv"),  quote=FALSE, row.names=FALSE)
  print(paste0(nrow(names.df), " nodes"))
  
  #getting human and environment related names
  human_related.df = subset(filtered.df, sourceclass == "Homo sapiens" | targetclass == "Homo sapiens")
  human_animal_related.df = subset(filtered.df, sourceclass == "Homo sapiens" | targetclass == "Homo sapiens"| sourceclass == "Animals" | targetclass == "Animals") #lumping human and animal together
  human_related.names = unique(human_related.df$source)
  human_animal_related.names = unique(ifelse(human_animal_related.df$targetclass == "Homo sapiens" | human_animal_related.df$targetclass == "Animals", human_animal_related.df$source, human_animal_related$target))
  
  
  plant_related.df = subset(filtered.df, sourceclass == "Higher Plants" & targetclass != "Higher Plants" | targetclass == "Higher Plants" & sourceclass != "Higher Plants")
  #lumpin plants and arthropods
  plant_arthro_related.df = subset(filtered.df, sourceclass == "Higher Plants" & targetclass != "Higher Plants" | targetclass == "Higher Plants" & sourceclass != "Higher Plants" | targetclass == "Arthropods")
  
  plant_related.names = unique(ifelse(plant_related.df$targetclass == "Higher Plants", plant_related.df$source, plant_related.df$target))
  plant_arthro_related.names = unique(ifelse(plant_arthro_related.df$targetclass == "Higher Plants" | plant_arthro_related.df$targetclass == "Arthropods", plant_arthro_related.df$source, plant_related.df$target))
  
  #core versions of dedicated speciesd names
  human_only = human_related.names[!(human_related.names %in% plant_related.names)]
  plant_only = plant_related.names[!(plant_related.names %in% human_related.names)]
  human_intersect_plant = plant_related.names[plant_related.names %in% human_related.names]
  
  #expanded versions of dedicated species names
  human_animal_only = human_animal_related.names[!(human_animal_related.names %in% plant_arthro_related.names)]
  plant_arthro_only = plant_arthro_related.names[!(plant_arthro_related.names %in% human_animal_related.names)]
  human_animal_intersect_plant_arthro = human_animal_related.names[human_animal_related.names %in% plant_arthro_related.names]
  
  return(list(plant_related = plant_related.names, plant_arthro_related =plant_arthro_related.names, human_related = human_related.names, human_animal_related = human_animal_related.names, plant_only = plant_only, human_only = human_only, plant_arthro_only = plant_arthro_only, human_animal_only = human_animal_only, human_intersect_plant = human_intersect_plant, human_animal_intersect_plant_arthro = human_animal_intersect_plant_arthro))
}
#all samples
allsamples.names = getcloud(scores_single_complete, "allsamples")

#mike OR P1
P1.names = getcloud(Mike_single, "Mikeall")
#P2
P2.names = getcloud(Patient1, "P2")
#P3
P3.names = getcloud(Patient3, "P3")

#fourpeople
getcloud(fourpeople.select[fourpeople.select$ownership == "Mike",], "mike_fourpeople")
getcloud(fourpeople.select[fourpeople.select$ownership == "Gw",], "gw")
getcloud(fourpeople.select[fourpeople.select$ownership == "Guan_SF",], "sf")
getcloud(fourpeople.select[fourpeople.select$ownership == "No3_San Mateo",], "p3_fourpeople")


getname = function(profile){
  temp.df = master.cpm.single[, profile$samplenames]
  names = row.names(temp.df)[rowSums(temp.df)>10]
  print(head(rowSums(temp.df[names,]), 20))
  return(names)
}
mike.names = getname(fourpeople.select[fourpeople.select$ownership == "Mike",])
gw.names = getname(fourpeople.select[fourpeople.select$ownership == "Gw",])
guan.names = getname(fourpeople.select[fourpeople.select$ownership == "Guan_SF",])
no3.names = getname(fourpeople.select[fourpeople.select$ownership == "No3_San Mateo",])


#===================examing variances
#filtering master.asinh.cpm.single for features
master.asinh.cpm.single.forvar = master.asinh.cpm.single[apply(master.asinh.cpm.single, 1, function(x){sum(x>0) >= 50}), ]
master.asinh.cpm.season.forvar = cbind.data.frame(t(master.asinh.cpm.single.forvar[-1,]), season=scores_single_complete$season)

df.season.forvar.fin = t(master.asinh.cpm.single[apply(master.asinh.cpm.single, 1, function(x){sum(x>0) >= 50}), ])

df.season.forvar.fin = df.season.forvar.fin[, colSums(df.season.forvar.fin)>0]


human_var = apply(df.season.forvar.fin[, allsamples.names$human_only[allsamples.names$human_only %in% colnames(df.season.forvar.fin)]], 2, function(x){var(x[x>=0])})
#these fungi are not really human associated
human_var = human_var[!grepl("Aspergillus|Cladosporium|Penicillium", names(human_var))]
names(human_var)
l1 = length(human_var)

plant_var = apply(df.season.forvar.fin[, allsamples.names$plant_only[allsamples.names$plant_only %in% colnames(df.season.forvar.fin)]], 2, function(x){var(x[x>=0])})
l2 = length(plant_var)

human_intersect_plant_var = apply(df.season.forvar.fin[, allsamples.names$human_intersect_plant[allsamples.names$human_intersect_plant %in% colnames(df.season.forvar.fin)]], 2, function(x){var(x[x>=0])})
l3 = length(human_intersect_plant_var)

var.df = data.frame(variance = c(human_var,plant_var, human_intersect_plant_var ), group = c(rep("Human", l1), rep("Plant", l2), rep("Plant/Human", l3)))
comparisons = list(c("Human", "Plant"), c("Plant", "Plant/Human"), c("Human", "Plant/Human"))

var.df$grouop = factor(var.df$group, levels = c("Plant", "Plant/Human", "Human"))

ggplot(var.df, aes(x=group, y=log(variance), fill=group)) + geom_boxplot() + stat_compare_means(method = "wilcox.test", comparisons = comparisons) + theme_pubr(base_size = 16) + scale_fill_manual(values=c("Plant" = "#20854EFF", "Plant/Human" = "#E18727FF", "Human" = "#BC3C29FF")) + labs(x="")
#, label = "p.signif"
#0.0380 0.0285 0.2600
ggplot(var.df, aes(x=group, y=log(variance), fill=group)) + geom_boxplot() + stat_compare_means(method = "wilcox.test", comparisons = comparisons, label = "p.signif") + theme_pubr(base_size = 16) + scale_fill_manual(values=c("Plant" = "#20854EFF", "Plant/Human" = "#E18727FF", "Human" = "#BC3C29FF")) + labs(x="")


#===================examing variances for broad groups


human_animals_var = apply(df.season.forvar.fin[, allsamples.names$human_animal_only[allsamples.names$human_animal_only %in% colnames(df.season.forvar.fin)]], 2, function(x){var(x[x>=0], na.rm=TRUE)})
human_animals_var = human_animals_var[!grepl("Aspergillus|Cladosporium|Penicillium", names(human_animals_var))]
names(human_animals_var)
ll1 = length(human_animals_var)


plant_arthro_var = apply(df.season.forvar.fin[,allsamples.names$plant_arthro_only[allsamples.names$plant_arthro_only %in% colnames(df.season.forvar.fin)]], 2, function(x){var(x[x>=0], na.rm=TRUE)})
names(plant_arthro_var)
ll2 = length(plant_arthro_var)

human_animals_inter_plant_arthro_var = apply(df.season.forvar.fin[, allsamples.names$human_animal_intersect_plant_arthro[allsamples.names$human_animal_intersect_plant_arthro %in% colnames(df.season.forvar.fin)]], 2, function(x){var(x[x>=0], na.rm=TRUE)})
names(human_animals_inter_plant_arthro_var)
ll3 = length(human_animals_inter_plant_arthro_var)

var2.df = data.frame(variance = c(human_animals_var, plant_arthro_var, human_animals_inter_plant_arthro_var), group = c(rep("Human/Animals", length(human_animals_var)), rep("Plant/Arthropods", length(plant_arthro_var)), rep("Intersection", length(human_animals_inter_plant_arthro_var))))
var2.df$group = factor(var2.df$group, levels = c("Plant/Arthropods", "Intersection", "Human/Animals")) #order of box-plots
comparisons = list(c("Human/Animals", "Plant/Arthropods"), c("Plant/Arthropods", "Intersection"), c("Human/Animals", "Intersection"))

ggplot(var2.df, aes(x=group, y=log(variance), fill=group)) + geom_boxplot() + stat_compare_means(method = "wilcox.test", comparisons = comparisons) + theme_pubr(base_size = 16, x.text.angle = 30) + scale_fill_manual(values=c("#20854EFF", "#E18727FF", "#BC3C29FF")) + labs(x="") + guides(group=FALSE)
#0.0260 0.0219 0.3500
#, label = "p.signif"
ggplot(var2.df, aes(x=group, y=log(variance), fill=group)) + geom_boxplot()  + stat_compare_means(label.y = 0) + theme_pubr(base_size = 20, x.text.angle = 30) + scale_fill_manual(values=c("#20854EFF", "#E18727FF", "#BC3C29FF")) + labs(x="") + guides(group=FALSE) + rremove(object = "x.text")

p.adjust(c(0.0073, 0.013, 0.35), method = "fdr")
#ggplot(var2.df, aes(x=group, y=log(variance), fill=group)) + geom_violin() + geom_boxplot(width = 0.3,  fill = "white") + theme_pubr(base_size = 20, x.text.angle = 30) + scale_fill_manual(values=c("#20854EFF", "#E18727FF", "#BC3C29FF")) + labs(x="") + guides(group=FALSE)

```
#contig identit alignemnt etc. plots
```{r contigs}
bacteria.contigs = read.csv("~/Documents/Expo_paper/bacteria.identity.result.csv", header = TRUE)
animal.contigs = read.csv("~/Documents/Expo_paper/animals.result.csv", header = TRUE)
molds.contigs = read.csv("~/Documents/Expo_paper/molds.result.csv", header = TRUE)

#bacteria
#identity
p = ggplot(bacteria.contigs, aes(x = strain, y=identity, color=strain, alpha = alignment)) + geom_boxplot() +  theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 90, linetype = 2, color = "#737373") + geom_hline(yintercept = 95, linetype = 2, color = "#737373") + labs(y = "Percentage Identity") 
#bit score
q = ggplot(bacteria.contigs, aes(x = strain, y=log(bitscore, base =10), color=strain, alpha = alignment)) + geom_boxplot() +  theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 2, linetype = 2, color = "#737373") + labs(y = "log(bitscore, base=10)")
#alignment
r = ggplot(animal.contigs, aes(x = strain, y=alignment, color=strain, alpha = alignment)) + geom_boxplot() +  theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 0.5, linetype = 2, color = "#737373") + labs(y = "Alignment percentage")


#animals
#identity
#align x axis to exposure plot
animal.contigs$strain = factor(animal.contigs$strain, levels=c(gsub(" ", "_", claim.list, fixed=TRUE)))

p = ggplot(animal.contigs, aes(x = strain, y=identity, color=strain, alpha = alignment)) + geom_boxplot() +  theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 90, linetype = 2, color = "#737373")+ labs(y = "Percentage Identity") + rremove("xlab")
#bit score
q = ggplot(animal.contigs, aes(x = strain, y=log(bitscore, base =10), color=strain, alpha = alignment)) + geom_boxplot() +  theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 2, linetype = 2, color = "#737373") + labs(y = "log(bitscore, base=10)")+ rremove("xlab")
#alignment
r = ggplot(animal.contigs, aes(x = strain, y=alignment, color=strain, alpha = alignment)) + geom_boxplot() +  theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 0.5, linetype = 2, color = "#737373") + labs(y = "Alignment percentage")+ rremove("xlab")


#molds
p = ggplot(molds.contigs, aes(x = strain, y=identity, color=strain, alpha = alignment)) + geom_boxplot() +  theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 90, linetype = 2, color = "#737373")+ labs(y = "Percentage Identity") + rremove("xlab")
#bit score
q = ggplot(molds.contigs, aes(x = strain, y=log(bitscore, base =10), color=strain, alpha = alignment)) + geom_boxplot() +  theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 2, linetype = 2, color = "#737373") + labs(y = "log(bitscore, base=10)")+ rremove("xlab")
#alignment
r = ggplot(molds.contigs, aes(x = strain, y=alignment, color=strain, alpha = alignment)) + geom_boxplot() +  theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 0.5, linetype = 2, color = "#737373") + labs(y = "Alignment percentage")+ rremove("xlab")


```
# word cloud output
```{r wordcloud}
library(scales)
master.cpm.single =master.cpm.single[order(rowSums(master.cpm.single), decreasing=TRUE),]
values = log(head(rowSums(master.cpm.single),100))
values = round(scales::rescale(values, to = c(2,12)))
write.table(data.frame(value = values, name = head(row.names(master.cpm.single), 100)), "~/Documents/Bioinfo/names.table.txt", quote=FALSE, row.names = FALSE)

```

#area plot testing
```{r areaplot}
#create data
set.seed(3)
 
#time steps
t.step<-seq(0,20)
 
#group names
grps<-letters[1:10]
 
#random data for group values across time
grp.dat<-runif(length(t.step)*length(grps),5,15)
 
#create data frame for use with plot
grp.dat<-matrix(grp.dat,nrow=length(t.step),ncol=length(grps))
grp.dat<-data.frame(grp.dat,row.names=t.step)
names(grp.dat)<-grps

p.dat<-data.frame(step=row.names(grp.dat),grp.dat,stringsAsFactors=F)
p.dat<-melt(p.dat,id='step')
p.dat$step<-as.numeric(p.dat$step)

p<-ggplot(p.dat,aes(x=step,y=value))

p1<-p + geom_area(aes(fill=variable)) + theme(legend.position="bottom")

p2<-p + geom_area(aes(fill=variable),position='fill')
```
# graph-based approach test
```{r graph-based method}
library("igraph")
library("phyloseq")
library("phyloseqGraphTest")
library("ggnetwork")
library("intergraph")
ps = readRDS("~/Documents/statscourse/data/ps1.rds")
net = make_network(ps, max.dist=0.35)
sampledata = data.frame(sample_data(ps))
V(net)$id = sampledata[names(V(net)), "host_subject_id"]
V(net)$litter = sampledata[names(V(net)), "family_relationship"]

ggplot(net, aes(x=x, y=y, xend=xend, yend=yend), layout = "fruchtermanreingold") +
  geom_edges(color = "darkgrey") +
  geom_nodes(aes(color=id, shape=litter)) +
  theme_pubr(legend = "right") +
  theme(axis.text = element_blank(), axis.title = element_blank(), legend.key.height =unit(0.6, "line")) +
  guides(col = guide_legend(override.aes = list(size =0.25))) 


#MST and Jaccard
gt = graph_perm_test(ps, sampletype = "family_relationship", grouping = "host_subject_id", distance = "jaccard", type = "mst", nperm = 200)
gt$pval
plot_test_network(gt) #+ theme(legend.text = element_text(size=8),
                              #legend.title = element_text(size=9))
plot_permutations(gt)

#MST and bray
gt = graph_perm_test(ps, sampletype = "family_relationship", grouping = "host_subject_id", distance = "bray", type = "mst", nperm = 200)
gt$pval
plot_test_network(gt) #+ theme(legend.text = element_text(size=8),
                              #legend.title = element_text(size=9))
plot_permutations(gt)

#NNJ and bray
gt = graph_perm_test(ps, sampletype = "family_relationship", grouping = "host_subject_id", distance = "bray", type = "knn", knn = 2, nperm = 200)
gt$pval
plot_test_network(gt) #+ theme(legend.text = element_text(size=8),
                              #legend.title = element_text(size=9))
plot_permutations(gt)
```
##graph-based application on exposome data
```{r graph-based application on exposome data}
library("igraph")
library("phyloseq")
library("phyloseqGraphTest")
library("ggnetwork")
library("intergraph")
#set up otu matrix
otumat = master.asinh.cpm.single
#random taxon matrix for now. will update to real one later
taxmat = matrix(sample(letters, 7*nrow(otumat), replace = TRUE), nrow = nrow(otumat), ncol = 7)
rownames(taxmat) <- rownames(otumat)
colnames(taxmat) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
taxmat
#set up sampledata

scores
sampledata = scores_single_complete
sampledata$country[is.na(sampledata$country)] = "USA"

row.names(sampledata) = scores_single_complete$samplenames
sampledata = sample_data(sampledata)

ps.expo = phyloseq(otu_table(otumat, taxa_are_rows =TRUE), tax_table(taxmat), sampledata)

if(FALSE){
net = make_network(ps.expo, max.dist=0.7)
sampledata = data.frame(sample_data(ps.expo))
V(net)$id = sampledata[names(V(net)), "aownership"]
V(net)$geo = sampledata[names(V(net)), "geo2"]

ggplot(net, aes(x=x, y=y, xend=xend, yend=yend), layout = "fruchtermanreingold") +
  geom_edges(color = "darkgrey") +
  geom_nodes(aes(color=id, shape=geo)) +
  theme_pubr(legend = "right") +
  theme(axis.text = element_blank(), axis.title = element_blank(), legend.key.height =unit(0.6, "line")) +
  guides(col = guide_legend(override.aes = list(size =0.25))) 
}


#owner based
gt1 = graph_perm_test(ps.expo, "aownership", distance = "jaccard", type = "mst", nperm = 500)
gt1$pval
plot_test_network(gt1)
plot_permutations(gt1)


gt2 = graph_perm_test(ps.expo, "aownership", distance = "bray", type = "mst" , nperm = 500)
gt2$pval
plot_test_network(gt2)
plot_permutations(gt2)

gt3 = graph_perm_test(ps.expo, "aownership", distance = "bray", type = "knn", nperm = 500)
gt3$pval
plot_test_network(gt3)
plot_permutations(gt3)

gt3 = graph_perm_test(ps.expo, "aownership", distance = "bray", type = "knn", nperm = 500, knn = 2)
gt3$pval
plot_test_network(gt3)
plot_permutations(gt3)

#season based
gt1 = graph_perm_test(ps.expo, "season", distance = "jaccard", type = "mst", nperm = 500)
gt1$pval
plot_test_network(gt1)
plot_permutations(gt1)


gt2 = graph_perm_test(ps.expo, "season", distance = "bray", type = "mst" , nperm = 9999)
gt2$pval
plot_test_network(gt2) + theme(legend.text=element_text(size=14)) + scale_color_manual(values=c("#E64B35FF","#00A087FF","#4DBBD5FF","#3C5488FF")) 
plot_permutations(gt2) + theme_pubr(base_size = 16)

gt3 = graph_perm_test(ps.expo, "season", distance = "bray", type = "knn", nperm = 500)
gt3$pval
plot_test_network(gt3)
plot_permutations(gt3)

gt3 = graph_perm_test(ps.expo, "season", distance = "bray", type = "knn", nperm = 500, knn = 2)
gt3$pval
plot_test_network(gt3)
plot_permutations(gt3)

#location based
gt4 = graph_perm_test(ps.expo, "geo2", distance = "jaccard", type = "mst", nperm = 500)
gt4$pval
plot_test_network(gt4)
plot_permutations(gt4)

gt4 = graph_perm_test(ps.expo, "geo2", distance = "bray", type = "mst" , nperm = 500)
gt4$pval
plot_test_network(gt4)
plot_permutations(gt4)

gt5 = graph_perm_test(ps.expo, "geo2", distance = "bray", type = "knn", nperm = 500)
gt5$pval
plot_test_network(gt5)
plot_permutations(gt5)

gt5 = graph_perm_test(ps.expo, "geo2", distance = "bray", type = "knn", nperm = 500, knn = 2)
gt5$pval
plot_test_network(gt5)
plot_permutations(gt5)

#location based geo3

gt5 = graph_perm_test(ps.expo, "geo3", distance = "bray", type = "mst", nperm = 500)
gt5$pval
plot_test_network(gt5)
plot_permutations(gt5)

gt5 = graph_perm_test(ps.expo, "geo3", distance = "bray", type = "knn", nperm = 500, knn = 2)
gt5$pval
plot_test_network(gt5)
plot_permutations(gt5)

#locaiton based country
gt5 = graph_perm_test(ps.expo, "country", distance = "bray", type = "mst", nperm = 500)
gt5$pval
plot_test_network(gt5)
plot_permutations(gt5)

gt5 = graph_perm_test(ps.expo, "country", distance = "bray", type = "knn", nperm = 500, knn = 2)
gt5$pval
plot_test_network(gt5)
plot_permutations(gt5)

gt5 = graph_perm_test(ps.expo, "country", distance = "bray", type = "knn", nperm = 500, knn = 1)
gt5$pval
plot_test_network(gt5)
plot_permutations(gt5)


graph_comp = function(profile, nperm=500, basesize = 20){
  profile = scores_single_complete
  otumat = master.asinh.cpm.single[, c(profile$samplenames)]
  #random taxon matrix for now. will update to real one later
  taxmat = matrix(sample(letters, 7*nrow(otumat), replace = TRUE), nrow = nrow(otumat), ncol = 7)
  rownames(taxmat) <- rownames(otumat)
  colnames(taxmat) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
  taxmat
  #set up sampledata

  sampledata = profile
  sampledata$country[is.na(sampledata$country)] = "USA"

  row.names(sampledata) = profile$samplenames
  sampledata = sample_data(sampledata)

  ps.expo = phyloseq(otu_table(otumat, taxa_are_rows =TRUE), tax_table(taxmat), sampledata)
  
  #owner based
  gt = graph_perm_test(ps.expo, "aownership", distance = "jaccard", type = "mst", nperm = nperm)
  print(gt$pval)
  owner_ja_mst = plot_test_network(gt)  + theme(legend.text=element_text(size=basesize))
  print(owner_ja_mst)
  owner_ja_mst_permu = plot_permutations(gt) + ggtitle(paste0("p value = ", signif( gt$pval, digits = 4))) + theme_pubr(base_size = basesize)
  print(owner_ja_mst_permu)
  
  gt2 = graph_perm_test(ps.expo, "aownership",  distance = "bray", type = "mst" , nperm = nperm)
  print(gt2$pval)
  owner_bray_mst = plot_test_network(gt2)  + theme(legend.text=element_text(size=basesize))
  print(owner_bray_mst)
  owner_bray_mst_permu = plot_permutations(gt2) + ggtitle(paste0("p value = ", signif( gt2$pval, digits = 4)))+ theme_pubr(base_size = basesize)
  print(owner_bray_mst_permu)
  
  gt3 = graph_perm_test(ps.expo, "aownership", distance = "bray", type = "knn", nperm = nperm)
  print(gt3$pval)
  owner_bray_knn = plot_test_network(gt3)  + theme(legend.text=element_text(size=basesize))
  print(owner_bray_knn)
  owner_bray_knn_permu = plot_permutations(gt3) + ggtitle(paste0("p value = ", signif( gt3$pval, digits = 4)))+ theme_pubr(base_size = basesize)
  print(owner_bray_knn_permu)

  #season based
  gt1 = graph_perm_test(ps.expo, "season", distance = "jaccard", type = "mst", nperm = nperm)
  print(gt1$pval)
  season_jac_mst = plot_test_network(gt1) + theme(legend.text=element_text(size=basesize)) + scale_color_manual(values=c("#E64B35FF","#00A087FF","#4DBBD5FF","#3C5488FF")) 
  print(season_jac_mst)
  season_jac_mst_permu = plot_permutations(gt1) + ggtitle(paste0("p value = ", signif( gt1$pval, digits = 4)))+ theme_pubr(base_size = basesize)
  print(season_jac_mst_permu)

  gt2 = graph_perm_test(ps.expo, "season", distance = "bray", type = "mst" , nperm = nperm)
  print(gt2$pval)
  season_bray_mst = plot_test_network(gt2) + theme(legend.text=element_text(size=basesize)) + scale_color_manual(values=c("#E64B35FF","#00A087FF","#4DBBD5FF","#3C5488FF")) 
  print(season_bray_mst)
  season_bray_mst_permu = plot_permutations(gt2)+ ggtitle(paste0("p value = ", signif( gt2$pval, digits = 4)))+ theme_pubr(base_size = basesize)
  print(season_bray_mst_permu)

  gt3 = graph_perm_test(ps.expo, "season", distance = "bray", type = "knn", nperm = nperm)
  print(gt3$pval)
  season_bray_knn = plot_test_network(gt3) + theme(legend.text=element_text(size=basesize)) + scale_color_manual(values=c("#E64B35FF","#00A087FF","#4DBBD5FF","#3C5488FF")) 
  print(season_bray_knn)
  season_bray_knn_permu = plot_permutations(gt3)+ ggtitle(paste0("p value = ", signif( gt3$pval, digits = 4)))+ theme_pubr(base_size = basesize)
  print(season_bray_knn_permu)

  #location based
  gt1 = graph_perm_test(ps.expo, "geo2", distance = "jaccard", type = "mst", nperm = nperm)
  print(gt1$pval)
  location_jac_mst = plot_test_network(gt1)  + theme(legend.text=element_text(size=basesize))
  print(location_jac_mst)
  location_jac_mst_permu = plot_permutations(gt1)+ ggtitle(paste0("p value = ", signif( gt1$pval, digits = 4)))+ theme_pubr(base_size = basesize)
  print(location_jac_mst_permu)
  
  gt2 = graph_perm_test(ps.expo, "geo2", distance = "bray", type = "mst" , nperm = nperm)
  print(gt2$pval)
  location_bray_mst = plot_test_network(gt2)  + theme(legend.text=element_text(size=basesize))
  print(location_bray_mst)
  location_bray_mst_permu = plot_permutations(gt2)+ ggtitle(paste0("p value = ", signif( gt2$pval, digits = 4)))+ theme_pubr(base_size = basesize)
  print(location_bray_mst_permu)
  
  gt3 = graph_perm_test(ps.expo, "geo2", distance = "bray", type = "knn", nperm = nperm)
  print(gt3$pval)
  location_bray_knn = plot_test_network(gt3)  + theme(legend.text=element_text(size=basesize))
  print(location_bray_knn)
  location_bray_knn_permu = plot_permutations(gt3) + ggtitle(paste0("p value = ", signif( gt3$pval, digits = 4))) + theme_pubr(base_size = basesize)
  print(location_bray_knn_permu)
  #location based geo3 inland and coastal

  gt1 = graph_perm_test(ps.expo, "geo3", distance = "bray", type = "mst", nperm = nperm)
  print(gt1$pval)
  coastal_bray_mst = plot_test_network(gt1)  + theme(legend.text=element_text(size=basesize))
  print(coastal_bray_mst)
  coastal_bray_mst_permu = plot_permutations(gt1) + ggtitle(paste0("p value = ", signif( gt1$pval, digits = 4)))+ theme_pubr(base_size = basesize)
  print(coastal_bray_mst_permu)
  
  gt2 = graph_perm_test(ps.expo, "geo3", distance = "bray", type = "knn", nperm = nperm, knn = 1)
  print(gt2$pval)
  coastal_bray_knn = plot_test_network(gt2)  + theme(legend.text=element_text(size=basesize))
  print(coastal_bray_knn)
  coastal_bray_knn_permu = plot_permutations(gt2) + ggtitle(paste0("p value = ", signif( gt2$pval, digits = 4)))+ theme_pubr(base_size = basesize)
  print(coastal_bray_knn_permu)
  
  #locaiton based country
  gt1 = graph_perm_test(ps.expo, "country", distance = "bray", type = "mst", nperm = nperm)
  print(gt1$pval)
  country_bray_mst = plot_test_network(gt1)  + theme(legend.text=element_text(size=basesize))
  print(country_bray_mst)
  country_bray_mst_permu = plot_permutations(gt1) + ggtitle(paste0("p value = ", signif( gt1$pval, digits = 4)))+ theme(legend.text=element_text(size=basesize))
  print(country_bray_mst_permu)
    
  gt2 = graph_perm_test(ps.expo, "country", distance = "bray", type = "knn", nperm = nperm, knn = 1)
  print(gt2$pval)
  country_bray_knn = plot_test_network(gt2)  + theme(legend.text=element_text(size=basesize))
  print(country_bray_knn)
  country_bray_knn_permu = plot_permutations(gt2) + ggtitle(paste0("p value = ", signif( gt2$pval, digits = 4))) + theme(legend.text=element_text(size=basesize))
  print(country_bray_knn_permu)
  
  #batch
   gt1 = graph_perm_test(ps.expo, "batch", distance = "bray", type = "mst", nperm = nperm)
  print(gt1$pval)
  batch_bray_mst = plot_test_network(gt1)  + theme(legend.text=element_text(size=basesize))
  print(batch_bray_mst)
  batch_bray_mst_permu = plot_permutations(gt1) + ggtitle(paste0("p value = ", signif( gt1$pval, digits = 4)))+ theme_pubr(base_size = basesize)
  print(batch_bray_mst_permu)
    
  gt2 = graph_perm_test(ps.expo, "batch", distance = "bray", type = "knn", nperm = nperm, knn = 1)
  print(gt2$pval)
  batch_bray_knn = plot_test_network(gt2)  + theme(legend.text=element_text(size=basesize))
  print(batch_bray_knn)
  batch_bray_knn_permu = plot_permutations(gt2) + ggtitle(paste0("p value = ", signif( gt2$pval, digits = 4)))+ theme_pubr(base_size = basesize)
  print(batch_bray_knn_permu)
  
  return(list = list(owner_ja_mst,
                     owner_ja_mst_permu, 
                     owner_bray_mst,
                     owner_bray_mst_permu,
                     owner_bray_knn,
                     owner_bray_knn_permu,
                     season_jac_mst,
                     season_jac_mst_permu,
                     season_bray_mst,
                     season_bray_mst_permu,
                     season_bray_knn,
                     season_bray_knn_permu,
                     location_jac_mst,
                     location_jac_mst_permu,
                     location_bray_mst,
                     location_bray_mst_permu,
                     location_bray_knn,
                     location_bray_knn_permu,
                     coastal_bray_mst,
                     coastal_bray_mst_permu,
                     coastal_bray_knn,
                     coastal_bray_knn_permu,
                     country_bray_mst,
                     country_bray_mst_permu,
                     country_bray_knn,
                     country_bray_knn_permu,
                     batch_bray_mst,
                     batch_bray_mst_permu,
                     batch_bray_knn,
                     batch_bray_knn_permu))

}

#all samples
graph.output2 = graph_comp(scores_single_complete, nperm = 9999)

#using only P1
graph.output3 = graph_comp(scores_single_complete[scores_single_complete$aownership %in% c("P1"),], nperm=9999)

#using only P1 P2 and P3 now
profile = scores_single_complete[scores_single_complete$aownership %in% c("P1", "P2", "P3"),]
graph.output = graph_comp(profile, nperm = 9999)

graph.output.four = graph_comp(fourpeople.select, nperm = 9999)

#using only P1 at westcoast
profile.p1.westcoast = subset(scores_single_complete, aownership == "P1" & geo2 == "Westcoast")

graph.output.p1.westcoast = graph_comp(profile.p1.westcoast, nperm = 9999)

#using only P1 at campus
profile.p1.campus = subset(scores_single_complete, aownership == "P1" & location %in% c("Campus-home", "Campus", "Hiking_weekend","Hiking _weekend","Mike_office-background", "Mike_home-background", "Mike_background" ))

graph.output.p1.campus = graph_comp(profile.p1.campus, nperm = 9999)

```
#data submission form
```{r data submission dataframe}
# mandatory attributes sample_name, bioproject_accession, organism, altitude, collection_date, env_biome (air filter), env_feature (air filter), 
# env_material (air), geo_loc_name, lat_lon

#For DNA samples, including all controls
formalname.df = data.frame(formalname, oldname, Filter_No=filter_list)
formalname.df.single = formalname.df[!grepl("_2|_3", formalname.df$formalname),]
formalname.df.single = formalname.df.single[!formalname.df.single$formalname=="",]

environ.data = join(environ, formalname.df.single, by = "Filter_No")
submission.data = environ.data[,c("formalname", "oldname", "elevation.m.", "date.end", "city", "state", "country", "longitude", "latitude")]
submission.data$bioproject_accession = rep("PRJNA421162", 312)
submission.data$organism = c(rep("multiple", 283), rep("NA", 29))
submission.data$env_biome = c(rep("air filter", 283), rep("NA", 29))
submission.data$env_feature = c(rep("air filter", 283), rep("NA", 29))
submission.data$env_material = c(rep("air", 283), rep("NA", 29))
#for location
submission.data$geo_loc_name = paste0(submission.data$country,"_", submission.data$state, "_", submission.data$city)
#for lat_lon
submission.data$lat_lon = paste0(submission.data$latitude, " ", submission.data$longitude)
colnames(submission.data)

#
write_csv(submission.data[, c(1, 10, 11, 3, 4, 12, 13, 14, 15, 16, 2)], "~/Documents/Expo_paper/data_submission/DNAsamples.csv")

write_csv(submission.data[, c(1, 10, 11, 3, 4, 12, 13, 14, 15, 16, 2)], "~/Documents/Expo_paper/data_submission/RNAsamples.csv")
```
#Spatial analysis basic
```{r spatial analysis}
library(ape)
tr100 = seq(1:100)
tr100.do = dist(tr100)
tr100.doI = 1/tr100.do
tr100.d = dist(tr100)
thresh = 1
#truncation to threshold 1
tr100.d[tr100.d > thresh] = 4 * thresh
#PCoA on truncated matrix
tr100.PCoA = cmdscale(tr100.d, eig=TRUE, k = length(tr100)-1)
nb.ev = length(which(tr100.PCoA$eig > 0.0000001))
tr100.PCNM = tr100.PCoA$points[, 1:nb.ev] # tr100.PCoA$points, each column is a PCoA component; each row is a sample
#plot some PCNM variable smodeeling positive spatial correlation
par(mfrow = c(4,2))
somePCNM = c(1:50)
for (i in somePCNM){
  plot(tr100.PCNM[,i], main = paste0(i, " PCNM"))
}


#testing Moran's I
somePCNM = 1:67
observed = c()
expected = c()
pvalue = c()
for (i in 1:length(somePCNM)){
  plot(tr100.PCNM[, somePCNM[i]], type="l", ylab = c("PCNM", somePCNM[i]))
  print(paste0("factor ",  i))
  I = Moran.I(tr100.PCNM[, somePCNM[i]], as.matrix(tr100.doI))
  observed = c(observed, I$observed)
  expected = c(expected, I$expected)
  pvalue = c(pvalue, I$p.value)
}
plot(observed)
plot(expected)
plot(p.adjust(pvalue, method = "fdr"))
plot(p.adjust(pvalue, method = "fdr")-0.05)
#based the weight matrix I used, there are only 34 PCNM vairables with a Moran's I larger than E(I)
sum(observed - expected > 0 & p.adjust(pvalue, method = "fdr")<0.05) # only 27 of them have an adjusted p value less than 0.05. So these are the selected PCNM variables. They model positive spatial correlation

# two dimensional sampling: equispaced grid, this is typical for actual distances on map
# The truncation distance is et to 1. could also be chosen to be the diagonal distance within a small square of 4 points which is sqrt(2)

xygrid2 = expand.grid(1:20, 1:20) #creates a data matrix with 400 points to describe a "plane", all equally spaced to each other
xygrid2.d = dist(xygrid2)
thresh = 1
#trucation to threshold
xygrid2.d[xygrid2.d > thresh] = 4 * thresh
#PCoA on truncated matrix
xygrid2.pcoa = cmdscale(xygrid2.d, eig=TRUE, k=nrow(xygrid2)-1) #k is the number of componet retained default to 2
#count how many PCNM variables are positive
nb.ev2 = sum(xygrid2.pcoa$eig > 0.0000001) #the number of PCNM variables that are bigger than 0

xygrid2.PCNM = xygrid2.pcoa$points[, 1:nb.ev2]#matrix of PCNM variables (principal component equivalent), columns are PCNM variables, rows are samples (Sites)

#calculating Moran.I for each PCNM varaibles against distance weight matrix (using inverse of distance matrix). There are other ways of getting distance weight
#testing Moran's I
somePCNM = 1:nb.ev2
observed = c()
expected = c()
pvalue = c()
for (i in 1:length(somePCNM)){
  s.value(xygrid2, xygrid2.PCNM[, somePCNM[i]], method = "greylevel", csize=0.35, sub=somePCNM[i], csub=2)
  print(paste0("factor ",  i))
  I = Moran.I(xygrid2.PCNM[, somePCNM[i]], as.matrix(1/xygrid2.d))
  observed = c(observed, I$observed)
  expected = c(expected, I$expected)
  pvalue = c(pvalue, I$p.value)
}

sum(observed - expected > 0 & p.adjust(pvalue, method = "fdr")<0.05) #Moran's I larger than E(I) and also adjusted.p <0.05,  171 PCNM variables 

#using vegan's function and more sophisticated approach to find the distance threhsold
xygrid2.pcnm.vegan = pcnm(dist(xygrid2)) #$weights is the distance weights matrix $threshold is the distance threshold
# $values are the eigen values $vectors are the PCNM variables. 
#see above for manual caclutating of PCNM variables. 
xygrid2.pcnm.pos = xygrid2.pcnm.vegan$vectors

somePCNM = 1:ncol(xygrid2.pcnm.pos)
observed = c()
expected = c()
pvalue = c()
for (i in 1:length(somePCNM)){
  s.value(xygrid2, xygrid2.pcnm.pos[, somePCNM[i]], method = "greylevel", csize=0.35, sub=somePCNM[i], csub=2)
  print(paste0("factor ",  i))
  I = Moran.I(xygrid2.pcnm.pos[, somePCNM[i]], as.matrix(1/xygrid2.d))
  observed = c(observed, I$observed)
  expected = c(expected, I$expected)
  pvalue = c(pvalue, I$p.value)
}

sum(observed - expected > 0 & p.adjust(pvalue, method = "fdr")<0.05) #Moran's I larger than E(I) and also adjusted.p <0.05,   #171 positively correlated PCNM variables, and significant

#how to use moran.I function
ozone <- read.table("https://stats.idre.ucla.edu/stat/r/faq/ozone.csv", sep=",", header=T)
head(ozone, n=10)
ozone.dists <- as.matrix(dist(cbind(ozone$Lon, ozone$Lat)))

ozone.dists.inv <- 1/ozone.dists
diag(ozone.dists.inv) <- 0
 
ozone.dists.inv[1:5, 1:5]

tempI = Moran.I(ozone$Av8top, ozone.dists.inv)
```


##Spatial analysis tutorial PCNM approach
```{r spatial tutorial}
library(adespatial)
data(mite)
data(mite.env)
data(mite.pcnm)
data(mite.xy)
mite.h = decostand(mite, "hellinger")
mite.xy.c = scale(mite.xy, center=TRUE, scale=FALSE)
mite.h.det = resid(lm(as.matrix(mite.h) ~ ., data=mite.xy))
#mite.h.D1 = dist(mite.h.det)
mite.h.D1 = vegdist(mite.h.det,method = "euclidean")
mite.correlog = mantel.correlog(mite.h.D1, XY = mite.xy, nperm=99)
summary(mite.correlog)
plot(mite.correlog)
#number of classes
mite.correlog$n.class
#break points, for distance classes
mite.correlog$break.pts
#number of tested needed for fdr adjustements
mite.correlog$n.tests

mite.pcnm = vegan::pcnm(dist(mite.xy))
mite.pcnm.pos = mite.pcnm$vectors #vectors are automatically with plus eigenvalues
#get the pcnm variables with significant positive spatial correlation
somePCNM = 1:ncol(mite.pcnm.pos)
observed = c()
expected = c()
pvalue = c()
for (i in 1:length(somePCNM)){
  #s.value(dist(mite.xy), mite.pcnm.pos[, somePCNM[i]], method = "greylevel", csize=0.35, sub=somePCNM[i], csub=2)
  print(paste0("factor ",  i))
  I = Moran.I(mite.pcnm.pos[, somePCNM[i]], as.matrix(1/dist(mite.xy)))
  observed = c(observed, I$observed)
  expected = c(expected, I$expected)
  pvalue = c(pvalue, I$p.value)
}
sum(observed - expected > 0)
sum(observed - expected > 0 & p.adjust(pvalue, method = "fdr")<0.05)
mite.pcnm.pos.select = mite.pcnm.pos[, ((observed - expected) > 0 & p.adjust(pvalue, method = "fdr")< 0.05)]


#variation partitioning using XY cooordinates, PCNM variables, and environmental variables

#1 forward selection of coordinates
mite.XY.rda = rda(mite.h, mite.xy)
anova.cca(mite.XY.rda) #model is significant, so forward selection is carried out
mite.XY.R2a = RsquareAdj(mite.XY.rda)$adj.r.squared #any new model can't be better than full model
mite.XY.fwd = forward.sel(mite.h, as.matrix(mite.xy), adjR2thresh = mite.XY.R2a) #there are only two variables x,y, and the function selects y!
XY.sign = sort(mite.XY.fwd$order) #there is only one vairable here
XY.red = mite.xy[, c(XY.sign)] #retained XY data


#2 forward selection of environmental variables
substrate = model.matrix(~mite.env[,3])[,-1] #substrate recoding
shrubs = model.matrix(~mite.env[,4])[,-1] #shrubs recoding
topc = model.matrix(~mite.env[,5])[,-1] #topc
mite.env2 = cbind.data.frame(mite.env[, 1:2], substrate, shrubs, topc)

#forward selection of the enviromental variables
mite.env.rda = rda(mite.h, mite.env2) #using the full model, to get the maximum Rsqare
mite.env.R2a = RsquareAdj(mite.env.rda)$adj.r.squared #then get the adjusted Rsquare
mite.env.fwd = forward.sel(mite.h, mite.env2, adjR2thresh = mite.env.R2a, nperm = 9999) #forward selection on enviromental parameters, using adjusted Rsquare as the max rsqare possible (can't be better than original model)
env.sign = sort(mite.env.fwd$order)
env.red = mite.env2[, c(env.sign)] #retained env data
colnames(env.red) #retained variable after forward seleciton

#3 pncm variables
mite.undet.pcnm.rda = rda(mite.h, mite.pcnm.pos.select)
anova.cca(mite.undet.pcnm.rda)
mite.undet.pcnm.R2a = RsquareAdj(mite.undet.pcnm.rda)$adj.r.squared
mite.undet.pcnm.fwd = forward.sel(mite.h, mite.pcnm.pos.select, adjR2thresh = mite.undet.pcnm.R2a)
nrow(mite.undet.pcnm.fwd) #number of selected pcnm variables
pcnm.sign = sort(mite.undet.pcnm.fwd$order)
pcnm.red = mite.pcnm.pos[, c(pcnm.sign)] #retained pcnm data

#visualizing retained pcnm variables to see which are broad, which are fine
for (i in 1:ncol(pcnm.red)){
  s.value(mite.xy, pcnm.red[,i], sub=pcnm.sign[i], csub=2)
}

#4 assigning pcnm scale based on visualization
pcnm.broad = pcnm.red[, 1:5]
pcnm.fine = pcnm.red[, 6:10]

#5 mite-enviroment - XY coordinates - PCNM vairation partitioning
mite.varpart = varpart(mite.h, env.red, XY.red, pcnm.broad, pcnm.fine) # four is the maximum group
par(mfrow=c(1,2))
showvarparts(4)
plot(mite.varpart, digits=2)

#tests of the unique fractions [a], [b], [c], [d]
#**********************
#the cbind() function is to create the data.frame for conditioned variables (partialled out)
#fraction [a] pure environmental
anova.cca(rda(mite.h, env.red, cbind(XY.red, pcnm.broad, pcnm.fine)))
#fraction [b] pure tred
anova.cca(rda(mite.h, XY.red, cbind(env.red, pcnm.broad, pcnm.fine)))
#fraction [c] pure broad scale spatial
anova.cca(rda(mite.h, pcnm.broad, cbind(env.red, XY.red, pcnm.fine)))
#fraction [d] pure fine scale spatial
anova.cca(rda(mite.h, pcnm.fine, cbind(env.red, XY.red, pcnm.broad)))

#although tutorial mentions only a and c are significant, my results show they are all significant, the difference may be how I selected the positived correlated and significant pcnm variables (didn't use PCNM package)
#whether or not each group is significant can also be seen investigating the varpart result directly. (mite.varpart)

```
##Spatial analysis MEM
```{r spatial analysis MEM}
library(spacemakeR)
library(spdep)
library(adespatial)
library(ape)
data(mite)
data(mite.env)
data(mite.pcnm)
data(mite.xy)
mite.del = tri2nb(mite.xy)
#select the best model based on AICc value
mite.del.res = test.W(mite.h.det, mite.del)
#  AICc NbVar
#1 -94.18844     7
#summary of the best model
summary(mite.del.res$best)
R2.del = mite.del.res$best$R2[which.min(mite.del.res$best$AICc)] #best model is with the lowest AICc
#adjusted R^2 for the best model (n=70, and m =7)
RsquareAdj(R2.del, 70, 7)
#0.1989829

#second class of models
#weighting function
f2 = function(D, dmax, y)(1 - (D/dmax)^y)  #more complicated distance function
max.d = max(unlist(nbdists(mite.del, as.matrix(mite.xy))))
#power is set from 2 to 10
mite.del.f2 = test.W(mite.h.det, mite.del, f=f2, y=2:10, dmax=max.d, xy=as.matrix(mite.xy))
#AICc for the null model: -87.47112034786 
#Best spatial model:
#  y dmax      AICc NbVar
#1 2    3 -95.44519     6
#unadjusted R^2 for the best model in the second class
R2.delW = mite.del.f2$best$R2[which.min(mite.del.f2$best$AICc)]
RsquareAdj(R2.delW, 70, 6)
#0.1969499


#Third class of models. connectivity matrix based on a distance (raidus around points)
mite.vario = variogmultiv(mite.h.det, mite.xy, nclass=20)
plot(mite.vario$d, mite.vario$var, ty='b', pch=20, xlab="Distance", ylab="C(distance)")
#construction of 10 neighborhood matrices, using first peak of mite.vario plot
thresh10 = seq(give.thresh(dist(mite.xy)), 4, le = 10) #give.thresh(dist(mite.xy)) gives the minimum threshold distance needed for all sites to be connected, anything aboves this ensures it happens. le=10 means seq() outputs 10 values from the range

#create 10 model matrices
list10nb = lapply(thresh10, dnearneigh, x=as.matrix(mite.xy), d1=0)

#display an excerpt of the first neighbourhood matrix
print(listw2mat(nb2listw(list10nb[[1]]))[1:10, 1:10], digits =1)

#testing on all 10 model matrices
mite.thresh.res = lapply(list10nb, test.W, Y=mite.h.det)
mite.thresh.minAIC = sapply(mite.thresh.res, function(x){ min(x$best$AICc, na.rm=TRUE)})
min(mite.thresh.minAIC) #lowest AICc
thresh10[which.min(mite.thresh.minAIC)] #new distance threshold to use
#model selected is with 5 MEM variables, with a AICc of -100.6

#Fourth class, same as third clas, but change distance function to f2 (see above for f2 definition), this produced the best model
mite.thresh.f2 = lapply(list10nb, function(x) {test.W(x, Y=mite.h.det, f=f2, y=2:10, dmax=max(unlist(nbdists(x, as.matrix(mite.xy)))), xy=as.matrix(mite.xy))})
mite.f2.minAIC = sapply(mite.thresh.f2, function(x) min(x$best$AICc, na.rm=TRUE))
min(mite.f2.minAIC) # best model
nb.bestmod = which.min(mite.f2.minAIC) #6 MEM variables
dmax.best = mite.thresh.f2[nb.bestmod][[1]]$all[1,2] #2.668333 is the distance threshold chosen


#extract the best model
mite.MEM.champ = unlist(mite.thresh.f2[which.min(mite.f2.minAIC)], recursive = FALSE)
summary(mite.MEM.champ)

mite.MEM.champ$best$values #eigenvalues
mite.MEM.champ$best$ord #order of MEM variables by added R2

MEMid = mite.MEM.champ$best$ord[1:which.min(mite.MEM.champ$best$AICc)]
sort(MEMid)
MEM.all = mite.MEM.champ$best$vectors
MEM.select = mite.MEM.champ$best$vectors[, sort(c(MEMid))]
#unadjusted R2 of the best model
R2.MEMbest = mite.MEM.champ$best$R2[which.min(mite.MEM.champ$best$AICc)]

#adkisted R2 pf best ,pde;
RsquareAdj(R2.MEMbest, nrow(mite.h.det), length(MEMid))
#0.2906696, much higher than before
#Maps of the 7 significant MEM variables
for(i in 1:ncol(MEM.select)){
  s.value(mite.xy, MEM.select[,i], sub=sort(MEMid)[i], csub=2)}
#RDA of the mite data constratined by the 7 MEM vraibles

mite.MEM.rda = rda(mite.h.det~., as.data.frame(MEM.select))
mite.MEM.R2a = RsquareAdj(mite.MEM.rda)$adj.r.squared
anova.cca(mite.MEM.rda)
axes.MEM.test = anova.cca(mite.MEM.rda, by="axis") #axis is MEM variable here
#number of significant axes
nb.ax = length(which(maxes.MEM.test[,5] <= 0.05))

#plot maps of the two significant canonical axes
mite.MEM.axes = scores(mite.MEM.rda, choices = c(1,2), display="lc", scaling=1)
par(mfrow=c(1,2))
s.value(mite.xy, mite.MEM.axes[,1])
s.value(mite.xy, mite.MEM.axes[,2])
```

#multicomparison using KW or Friedman tests and the following pair-wise ad-hoc tests
```{r multicomparison}
library(PMCMRplus)
data(InsectSprays)
kruskal.test(count ~ spray, data=InsectSprays)
#posthoc.kruskal.nemenyi.test(count ~ spray, dist="Tukey", data = InsectSprays, p.adjust.method = "fdr")
posthoc.kruskal.dunn.test(count ~ spray, dist="Tukey", data = InsectSprays, p.adjust.method = "fdr") #this one is standard adhoc after KW test
posthoc.kruskal.conover.test(count ~ spray, dist="Tukey", data = InsectSprays, p.adjust.method = "fdr") #this one is the high power method adhoc test after KW test
```

# MISC stuff
```{r misc}
temp = read.table("~/Documents/Bioinfo/places/median_of_all_Mike_data_sickdays_weather_events.txt", stringsAsFactors = FALSE)
write_csv(temp, "~/Documents/Bioinfo/places/Mikemetadata.csv")
#color checking functions

#default colors, for barplots colors
k = ggplot(data.frame(group=c(1:6), value=c(1:6)), aes(x=group, y=value, color=as.factor(group))) + geom_point(size=3)
k
ggplot_build(k)$data
ggplot_build(k)$data[[1]]$colour

#npg colors
k = ggplot(data.frame(group=c(1:10), value=c(1:10)), aes(x=group, y=value, color=as.factor(group))) + geom_point(size=3) + scale_color_npg()
k
ggplot_build(k)$data
ggplot_build(k)$data[[1]]$colour

#nejm colors
k = ggplot(data.frame(group=1:8, value=1:8), aes(x=group, y=value, color=as.factor(group))) + geom_point(size=3) + scale_color_nejm()
print(k)
ggplot_build(k)$data
ggplot_build(k)$data[[1]]$colour

#lancet colors
k = ggplot(data.frame(group=1:8, value=1:8), aes(x=group, y=value, color=as.factor(group))) + geom_point(size=3) + scale_color_lancet()
print(k)
ggplot_build(k)$data
ggplot_build(k)$data[[1]]$colour


master_master.numeric = Filter(is.numeric, master_master)
master_master.numeric = as.matrix(master_master.numeric)

#total correlation
res2 = rcorr(master_master.numeric)
res2$r[is.na(res2$r)] = 0 #this get rids of the no correlation problems !!!
corrplot::corrplot(res2$r, order = "hclust", p.mat = res2$P, sig.level = 0.05, insig = "blank", tl.cex=0.6, type="lower")
corrplot::corrplot(res2$r, p.mat = res2$P, sig.level = 0.05, insig = "blank", tl.cex=0.6, type="lower")


#mean -variance relationship for all taxons
mv.df = data.frame(mean=apply(master.asinh.cpm.single, 1, mean), variance=apply(master.asinh.cpm.single, 1, var))
ggscatter(mv.df, x = "mean", y = "variance",
   color = "black", shape = 21, size = 4, fill="light grey", # Points color, shape and size
   add = "reg.line",  # Add regressin line
   add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
   conf.int = TRUE, # Add confidence interval
   cor.coef = TRUE, # Add correlation coefficient
   title = paste0(mole.type, "-", taxonlevel, " mean-variance relationship")
   )


  output.df = data.frame(ownership = environ.no3$ownership, date.start = environ.no3$date.start, date.end = environ.no3$date.end, location = environ.no3$location)
write.csv(output.df, "~/Documents/Bioinfo/p3.csv", quote = FALSE, row.names =FALSE)

co.DNA.species = read.csv("~/Documents/Bioinfo/localanalysis/DNA_mega.SPECIES.csv", header = TRUE, stringsAsFactors = FALSE)
#assuming DNA species data is loaded, this checks how many DNA species are also identified in co-assemble approach. 100% !! ALL OF THEM CAN BE FOUND
sum(row.names(master.asinh.cpm) %in% co.DNA.species$taxon)


#find more than distinct 20 colors
n <- 25
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
pie(rep(1,n), col=sample(col_vector, n))

#find reads number for DNA and RNA
dnareads = read.table("~/Documents/Expo_paper/DNAreads.txt", header=FALSE)
rnareads = read.table("~/Documents/Expo_paper/RNAreads.txt", header=FALSE)
median(dnareads$V1[dnareads$V1 >= 20000000])#some testing samples included
median(rnareads$V1[rnareads$V1 >= 20000000])
sum(as.numeric(dnareads[dnareads>=20000000,]))
sum(as.numeric(rnareads[rnareads>=20000000,]))
sum(as.numeric(dnareads[dnareads>=20000000,])) * 150
sum(as.numeric(rnareads[rnareads>=20000000,])) * 150


reads.df = data.frame(count = as.numeric(c(dnareads$V1[dnareads$V1>=20000000], rnareads$V1[rnareads$V1>=20000000])), type = c(rep("DNA", length(dnareads$V1[dnareads$V1>=20000000])), rep("RNA", length(rnareads$V1[rnareads$V1>=20000000])))) 

ggdensity(reads.df, x = "count",
   add = "median", 
   #add.params=list(color="red"),
   rug = TRUE,
   color = "type", 
   fill = "type",
   palette = c("#00AFBB", "#E7B800", "#3C5488FF"),
   ) + labs(x="Reads counts per sample") + theme_pubr(base_size = 16) 


#RNA amplification result
RNAamp = read.csv("~/Documents/Expo_paper/RNAamp_result.csv", header = TRUE, stringsAsFactors = FALSE)
colnames(RNAamp) = c("Filter", "Concentration", "Type")
RNAamp$Type = factor(RNAamp$Type, levels = c("Control", "Sample"))

plott = ggplot(RNAamp, aes(x=Type, y=log(Concentration), fill = Type, group = Type)) + geom_boxplot() + geom_beeswarm(color = "black", size = 0.5, alpha = 0.7) + theme_pubr(base_size = 14) + stat_compare_means()

environ$batchinfo = paste0("B",environ$batch)
for (item in unique(environ$batchinfo)){
  print(item)
  temp.mean = environ$cDNA_conc[environ$batchinfo == item & environ$Filter_No != item]
  print(mean(temp.mean, na.rm=TRUE))
  temp.blank = environ$cDNA_conc[environ$Filter_No == item]
  print(temp.blank)
  print(mean(temp.mean, na.rm=TRUE)/temp.blank)
}

#quickling getting specific samples
which(scores_single_complete$aownership == "P2")
#and how many
length(which(scores_single_complete$aownership == "P2"))

```