--- title: "Nucleotide_analysis" author: "Chao Jiang" output: html_document: default pdf_document: default --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, cache = TRUE, autodep = TRUE) knitr::opts_knit$set(root.dir = "~/Documents/Bioinfo/DNAformal") #asinh ``` extract taxon information for each sample and generate a table standard format for all analyese except for heatmap is to list rows as samples, and columns as taxons Load packages. #Packages loading============= ```{r warning=FALSE, message=FALSE} source("http://bioconductor.org/biocLite.R") library(ggplot2) library(reshape2) library(edgeR) require(NMF) require(RColorBrewer) library(plot3D) library(scales) library(Hmisc) library(ggrepel) library(vegan) library(plotly) library(cluster) library(factoextra) library(NbClust) library(plyr) library(dplyr) library(psych) library(glmnet) library(devtools) library(ggpubr) library(tidyverse) library(ade4) library(caret) library(e1071) library(pROC) library(gridExtra) library(ggsci) library(ggbeeswarm) library(ggpmisc) library(ggmap) library(colorspace) library(corrplot) ``` Input data, should be noted that the $taxonlevel$ variable here serves as the master control of entire analysis. etc. the level of analysis. ##dataloading================= ```{r dataloading} options(stringsAsFactors = FALSE) #==================================================== #this is the master control for individual analysis runs at different taxonomy level mole.type = "DNA" taxonlevel = "GENUS" setwd(paste0("~/Documents/Bioinfo/", mole.type, "formal")) file.dir = paste0(getwd(),"/tax/") #last slash is crucial #==================================================== all.found.taxons = c() sample.ids = c() #getting rownames and colnames from the datasets #taxconomy parsing function taxonomy_parse = function(taxonlevel) { if (taxonlevel == "SUPERKINGDOM"){# csv.files = dir(path = file.dir, pattern = "*.SUPERKINGDOM.csv") for(f in csv.files){ table = read.csv(paste0(file.dir, f),stringsAsFactors=F,fill=TRUE) all.found.taxons = unique(c(all.found.taxons,table$taxon)) sample.ids = unique(c(sample.ids,strsplit(f, "[.]")[[1]][1])) } csv.files = dir(path = file.dir, pattern = "*.KINGDOM.csv") for(f in csv.files){ table = read.csv(paste0(file.dir, f),stringsAsFactors=F, fill=TRUE) all.found.taxons = unique(c(all.found.taxons,table$taxon)) sample.ids = unique(c(sample.ids,strsplit(f, "[.]")[[1]][1])) } #======================initiate matrix master.m = matrix(0,nrow=length(all.found.taxons),ncol=length(sample.ids)) frag.m = matrix(0, nrow=length(all.found.taxons), ncol=length(sample.ids)) rownames(master.m) = all.found.taxons colnames(master.m) = sample.ids rownames(frag.m) = all.found.taxons colnames(frag.m) = sample.ids csv.files = dir(path = file.dir, pattern = "*.SUPERKINGDOM.csv") for(f in csv.files){ in.df = read.csv(paste0(file.dir, f),stringsAsFactors=F) sampleID = strsplit(f, "[.]")[[1]][1] master.m[in.df$taxon,sampleID] = in.df$base_num frag.m[in.df$taxon, sampleID] = in.df$frag_num } csv.files = dir(path = file.dir, pattern = "*.KINGDOM.csv") for(f in csv.files){ in.df = read.csv(paste0(file.dir, f),stringsAsFactors=F) sampleID = strsplit(f, "[.]")[[1]][1] master.m[in.df$taxon,sampleID] = in.df$base_num frag.m[in.df$taxon, sampleID] = in.df$frag_num } }# if (taxonlevel != "SUPERKINGDOM"){# csv.files = dir(path = file.dir, pattern = paste0("*.",taxonlevel,".csv")) for(f in csv.files){ table = read.csv(paste0(file.dir, f),stringsAsFactors=F) all.found.taxons = unique(c(all.found.taxons,table$taxon)) sample.ids = unique(c(sample.ids,strsplit(f, "[.]")[[1]][1])) } #===========initiate matrix master.m = matrix(0,nrow=length(all.found.taxons),ncol=length(sample.ids)) frag.m = matrix(0, nrow=length(all.found.taxons), ncol=length(sample.ids)) rownames(master.m) = all.found.taxons colnames(master.m) = sample.ids rownames(frag.m) = all.found.taxons colnames(frag.m) = sample.ids #Getting actual base_num from the datasets #Superkingdom first, kingdom second, because kingdom is a subset of superkingdom csv.files = dir(path = file.dir, pattern = paste0("*.",taxonlevel,".csv")) for(f in csv.files){ in.df = read.csv(paste0(file.dir, f),stringsAsFactors=F) sampleID = strsplit(f, "[.]")[[1]][1] master.m[in.df$taxon,sampleID] = in.df$base_num frag.m[in.df$taxon, sampleID] = in.df$frag_num } }# return(dfs = list("master.m" =master.m, "frag.m" = frag.m)) } dfs = taxonomy_parse(taxonlevel) master.m = dfs[["master.m"]] frag.m = dfs[["frag.m"]] #sort(colSums(master.m)/median(colSums(master.m))) #detecting samples with really low reads number ``` ##normalization================ ```{r normalization} #clean super rare tags, for kingdom/super kingdom only. master.m = dfs[["master.m"]] frag.m = dfs[["frag.m"]] master.ori = master.m if (FALSE){ master.m = master.ori #reset master.m } if (taxonlevel == "SUPERKINGDOM"){ master.m = master.m[c('unclassified', 'Bacteria', 'Viruses', 'Archaea', 'Fungi', 'Viridiplantae', 'Metazoa'),] } master.m =master.m/150 #average read length, divided by total number of bases, this is only symbolic master.m =master.m[order(rowSums(master.m), decreasing=TRUE),] #Reorder master.m for better data parsing frag.m = frag.m[order(rowSums(frag.m), decreasing=TRUE),] #sum(frag.m["Annelida", ])/sum(frag.m["Annelida",] >= 1)# testing average contigs number for samples with >= 1 contig #filters that demands species has to have at least one sample with more than 3 fragments, total > 100 reads, for at least one sample with more than 10 reads #these filters are really only applicable for genus and species level data if (taxonlevel != "VIRUSES"){ master.m = master.m[rowSums(master.m)>=100, ] master.m = master.m[rowSums(frag.m[rownames(master.m),] >= 3) >= 1, ] } head(rowSums(master.m), 50) if (taxonlevel == "VIRUSES" & mole.type == "DNA"){ master.m = master.m[rowSums(master.m)>=100, ] DNA.virus.names = row.names(master.m) } else if (taxonlevel =="VIRUSES" & mole.type == "RNA"){ master.m = master.m[rowSums(master.m)>=100, ] RNA.virus.names = row.names(master.m) } #virus.names = unique(c(DNA.virus.names, RNA.virus.names)) #master.m = master.m[rowSums(master.m >= 10) >= 1, ] #master.m.dgelist$counts is master.m being inputted master.m.dgelist = DGEList(master.m) #head(master.m.dgelist$samples) #head(master.m.dgelist$counts) #The difference between master.tmm and master.m.dgelist is that master.tmm has actual normalization factors master.tmm = calcNormFactors(master.m.dgelist, method = "TMM") #At this point = master.tmm$samples$lib.size == colSums(master.m) #At this point = master.tmm$counts = master.m ##I am not using normalized.lib.sizes here because TMM does not work well for our datasets , assumptions do not apply #master.cpm = cpm(master.tmm, normalized.lib.sizes = T, log=FALSE, prior.count=1) #I am using prior.count=1 because this gets rid of the negative numbers after normalization, which is a pain for barplots master.cpm = cpm(master.tmm, normalized.lib.sizes= FALSE, log=FALSE, prior.count=1) master.cpm =master.cpm[order(rowSums(master.cpm), decreasing=TRUE),] #master.cpm.log = cpm(master.tmm, normalized.lib.sizes = T, log=TRUE, prior.count=1) master.cpm.log = cpm(master.tmm, normalized.lib.sizes = FALSE, log=TRUE, prior.count=1) master.cpm.ori = master.cpm master.cpm.log.ori = master.cpm.log #asinh transformation is advantageious to log transformation and easier than chi-square transformation. master.asinh.cpm = asinh(master.cpm) master.chisquare.cpm = decostand(t(master.cpm), "chi.square") #at this point, the dataframe includes all controls and werid samples master.chisquare.cpm = master.chisquare.cpm[,-which(rownames(master.cpm)=='unclassified')] #further normalization for tSNE #remove unclassified AFTER normalization, this step is definitely subjected to debate, however here unclassified cant be treated as a taxon rank master.cpm = master.cpm[-which(rownames(master.cpm)=='unclassified'),] #at this point, the dataframe includes all controls and werid samples master.cpm.log = master.cpm.log[-which(rownames(master.cpm.log)=='unclassified'),] #master.cpm.log[master.cpm.log <=0] = 0 #this is to remove negative log number (artifacts of taking log) #species abundance in each sample speciesabundances = apply(master.asinh.cpm, 2, function(x){sum(x>0)}) #hist(speciesabundances) min(speciesabundances) max(speciesabundances) speciesabundances[which.min(speciesabundances)] speciesabundances[which.max(speciesabundances)] if(mole.type == "DNA"){ speciesabun.DNA = speciesabundances taxonabundance.DNA = rowSums(master.cpm) zerofraction.DNA = apply(master.cpm, 1, function(x) sum(x == 0))/ncol(master.cpm) mean.DNA = apply(master.cpm, 1, mean) var.DNA = apply(master.cpm, 1, var) } if (mole.type == "RNA"){ speciesabun.RNA = speciesabundances taxonabundance.RNA = rowSums(master.cpm) zerofraction.RNA = apply(master.cpm, 1, function(x) sum(x == 0))/ncol(master.cpm) mean.RNA = apply(master.cpm, 1, mean) var.RNA = apply(master.cpm, 1, var) } #how many 0s for each taxon hist(master.m[3,]) zerofraction = apply(master.m, 1, function(x) sum(x == 0))/ncol(master.m) hist(zerofraction, xlab="proportion of 0s per taxon", main="zero-inflated taxon abundance data") ggplot(data.frame(zerofraction=zerofraction), aes(x = zerofraction)) + geom_histogram() + xlab("proportions of 0s per taxon") + ggtitle(paste0(mole.type, "-", taxonlevel)) + theme_pubr() if(FALSE){ combined.species.df = data.frame(Richness = c(speciesabun.DNA, speciesabun.RNA), Type = c(rep("DNA", length(speciesabun.DNA)), rep("RNA", length(speciesabun.RNA)))) ggdensity(combined.species.df, x = "Richness", #add = "median", #add.params=list(color="red"), rug = TRUE, color = "Type", fill = "Type", palette = c("#00AFBB", "#E7B800"), ) + labs(x=paste0(tolower(taxonlevel), " richness")) + theme_pubr(base_size = 16) combined.abundance.df = data.frame(Abundance = c(log(taxonabundance.DNA), log(taxonabundance.RNA)), Type = c(rep("DNA", length(taxonabundance.DNA)), rep("RNA", length(taxonabundance.RNA)))) ggdensity(combined.abundance.df, x = "Abundance", add = "median", #add.params=list(color="red"), rug = TRUE, color = "Type", fill = "Type", palette = c("#00AFBB", "#E7B800"), ) + labs(x=paste0("log(Total ", tolower(taxonlevel), " abundance in CPM)")) + theme_pubr(base_size = 16) combined.zerofraction.df = data.frame(Zerofraction = c(zerofraction.DNA, zerofraction.RNA), Type = c(rep("DNA", length(zerofraction.DNA)), rep("RNA", length(zerofraction.RNA)))) ggdensity(combined.zerofraction.df, x = "Zerofraction", add = "median", #add.params=list(color="red"), rug = TRUE, color = "Type", fill = "Type", palette = c("#00AFBB", "#E7B800") ) + labs(x=paste0("Zero fractions of ", tolower(taxonlevel), " in all samples")) + theme_pubr(base_size = 16) #RNA_mean = mean.RNA, RNA_var = var.RNA, #mean variance relationship ggplot(data.frame( DNA_mean = mean.DNA, DNA_var = var.DNA), aes(x = log(DNA_mean), y = log(DNA_var))) + geom_point(shape = 21, color = "#00AFBB", fill = "#00AFBB", alpha = 0.3) + geom_point(data = data.frame(RNA_mean = mean.RNA, RNA_var = var.RNA), aes(x = log(RNA_mean), y=log(RNA_var)), shape = 21, color = "#E7B800", fill = "#E7B800", alpha = 0.3) + stat_smooth(method = "lm", formula = y ~ x, color = "#00AFBB") + stat_smooth(data = data.frame(RNA_mean = mean.RNA, RNA_var = var.RNA), aes(x = log(RNA_mean), y=log(RNA_var)), method = "lm", formula = y~x, color = "#E7B800") + labs( x = "log(mean of genus abundance (CPM) across samples)", y = "log(variance of genus abundance (CPM) acorss samples)") + theme_pubr(base_size = 11) } ggplot(data.frame(speciesabundances = speciesabundances), aes(x=speciesabundances)) + geom_histogram() + xlab("species richness") + theme_pubr() ggdensity(data = data.frame(abundance = log(rowSums(master.cpm), base=10)), x = "abundance", add = "median", color = "#00AFBB", fill = "#00AFBB", rug = TRUE, ) + labs(x=paste0("Total ", tolower(taxonlevel), " abundance in log(CPM)")) + theme_pubr(base_size = 16) ``` ##METAdata parsing========================================= THERE IS NO NEED TO RERUN THIS PART FOR EVERY ANALYSIS UNLESS ITS SWITCHING BETWEEN DNA and RNA THIS PART IS VERY COMPLICATED DUE TO HISTORICAL UPDATES ```{r metadata parsing} # Opens multiple CSV files for metadata extraction # this uses local files from the laptop sampletofilter = read.csv("/Users/jiangch/Documents/Bioinfo/places/lib-filter.csv", stringsAsFactors=F, fill = TRUE) #the index of colnames(master.m) in sampletofilter$sample.old #this is basically the names of samples translated into filter number if (mole.type == "DNA"){ filter_list = sampletofilter$filter[match(colnames(master.m),sampletofilter$sample.old)] batch = sampletofilter$batch[match(colnames(master.m), sampletofilter$sample.old)] formalname = sampletofilter$sample[match(colnames(master.m), sampletofilter$sample.old)] oldname = sampletofilter$sample.old[match(colnames(master.m), sampletofilter$sample.old)] } if (mole.type == "RNA"){ filter_list = sampletofilter$filter[match(colnames(master.m),sampletofilter$sample.xin.old)] batch = sampletofilter$batch[match(colnames(master.m), sampletofilter$sample.xin.old)] formalname = sampletofilter$sample.xin[match(colnames(master.m), sampletofilter$sample.xin.old)] oldname = sampletofilter$sample.xin.old[match(colnames(master.m), sampletofilter$sample.xin.old)] } #this is get everything filterinfo = read.csv("/Users/jiangch/Documents/Bioinfo/places/filter-informationnew_xin.csv", stringsAsFactors=F) devicemeta = read.csv("/Users/jiangch/Documents/Bioinfo/places/device_metadata.new.csv", stringsAsFactors=F) #importing RNA amplification results RNAamp = read.csv("~/Documents/Bioinfo/places/RNAamp_result.csv", header = TRUE) #importing from Lora's datasets weather = readRDS("/Users/jiangch/Documents/Bioinfo/places/weather_events.rds") #newest environ environ.recent = readRDS("~/Documents/Bioinfo/places/newest_environ.rds") #checking dates extracted from device compared to filter-information.csv #looks like filter-information is more reliable weather_matching = weather$X_Events[match(filter_list, weather$Filter_No)] sick_matching = sickfilter$remarks[match(filter_list, sickfilter$Filter_No)] #relevant filters in this dataset filterinfo.match = join(data.frame(Filter.No.=filter_list), filterinfo, by = "Filter.No.") #all fake entries, caused by blank filters #filterinfo.match[rowSums(is.na(filterinfo.match)) == 9, ] old.location = filterinfo.match$comments location = filterinfo.match$location geo = filterinfo.match$geo geo2 = filterinfo.match$geo2 #the owner of each filter ownership = filterinfo.match$Patient.ID #date related parsing date.start = filterinfo.match$filter.start date.start = as.Date(date.start, "%m/%d/%y") date.end = filterinfo.match$filter.collect date.end = as.Date(date.end, "%m/%d/%y") #Duration of each filter, this information is not as accurate as device measured duration. #duration = date.end - date.start #collected duration, this is probably the most accurate estimates devicemeta.match = join(data.frame(filter=filter_list), devicemeta) duration = devicemeta.match$duration #this information is used to date individual sample as of now. Unless better ways are found date.mid = date.start + (date.end - date.start)/2 #getting particle information from the device #There are a few filters with average particle concentration less than 0, a simple abs() is used here. potentially need more refining particle = abs(devicemeta.match$average_particle) median.particle = devicemeta.match$median_particle sd.particle = devicemeta.match$sd_particle total.particle = particle * duration #getting temperature information for each filter from the device metadata temperature = devicemeta.match$average_temp median.temperature = devicemeta.match$median_temp sd.temperature = devicemeta.match$std_temp #getting information about humidity for each filter from the device metadata humid = devicemeta.match$average_humid median.humid = devicemeta.match$median_humid sd.humid = devicemeta.match$std_humid date.month = format(date.mid, "%m") date.month = as.numeric(date.month) date.day = as.numeric(format(date.mid, "%d")) library(plyr) #Mapping to weekdays and weekends weekdays1 = c('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday') #assigning weekend index the difficult and precise way # weekend = c() for (i in seq(1:length(filter_list))){ if (weekdays(date.start[i]) %in% c('Thursday', 'Friday') & weekdays(date.end[i]) %in% c('Monday', "Tuesday") & (date.end[i] - date.start[i])<=5 ) { weekend = c(weekend, "weekend") print("ok") print(c(date.start[i], weekdays(date.start[i]))) print(c(date.end[i], weekdays(date.end[i]))) print(date.end[i] - date.start[i]) } else if (is.na(date.start[i])) { weekend = c(weekend, NA) } else { weekend = c(weekend, "weekday") #print("nah") } } #this is the conventional definition of seasons in northern hemisphere season = mapvalues(as.numeric(date.month),from=as.numeric(c(seq(3,12),1,2)),to=c(rep("spring",3),rep("summer",3),rep("fall",3),rep("winter",3))) #dataframe from device meta_data = data.frame(batch,location,geo, geo2, ownership, duration, date.month, date.start, date.end, season, date.mid, particle, sd.particle, median.particle, total.particle, temperature, median.temperature, sd.temperature, humid, sd.humid, median.humid, Filter_No=filter_list, weekend) environ = join(meta_data, comc, by="Filter_No") environ = join(environ, RNAamp[,1:3], by="Filter_No") environ = environ[!duplicated(environ),] #same filter information for replicates of the same filter environ = environ[order(environ$date.mid, environ$date.start),] environ.mike = subset(environ, ownership=="Mike") #detailed information of sickfilters sickfilter = data.frame(sickfilter, date.mid = environ$date.mid[match(sickfilter$Filter_No, environ$Filter_No)], date.start = environ$STARTING_DATE[match(sickfilter$Filter_No, environ$Filter_No)]) #updating environ environ.input = readRDS("~/Documents/Bioinfo/places/new_environ.rds") environ.input = environ.input[order(as.Date(environ.input$date.mid, "%m/%d/%Y"), as.Date(environ.input$date.start, "%m/%d/%Y")),] #unify ordering environ[ ,c(27:44, 49, 50)] =environ.input[ ,c(27:44, 49, 50)] environ$X_Mean_Humidity = as.numeric(environ$X_Mean_Humidity) #hiking days environ$hiking = environ$Filter_No hikingfilters = c("2","3","5","9","13","43","58","123","124","131","146","192","201","212","326","327","332","341") environ$hiking = mapvalues(environ$hiking,from=hikingfilters,to=rep("hiking", length(hikingfilters))) environ$hiking = mapvalues(environ$hiking, from=environ$hiking[-grep("hiking|B", environ$hiking)], to=rep("non-hiking", length(environ$hiking[-grep("hiking|B", environ$hiking)]))) #coastal-inland input2 = read.csv("~/Documents/Bioinfo/places/environ-new-xin.csv", header = TRUE, stringsAsFactors = FALSE) input2 = input2[order(as.Date(input2$date.mid, "%m/%d/%Y"), as.Date(input2$date.start, "%m/%d/%Y")),]#unify ordering environ$geo3 = input2$geo3 #annonymous ownership environ$aownership = environ$ownership environ$aownership = mapvalues(environ$aownership, from = c(unique(environ$ownership)), to = c("P1", "P2", "Others", "Others", "Others", "P8", "Others", "P3", "P7", "Others", "Others", "Others", "Others","P5", "P4", "P6", "Others", NA)) #recent updates population = read.csv("~/Documents/Bioinfo/places/combined_data_New20171010.csv", stringsAsFactors = FALSE) #population$population2 = as.numeric(population$population2) #population$Population_density_people_per_sqmi = as.numeric(population$Population_density_people_per_sqmi) air = read.csv("~/Documents/Bioinfo/places/comb.csv", stringsAsFactors = FALSE) environ = join(environ, population, by="Filter_No") environ = join(environ, air, by="Filter_No") #fixes environ$is_there_rain = as.numeric(environ$is_there_rain) if (FALSE){ #total correlation environ.numeric = Filter(is.numeric, environ) environ.numeric = as.matrix(environ.numeric) res2 = rcorr(environ.numeric, type = "pearson") #takes output from rcorr function, adjusting p value matrix accroding to fdr principles matrixPadjust = function(res, select = c(1:nrow(res$P))){ print(select) temp.p = c(res$P) temp.p[is.na(temp.p)] = 0 temp.p.adj = p.adjust(temp.p, method = "fdr") p.adj.matrix = matrix(temp.p.adj, nrow = ncol(res$P), ncol=ncol(res$P), dimnames = list(rownames = colnames(res$P), colnames = colnames(res$P))) return(p.adj.matrix) } adjusted.res2.p = matrixPadjust(res2) #pls package also has a function called corrplot corrplot::corrplot(res2$r, order = "hclust", p.mat = adjusted.res2.p, sig.level = 0.01, insig = "blank", diag = FALSE, tl.cex=0.5, addrect = 17, method="color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black") } ``` ##PCA for all=================== ```{r pca analysis} #==============PCA analysis #log, no scale pca = dudi.pca(t(master.cpm), scale=FALSE, scannf=FALSE, nf=3) #pca = dudi.pca(t(Mike.master.cpm), scale=FALSE, scannf=FALSE, nf=3) fviz_screeplot(pca, addlabels = TRUE ) summary(pca) var.pca <- get_pca_var(pca) head(var.pca$contrib,30) #contributions of variables to individual PC. each column sum to 100 #this method seems to be better for presentation purpose than picking from contrib=18 pca.names = c(head(rownames(var.pca$contrib)[order(var.pca$contrib[,1], decreasing = TRUE)],6), head(rownames(var.pca$contrib)[order(var.pca$contrib[,2], decreasing = TRUE)],6),head(rownames(var.pca$contrib)[order(var.pca$contrib[,3], decreasing = TRUE)],6)) pca.names =unique(pca.names) test.names = c(head(rownames(pca$c1)[order(pca$c1$CS1)]),tail(rownames(pca$c1)[order(pca$c1$CS1)]), head(rownames(pca$c1)[order(pca$c1$CS2)]),tail(rownames(pca$c1)[order(pca$c1$CS2)]), head(rownames(pca$c1)[order(pca$c1$CS3)]),tail(rownames(pca$c1)[order(pca$c1$CS3)])) fviz_pca_var(pca, col.var="contrib", geom=c("point","text"), select.var = list(name=test.names), gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE) +coord_fixed(sqrt(pca$eig[2] / pca$eig[1]))#plot features(loadings) fviz_pca_var(pca, col.var="contrib", geom=c("point","text"), select.var = list(name=test.names), gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE, axes = c(1,3)) +coord_fixed(sqrt(pca$eig[3] / pca$eig[1]))#plot features(loadings) fviz_pca_ind(pca, label = "none", pointsize=2.0, alpha.ind = 0.5, col.ind = "contrib") +coord_fixed(sqrt(pca$eig[2] / pca$eig[1])) ``` This part is aimed at filtering data into many data frames, could consider split this part into individual analysis part ##data_filtering================= ```{r data_filtering} scores.temp = data.frame(pca$li[,1:3],Filter_No=filter_list, samplenames=rownames(pca$li)) scores = join(scores.temp, environ, by = "Filter_No") scores = scores[!duplicated(scores),] #remove duplicated rows #sort by dates scores = scores[order(scores$date.mid, scores$date.start),] #making the ordering unique #remove all filters without starting dates, so blanks and filter 140 scores_blank = scores[is.na(scores$date.mid),] #this keeps all blanks scores_blank = scores_blank[!scores_blank$Filter_No == "140",] #scores_complete now refer to all sample filters scores_complete = scores[!is.na(scores$date.start),] scores_complete$season = factor(scores_complete$season, levels=c("spring", "summer", "fall","winter")) if(mole.type == "DNA"){ scores_single_complete = scores_complete[-grep("double|I15_2|I15_3", scores_complete$samplenames, ignore.case=TRUE), ] } else { scores_single_complete = scores_complete } scores_single_complete$campus_ornot = ifelse(scores_single_complete$location %in% c("Campus-home", "Campus", "Mike_office-background", "Mike_home-background", "Mike_background" ), "Campus", "Non-campus") #Mike = scores[grep("Mike", scores$ownership, ignore.case = TRUE),] Mike_single = scores_single_complete[grep("Mike", scores_single_complete$ownership, ignore.case = TRUE),] Patient1 = scores_single_complete[grep("No1_1636-69-063", scores_single_complete$ownership,ignore.case=TRUE),] Patient3 = scores_single_complete[grep("No3_San Mateo", scores_single_complete$ownership, ignore.case=TRUE),] campushome = Mike_single[grep("campus|home|Mike|office|weekend|desk|outdoor", Mike_single$location, ignore.case = TRUE),] noncampushome = Mike_single[-grep("campus|home|Mike|office|weekend|desk|outdoor", Mike_single$location, ignore.case = TRUE),] #non-mike filters others = scores_single_complete[-grep("Mike", scores_single_complete$ownership, ignore.case=TRUE), ] #four people studies fourpeople = others[others$ownership %in% c("Gw", "No3_San Mateo", "zhixin_fremont", "Guan_SF"), ] fourpeople = fourpeople[fourpeople$date.start > as.Date("2016-11-20"), ] mike.fourpeople = Mike_single[Mike_single$date.start > as.Date("2016-11-20") & Mike_single$date.start 0) >= 50,] master.asinh.cpm.single.hundred = master.asinh.cpm.single[rowSums(master.asinh.cpm.single > 0) >= 100,] #saving for DNA-RNA correlations if (mole.type == "DNA"){ master.DNA.asinh.cpm.single = master.asinh.cpm.single master.DNA.cpm.single = master.cpm.single scores.DNA.single = scores_single_complete } if (mole.type == "RNA"){ master.RNA.asinh.cpm.single = master.asinh.cpm.single master.RNA.cpm.single = master.cpm.single scores.RNA.single = scores_single_complete } #new data frame to store Mike's samples taxonomy information in timely order Mike.master.cpm = master.cpm.single[, Mike_single$samplenames] Mike.master.cpm.log = master.cpm.single.log[, Mike_single$samplenames] Mike.master.asinh.cpm = master.asinh.cpm.single[, Mike_single$samplenames] #Mike NAmerica dataset Mike_NAmerica.master.cpm = master.cpm.single[, Mike_NAmerica$samplenames] Mike_NAmerica.master.cpm.log = master.cpm.single.log[, Mike_NAmerica$samplenames] Mike_NAmerica.master.asinh.cpm = master.asinh.cpm.single[, Mike_NAmerica$samplenames] #Mike NAnonbay dataset Mike_NAnonbay.master.cpm = master.cpm.single[, Mike_NAnonbay$samplenames] Mike_NAnonbay.master.cpm.log = master.cpm.single.log[, Mike_NAnonbay$samplenames] Mike_NAnonbay.master.asinh.cpm = master.asinh.cpm.single[, Mike_NAnonbay$samplenames] #Mike CAnonbay dataset Mike_CAnonbay.master.cpm = master.cpm.single[, Mike_CAnonbay$samplenames] Mike_CAnonbay.master.cpm.log = master.cpm.single.log[, Mike_CAnonbay$samplenames] Mike_CAnonbay.master.asinh.cpm = master.asinh.cpm.single[, Mike_CAnonbay$samplenames] #new data frame to store Mike's campushome samples taxonomy information in timely order campushome.master.cpm = master.cpm.single[, campushome$samplenames] campushome.master.cpm.log = master.cpm.single.log[, campushome$samplenames] campushome.master.asinh.cpm = master.asinh.cpm.single[, campushome$samplenames] #new data frame to store Mike's non-campushome data noncampushome.master.cpm = master.cpm.single[, noncampushome$samplenames] noncampushome.master.cpm.log = master.cpm.single.log[, noncampushome$samplenames] noncampushome.master.asinh.cpm = master.asinh.cpm.single[, noncampushome$samplenames] #patient1 patient1.master.cpm = master.cpm.single[, Patient1$samplenames] patient1.master.cpm.log = master.cpm.single.log[, Patient1$samplenames] patient1.master.asinh.cpm = master.asinh.cpm.single[, Patient1$samplenames] #patient3 patient3.master.cpm = master.cpm.single[, Patient3$samplenames] patient3.master.cpm.log = master.cpm.single.log[, Patient3$samplenames] patient3.master.asinh.cpm = master.asinh.cpm.single[, Patient3$samplenames] #others others.master.cpm = master.cpm.single[, others$samplenames] others.master.cpm.log = master.cpm.single.log[, others$samplenames] others.master.asinh.cpm = master.asinh.cpm.single[, others$samplenames] #fourpeople studies four.master.cpm = master.cpm.single[, fourpeople$samplenames] four.master.cpm.log = master.cpm.single.log[, fourpeople$samplenames] four.master.asinh.cpm = master.asinh.cpm.single[, fourpeople$samplenames] #fourpeople select fourpeople.select.master.cpm = master.cpm.single[, fourpeople.select$samplenames] fourpeople.select.master.cpm.log = master.cpm.single.log[, fourpeople.select$samplenames] fourpeople.select.master.asinh.cpm = master.asinh.cpm.single[, fourpeople.select$samplenames] #combing data with contextual variables(metadata), #should be noted that while master.cpm list samples as columns, scores_complete list samples as rows!!!! master_combined_complete = data.frame(scores_single_complete, t(master.cpm.single)) P1_combined_complete = data.frame(Mike_single, t(Mike.master.cpm)) #chisquare transformations master.chisquare.cpm.single = master.chisquare.cpm[scores_single_complete$samplenames,] patient1.chisquare.cpm = master.chisquare.cpm[Patient1$samplenames,] Mike.chisquare.cpm = master.chisquare.cpm[Mike_single$samplenames,] #reoder season levels, this is intended for ggplot legend ordering of seasons master_combined_complete$season = factor(master_combined_complete$season, levels=c("spring", "summer", "fall","winter")) #for estimation of dog. cat, guinea pig abundance sum(master.asinh.cpm.single["Canis",])/sum(master.asinh.cpm.single) *100 sum(master.asinh.cpm.single["Felis",])/sum(master.asinh.cpm.single) *100 sum(master.asinh.cpm.single["Cavia",])/sum(master.asinh.cpm.single) *100 #================================this section is for anosim analysis if(FALSE){ test.ano = rbind(Mike.chisquare.cpm, patient1.chisquare.cpm) test.group = c(scores_single_complete[row.names(Mike_single), "ownership"], scores_single_complete[row.names(Patient1), "ownership"]) test.result = anosim(test.ano, test.group) test.result } nulc.phyla.abundance = rowSums(master.asinh.cpm.single)[-1]/sum(rowSums(master.asinh.cpm.single)[-1]) for (i in 1:20){ print(i) print(sum(head(nulc.phyla.abundance, i)))} ``` #Generating timeline plot========== ```{r timeline plot} #four owners #creating sub-dataframe for Mike and P1 environ$season = factor(environ$season, levels=c("spring", "summer", "fall", "winter")) #P1 environ.mike = subset(environ, ownership=="Mike") environ.p1 = subset(environ, ownership=="No1_1636-69-063") environ.gw = subset(environ, ownership == "Gw") environ.zhixin = subset(environ, ownership == "zhixin_fremont") environ.no3 = subset(environ, ownership == "No3_San Mateo") environ.ting = subset(environ, ownership == "Wang Ting") environ.xin = subset(environ, ownership == "Xin") environ.sf = subset(environ, ownership == "Guan_SF") environ.relevant = subset(environ, ownership %in% c("Mike","No1_1636-69-063","Gw","zhixin_fremont" ,"No3_San Mateo", "Wang Ting","Xin","Guan_SF")) environ.others = subset(environ, !(ownership %in% c("Mike","No1_1636-69-063","Gw","zhixin_fremont" ,"No3_San Mateo", "Wang Ting","Xin","Guan_SF")) & !is.na(ownership)) #multiplot of timeline pointsize = 5 #control size of points color.choice = "season" start.date = as.Date("2014-06-10") #timeline timeline = ggplot(environ.mike, aes(x=date.mid, y=0, color=get(color.choice))) timeline + geom_point(data = environ.mike, shape=20, size=pointsize, alpha=0.5) + annotate("text", x = start.date, y = 0, label = "P1 n=201", color= "#00A087FF", fontface=2) + geom_point(data= environ.p1, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.1, color=get(color.choice))) + annotate("text", x = start.date, y = 0.1, label = paste0("P2 n=", dim(environ.p1)[1]), color= "#3C5488FF", fontface=2) + geom_point(data= environ.no3, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.2, color=get(color.choice))) + annotate("text", x = start.date, y = 0.2, label = paste0("P3 n=", dim(environ.no3)[1]), color ="#E64B35FF",fontface=2) + geom_point(data= environ.zhixin, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.3, color=get(color.choice))) + annotate("text", x = start.date, y = 0.3, label = paste0("P4 n=", dim(environ.zhixin)[1])) + geom_point(data= environ.gw, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.4, color=get(color.choice))) + annotate("text", x = start.date, y = 0.4, label = paste0("P5 n=", dim(environ.gw)[1])) + geom_point(data= environ.sf, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.5, color=get(color.choice))) + annotate("text", x = start.date, y = 0.5, label = paste0("P6 n=", dim(environ.sf)[1])) + geom_point(data= environ.ting, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.6, color=get(color.choice))) + annotate("text", x = start.date, y = 0.6, label = paste0("P7 n=", dim(environ.ting)[1])) + geom_point(data= environ.xin, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.7, color=get(color.choice))) + annotate("text", x = start.date, y = 0.7, label = paste0("P8 n=", dim(environ.xin)[1])) + geom_point(data= environ.others, shape=20,size=pointsize, alpha=0.5, aes(x=date.mid, y = 0.8, color=get(color.choice))) + annotate("text", x = as.Date("2014-06-25"), y = 0.8, label = paste0("Others n=", dim(environ.others)[1])) + annotate("text", x = as.Date("2014-06-20"), y = -0.1, label = paste0("Total N=", nrow(environ.others)+nrow(environ.relevant)))+ scale_x_date(limits = c(as.Date("2014-05-20"), as.Date("2017-01-01")), labels=date_format("%Y-%m"), date_breaks = "3 months") + labs(color="Season", x="Date", y = "Individual")+ ylim(c(-0.2,0.9))+ scale_color_manual(values=c("spring" = "#00A087FF", "summer" = "#4DBBD5FF", "fall" = "#E64B35FF", "winter" = "#3C5488FF")) + theme_pubr(base_size = 16, x.text.angle = 30, margin = FALSE) + theme(axis.text.y=element_blank(), axis.ticks.y=element_blank(), axis.text.x = element_text(size = 12)) #geom_hline(yintercept=-0.00) + ``` #overall PCA plots, non-normalized ```{r overal PCA plots, non-normalized} #Mike.master.asinh.cpm pca = dudi.pca(t(master.cpm.single), scale=FALSE, scannf=FALSE, nf=3) fviz_screeplot(pca, addlabels = TRUE ) summary(pca) var.pca <- get_pca_var(pca) head(var.pca$contrib,30) #contributions of variables to individual PC. each column sum to 100 #this method seems to be better for presentation purpose than picking from contrib=18 test.names = c(head(rownames(pca$c1)[order(pca$c1$CS1)]),tail(rownames(pca$c1)[order(pca$c1$CS1)]), head(rownames(pca$c1)[order(pca$c1$CS2)]),tail(rownames(pca$c1)[order(pca$c1$CS2)]), head(rownames(pca$c1)[order(pca$c1$CS3)]),tail(rownames(pca$c1)[order(pca$c1$CS3)])) test.names =unique(test.names) fviz_pca_var(pca, col.var="contrib", geom=c("point","text"), select.var = list(name=test.names), gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE) +coord_fixed(sqrt(pca$eig[2] / pca$eig[1]))#plot features(loadings) fviz_pca_var(pca, col.var="contrib", geom=c("point","text"), select.var = list(name=test.names), gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE, axes = c(1,3)) +coord_fixed(sqrt(pca$eig[3] / pca$eig[1]))#plot features(loadings) fviz_pca_ind(pca, label = "none", pointsize=2.0, alpha.ind = 0.5, col.ind = "contrib") +coord_fixed(sqrt(pca$eig[2] / pca$eig[1])) fviz_pca_ind(pca, label = "none", axes = c(1,3), pointsize=2.0, alpha.ind = 0.5, col.ind = "contrib") +coord_fixed(sqrt(pca$eig[3] / pca$eig[1])) bi.plot = fviz_pca_biplot(pca, # Individuals geom.ind = c("point"), geom.var = c("arrow", "text"), #fill.ind = ok$geo2, #col.ind = fourpeople.select$ownership, select.var= list(name = test.names), pointshape = 21, pointsize = 5, palette = "npg", #addEllipses = TRUE, # Variables alpha.var =0.3 , alpha.ind = 0.7, repel = TRUE, col.var = "contrib", gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"), label.size = 5, #addEllipses = TRUE, #ellipse.type = "euclid", #ellipse.level = 0.60, invisible = "quali", #mean.point = FALSE )+ labs(fill = "Geo", color = "Contrib", alpha = "Contrib") + coord_fixed(sqrt(pca$eig[2] / pca$eig[1])) + theme_pubr(base_size = 14) print(bi.plot) ``` ##overall DNA-RNA correlation ```{r overall DNA-RNA correlation} if (mole.type == "DNA"){ master.DNA.asinh.cpm.single = master.asinh.cpm.single master.DNA.cpm.single = master.cpm.single scores.DNA.single = scores_single_complete master.DNA.asinh.cpm.single.copy = master.asinh.cpm.single } if (mole.type == "RNA"){ master.RNA.asinh.cpm.single = master.asinh.cpm.single master.RNA.cpm.single = master.cpm.single scores.RNA.single = scores_single_complete master.RNA.asinh.cpm.single.copy = master.asinh.cpm.single } #save copies for furth uses #resetting master.DNA.asinh.cpm.single = master.DNA.asinh.cpm.single.copy master.RNA.asinh.cpm.single = master.RNA.asinh.cpm.single.copy if ( taxonlevel == "PHYLUM"){ master.DNA.asinh.cpm.select = master.DNA.asinh.cpm.single[2:21,] master.RNA.asinh.cpm.select = master.RNA.asinh.cpm.single[rownames(master.DNA.asinh.cpm.single)[2:21],] newdnanames = paste0("DNA_",rownames(master.DNA.asinh.cpm.select)) newrnanames = paste0("RNA_", rownames(master.RNA.asinh.cpm.select)) rownames(master.DNA.asinh.cpm.select) = newdnanames rownames(master.RNA.asinh.cpm.select) = newrnanames overallres = rcorr(t(master.DNA.asinh.cpm.select[, ]), t(master.RNA.asinh.cpm.select[, ]), type="spearman") m = corrplot::corrplot(overallres$r[1:20,21:40], p.mat = overallres$P[1:20,21:40], sig.level = 0.05, insig = "blank", tl.cex=0.8, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1)) m = corrplot::corrplot(overallres$r, p.mat = overallres$P, sig.level = 0.05, insig = "blank", tl.cex=0.6, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1)) } if (taxonlevel == "SUPERKINGDOM"){ master.DNA.asinh.cpm.single = master.DNA.asinh.cpm.single[order(rownames(master.DNA.asinh.cpm.single)), ] master.RNA.asinh.cpm.single = master.RNA.asinh.cpm.single[order(rownames(master.RNA.asinh.cpm.single)), ] master.DNA.asinh.cpm.single = master.DNA.asinh.cpm.single[-5,] master.RNA.asinh.cpm.single = master.RNA.asinh.cpm.single[-5,] newdnanames = paste0("DNA_",rownames(master.DNA.asinh.cpm.single)) newrnanames = paste0("RNA_", rownames(master.RNA.asinh.cpm.single)) rownames(master.DNA.asinh.cpm.single) = newdnanames rownames(master.RNA.asinh.cpm.single) = newrnanames overallres = rcorr(t(master.DNA.asinh.cpm.single[, ]), t(master.RNA.asinh.cpm.single[, ]), type="spearman") #overallres = rcorr(t(scale(master.DNA.asinh.cpm.single[, ])), scale(t(master.RNA.asinh.cpm.single[, ])), type="spearman") m = corrplot::corrplot(overallres$r[1:6,7:12], p.mat = overallres$P[1:6,7:12], sig.level = 0.05, insig = "blank", tl.cex=1, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1)) m = corrplot::corrplot(overallres$r, p.mat = overallres$P, sig.level = 0.05, insig = "blank", tl.cex=0.6, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1)) } #different way of visulization test.vector = c(overallres$r[1,8], overallres$r[2,9], overallres$r[3,10], overallres$r[4,11], overallres$r[6, 13], overallres$r[7,14]) p.vector = c(overallres$P[1,8], 1e-20, overallres$P[3,10], overallres$P[4,11], overallres$P[6, 13], overallres$P[7,14]) ggplot(data.frame(name = sort(rownames(master.asinh.cpm.single))[-5], cor = test.vector, p = p.vector), aes(x = name, y=cor, fill = name, size=-log(p, base =10))) + geom_point(shape = 21) + theme_pubr(base_size = 18)+ rremove("x.text") + rremove("xlab") + scale_fill_d3() + labs(fill = "Domain", size = "-log(p)") ``` ##correlation matrix======================correlation matrix + p values for taxon labels ```{r correlation matrix of taxons} #at least 5 samples have more than 100 CPM #for genus keep=seq(1:nrow(master.asinh.cpm.single)) if (taxonlevel == "GENUS" | taxonlevel == "SPECIES" | taxonlevel == "FAMILY" | taxonlevel == "ORDER"){ correlation.df = master.asinh.cpm.single[rowSums(master.asinh.cpm.single >0) >= 283*0.1,] correlation.df = correlation.df[apply(correlation.df, 1, var)>2,] keep = rownames(correlation.df) } else if (taxonlevel == "PHYLUM"){ correlation.df = master.asinh.cpm.single[rowSums(master.asinh.cpm.single >0) > 0,] correlation.df = correlation.df[apply(correlation.df, 1, var)>1,] keep = rownames(correlation.df) } #master.temp = master.cpm.single[keep,] master.temp = master.asinh.cpm.single[keep,] master.temp = master.temp[order(rowSums(master.temp), decreasing = TRUE),] res2 = rcorr(t(master.temp), type="pearson") p.adj.matrix2 = matrixPadjust(res2) #correlations threshold set at 0.7 #res2$r = res2$r[order(rowSums(res2$r>0.7, na.rm=TRUE), decreasing = TRUE), ] #select = rowSums(res2$r>0.7, na.rm=TRUE) > 2 #m = corrplot::corrplot(res2$r[select, select], order = "hclust", p.mat = p.adj.matrix2[select, select], sig.level = 0.05, insig = "blank", tl.cex=0.5, diag = FALSE, addrect = 5, method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1)) #clustered m = corrplot::corrplot(res2$r[keep,keep], order = "hclust", p.mat = res2$P[keep,keep], sig.level = 0.01, insig = "blank", tl.cex=0.6, diag = FALSE, addrect = 8, method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black") #not clustered m = corrplot::corrplot(res2$r[keep,keep], p.mat = res2$P[keep,keep], sig.level = 0.01, insig = "blank", tl.cex=0.5, diag = FALSE, method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black") #pdf(paste0("./pdfs/",taxonlevel,"_correlationtable.pdf")) #orrplot(res2$r, order = "hclust", p.mat = res2$P, sig.level = 0.05, insig = "blank", tl.cex=0.8) #dev.off() ``` ##formal plotting of environmental variables ```{r env plotting} ggplot(scores_single_complete, aes(x=date.month, y=scale(Mean_TemperatureC), color=date.month)) + stat_smooth(aes(group=1, color = "red")) + stat_smooth(aes(y=scale(as.numeric(X_Mean_Humidity)), color = "blue")) + stat_smooth(aes(y=scale(particle), color = "green")) + stat_smooth(aes(y=scale(duration), color = "yellow")) + theme_pubr(base_size = 16) + labs(x = "Month", y = "Scaled Values", color = "Category") + scale_x_discrete(limits=c(1:12), expand=c(0,0)) + scale_color_d3(labels = c("Humidity", "Particle", "Temperature", "Duration")) ``` #PCA plot by seasons for small number of samples ```{r PCA by seasons etc} #define function for plotting pcasnapshot = function(ok, text.size=4, basesize =14){ colfunc<-colorRampPalette(c("red","yellow","springgreen","royalblue")) #print(str(x)) #print(ok$location) # print(ok[, "geo2"]) #print(class(ok)) #this is only for testing ok = scores.snap #text.size = 2 #ok = scores.snap ok = data.frame(ok) master.cpm.snap = master.cpm.single[, ok$samplenames] pcares = dudi.pca(t(asinh(master.cpm.snap)), scale=FALSE,scannf = FALSE, nf = 3) .e <- environment() scree = fviz_screeplot(pcares, addlabels = TRUE ) print(scree) ind.plot.clear = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$location, pointsize=4, pointshape=21, palette="npg", alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",) +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + theme_pubr(margin = FALSE, base_size = basesize) +scale_fill_d3(palette = "category20") + labs(fill = "Location", color = "Location") ind.plot.clear$plot_env = .e print(ind.plot.clear) ind.plot.clear.add = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$location, pointsize=5, pointshape=21, alpha.ind = 0.8, addEllipses =TRUE, ellipse.level = 0.68,invisible = "quali",) +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + theme_pubr(margin = FALSE, base_size = basesize) + scale_fill_d3(palette = "category20")+ scale_color_d3(palette = "category20")+ labs(fill = "Location", color = "Location") ind.plot.clear.add$plot_env = .e print(ind.plot.clear.add) #special plot for paper special.color = c("#4DBBD5FF", "#E64B35FF", "#3C5488FF", "#4DBBD5FF","#3C5488FF", "#3C5488FF", "#E64B35FF", "#4DBBD5FF", "#E64B35FF","#4DBBD5FF", "#3C5488FF", "#3C5488FF", "#E64B35FF", "#4DBBD5FF", "#E64B35FF", "#3C5488FF", "#3C5488FF" ) in.plot.clear.path = c() if(FALSE){ basesize=14 text.size=5 ind.plot.clear.path = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$location, pointsize=5, pointshape=21, alpha.ind = 0.8, addEllipses =TRUE, ellipse.level = 0.75,invisible = "quali", ellipse.alpha=0.05) +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + theme_pubr(margin = FALSE, base_size = basesize) + scale_fill_d3(palette = "category20")+ scale_color_d3(palette = "category20")+ labs(fill = "Location", color = "Location") + geom_segment(aes(xend=c(tail(pcares$li$Axis1, n=-1), NA), yend=c(tail(pcares$li$Axis2, n=-1), NA)), arrow=arrow(angle = 15,length=unit(0.5,"cm"), type="closed"), linetype="longdash", size =1.2 ,alpha=0.7, color = special.color) ind.plot.clear.path$plot_env = .e print(ind.plot.clear.path) } ind.plot = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$geo2, pointsize=4, pointshape=21, palette="npg", alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",) +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + geom_text_repel(aes(label = ok$location), size=text.size) + theme_pubr(margin = FALSE, base_size = basesize) +scale_fill_d3(palette = "category20")+ labs(fill = "Location") ind.plot$plot_env = .e print(ind.plot) ind.plot.2 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$geo2, pointsize=4, pointshape=21, palette="npg", alpha.ind = 0.6, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",) +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + geom_text(aes(label = ok$season), size=text.size) + theme_pubr(margin = FALSE, base_size = basesize) +scale_fill_d3(palette = "category20")+ labs(fill = "Geo") ind.plot.2$plot_env = .e print(ind.plot.2) ind.plot.3 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$geo2, pointsize=4, pointshape=21, palette="npg", alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",) +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + geom_text_repel(aes(label = ok$samplenames), size=text.size) + theme_pubr(margin = FALSE, base_size = basesize) +scale_fill_d3(palette = "category20")+ labs(fill = "Geo") ind.plot.3$plot_env = .e print(ind.plot.3) ind.plot.4 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$location, pointsize=4, pointshape=21, palette="npg", alpha.ind = 0.6, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",) +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + geom_text_repel(aes(label = as.character(ok$date.start)), size=text.size) + theme_pubr(margin = FALSE, base_size = basesize) +scale_fill_d3(palette = "category20")+ labs(fill = "Geo") ind.plot.4$plot_env = .e print(ind.plot.4) ind.plot.5 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$season, pointsize=4, pointshape=21, palette="npg", alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",) +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + geom_text(aes(label = as.character(ok$geo2)), size=text.size) + theme_pubr(margin = FALSE, base_size = basesize) + labs(fill = "Season") ind.plot.5$plot_env = .e print(ind.plot.5) ind.plot.6 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = as.character(ok$batch), pointsize=4, pointshape=21, alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",) +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + geom_text(aes(label = ok$batch), size=text.size) + scale_fill_d3(palette = "category20")+ theme_pubr(margin = FALSE, base_size = basesize) ind.plot.6$plot_env = .e print(ind.plot.6) ind.plot.7 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = as.character(ok$location), pointsize=4, pointshape=21, alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",) +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + scale_fill_d3(palette = "category20")+ theme_pubr(margin = FALSE, base_size = basesize) + labs(fill = "Location") ind.plot.7$plot_env = .e print(ind.plot.7) ind.plot.8 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = as.character(ok$location), pointsize=4, pointshape=21, alpha.ind = 0.8, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",) +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + scale_fill_d3(palette = "category20")+ geom_text_repel(aes(label = ok$location), size=text.size) + theme_pubr(margin = FALSE, base_size = basesize) + labs(fill = "Location") ind.plot.8$plot_env = .e print(ind.plot.8) chem.special = c("Tricholoma", "Trametes", "Serpula","Psychrobacter", "Fomitopsis", "Pseudogymnoascus") names.features = c(head(rownames(pcares$c1)[order(pcares$c1$CS1)],10),tail(rownames(pcares$c1)[order(pcares$c1$CS1)],10), head(rownames(pcares$c1)[order(pcares$c1$CS2)],7),tail(rownames(pcares$c1)[order(pcares$c1$CS2)],7) ) names.features = unique(names.features) gradient.color = c("#3C5488FF","#4DBBD5FF","#E64B35FF") #varplots var = get_pca_var(pcares) #another way of picking names, based on contributions to axes var$contrib = var$contrib[order(var$contrib[,1]^2,var$contrib[,2]^2, var$contrib[,3]^2, decreasing = TRUE),] #names.features = row.names(var$contrib[1:50,]) mm = corrplot::corrplot(var$contrib[1:50,], is.corr=FALSE, cl.align.text = "l") contrib = fviz_contrib(pcares, choice = "var", axes = 1:3, top = 100,ggtheme = theme_pubr(base_size = 8)) var.plot = fviz_pca_var(pcares, geom=c("text", "point"), select.var= list(name=c(names.features)), col.var="contrib", labelsize = text.size+1, #gradient.cols = "npg", gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"), repel = TRUE) + theme_pubr(margin = FALSE, base_size = basesize+4)#plot features(loadings) var.plot$plot_env = .e print(var.plot) names.biplot = c(head(rownames(pcares$c1)[order(pcares$c1$CS1)],4),tail(rownames(pcares$c1)[order(pcares$c1$CS1)],8), head(rownames(pcares$c1)[order(pcares$c1$CS2)],3),tail(rownames(pcares$c1)[order(pcares$c1$CS2)],6) ) #biplot bi.plot = fviz_pca_biplot(pcares, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = ok$geo2, #col.ind = fourpeople.select$ownership, select.var= list(name = c(names.biplot,chem.special)), pointshape = 21, pointsize = 5, palette = "npg", #addEllipses = TRUE, # Variables alpha.var =0.3 , alpha.ind = 0.7, repel = TRUE, col.var = "contrib", gradient.cols = gradient.color, label.size = 5, #addEllipses = TRUE, #ellipse.type = "euclid", #ellipse.level = 0.60, invisible = "quali", #mean.point = FALSE )+ labs(fill = "Geo", color = "Contrib", alpha = "Contrib") + scale_fill_d3(palette = "category20") + coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4) + #geom_text_repel(aes(label = as.character(ok$location)), size=4) + geom_segment(aes(xend=c(tail(pcares$li$Axis1, n=-1), NA), yend=c(tail(pcares$li$Axis2, n=-1), NA), ), arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"), linetype=1, size=1, alpha=0.7, color = "steelblue")+ theme_pubr(margin = FALSE, base_size = basesize) bi.plot$plot_env = .e print(bi.plot) #biplot bi.plot2 = fviz_pca_biplot(pcares, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = ok$geo2, #col.ind = ok$season, select.var= list(name = c(names.biplot,chem.special)), pointshape = 21, pointsize = 5, palette = "npg", #addEllipses = TRUE, # Variables alpha.var =0.2 , alpha.ind = 0.7, repel = TRUE, col.var = "contrib", gradient.cols = gradient.color, label.size = 5, #addEllipses = TRUE, #ellipse.type = "euclid", #ellipse.level = 0.60, invisible = "quali", #mean.point = FALSE )+ labs(fill = "Geo", color = "Contrib", alpha = "Contrib") + scale_fill_d3(palette = "category20") + coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4) + theme_pubr(margin = FALSE, base_size = basesize) bi.plot2$plot_env = .e print(bi.plot2) bi.plot3 = fviz_pca_biplot(pcares, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = ok$location, #col.ind = ok$season, select.var= list(name = c(names.biplot,chem.special)), pointshape = 21, pointsize = 5, palette = "npg", #addEllipses = TRUE, # Variables alpha.var =0.2 , alpha.ind = 0.7, repel = TRUE, col.var = "contrib", gradient.cols = gradient.color, label.size = 5, addEllipses = TRUE, #ellipse.type = "euclid", ellipse.level = 0.8, invisible = "quali", #mean.point = FALSE )+ labs(fill = "Location", color = "Contrib", alpha = "Contrib") + coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + scale_fill_d3(palette = "category20")+ #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4) + theme_pubr(margin = FALSE, base_size = basesize) bi.plot3$plot_env = .e print(bi.plot3) #full spectrum arrows colfunc<-colorRampPalette(c("red","yellow","green","royalblue")) #sublt spectrum #colfunc = colorRampPalette(rainbow_hcl(4)) #simplified arrows #colfunc<-colorRampPalette(c("#E64B35FF","#00A087FF", "#3C5488FF")) bi.plot4 = fviz_pca_biplot(pcares, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = ok$location, #col.ind = fourpeople.select$ownership, select.var= list(name = c(names.biplot,chem.special)), pointshape = 21, pointsize = 5, palette = "npg", #addEllipses = TRUE, # Variables alpha.var =0.2 , alpha.ind = 0.7, repel = TRUE, col.var = "contrib", gradient.cols = gradient.color, label.size = 5, #addEllipses = TRUE, #ellipse.type = "euclid", #ellipse.level = 0.60, invisible = "quali", #mean.point = FALSE )+ labs(fill = "Geo", color = "Contrib", alpha = "Contrib") + coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + scale_fill_d3(palette = "category20")+ #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4) + #geom_text_repel(aes(label = as.character(ok$location)), size=4) + geom_segment(aes(xend=c(tail(pcares$li$Axis1, n=-1), NA), yend=c(tail(pcares$li$Axis2, n=-1), NA), ), arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"), linetype=1, size=1, alpha=0.5, color = colfunc(length(pcares$li$Axis1)))+ theme_pubr(margin = FALSE, base_size = basesize) bi.plot4$plot_env = .e print(bi.plot4) plots = list(scree=scree, ind.plot.clear=ind.plot.clear, ind.plot.clear.add = ind.plot.clear.add, ind.plot.clear.path = ind.plot.clear.path, ind.plot=ind.plot, ind.plot.2=ind.plot.2, ind.plot.3=ind.plot.3, ind.plot.4=ind.plot.4, ind.plot.5=ind.plot.5, ind.plot.6=ind.plot.6, ind.plot.7=ind.plot.7, ind.plot.8 = ind.plot.8, var.plot=var.plot, bi.plot=bi.plot, bi.plot2=bi.plot2, bi.plot3=bi.plot3, bi.plot4=bi.plot4, contrib=contrib) l = mget(names(plots)) #ggsave("~/Documents/Bioinfo/DNAformal/arrange2x2.pdf", marrangeGrob(grobs = l, nrow=1, ncol=1), height = 8, width = 8) return(plots) } #this function is not good with too many locations scores.snap = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2016-06-01", "%Y-%m-%d" ) & date.start < as.Date("2016-08-01", "%Y-%m-%d" )) scores.snap = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2016-06-14", "%Y-%m-%d" ) & date.start < as.Date("2016-08-14", "%Y-%m-%d" )) scores.testing.snap = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2016-05-01", "%Y-%m-%d" ) & date.start < as.Date("2016-09-01", "%Y-%m-%d" )) #purely testing scores.snap = subset(scores_single_complete, geo2 %in% c("Asia", "Europe", "Northeast", "oceania", "Southeast", "Southwest")) #== "Mike" & date.start >= as.Date("2015-12-01", "%Y-%m-%d" ) & date.start < as.Date("2016-05-01", "%Y-%m-%d" )) #for mike #2015-07-01 to 2015-08-01 #2016-02-01 to 2016-04-01 #2016-01-01 to 2017-01-01 ! #2016-06-01 to 2016-08-01 ! #2015-12-01 to 2016-05-01 ! plots = pcasnapshot(scores.snap, text.size=5) plots.p1 = pcasnapshot(Patient1, text.size=5) plots.chemi = pcasnapshot(scores_chemi, text.size=5) ``` ##PCA for all samples2, plotting function for more samples ```{r PCA for all samples2} pcaall = function(ok, text.size=5, base.size=14){ #ok = scores.snap #text.size = 5 #ok = scores_single_complete ok = data.frame(ok) master.cpm.snap = master.cpm.single[, ok$samplenames] pcares = dudi.pca(t(asinh(master.cpm.snap)), scale=FALSE,scannf = FALSE, nf = 3) .e <- environment() scree = fviz_screeplot(pcares, addlabels = TRUE ) print(scree) ind.plot.geo = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$geo2, pointsize=3, pointshape=21, palette="lancet", alpha.ind = 0.6, addEllipses =FALSE, ellipse.level = 0.68 ,invisible = "quali") +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + scale_fill_d3(palette = c("category20")) + theme_pubr(margin = FALSE, base_size = base.size+2)+ labs(fill = "Geo", color = "Geo") ind.plot.geo$plot_env = .e print(ind.plot.geo) #this part has bug when there is only one geographic location ind.plot.geo.add = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$geo2, pointsize=3, pointshape=21, palette="lancet", alpha.ind = 0.6, addEllipses =FALSE, ellipse.level = 0.68 ,invisible = "quali") +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + scale_color_d3(palette = c("category20")) + scale_fill_d3(palette = c("category20")) + theme_pubr(margin = FALSE, base_size = base.size+2) + labs(fill = "Geo", color = "Geo") ind.plot.geo.add$plot_env = .e print(ind.plot.geo.add) ind.plot.season = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$season, pointsize=3, pointshape=21, palette=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), alpha.ind = 0.6, addEllipses = FALSE, ellipse.level = 0.68,invisible = "quali") +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + theme_pubr(margin = FALSE, base_size = base.size+2) + scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))+ labs(fill = "Season", color = "Season") ind.plot.season$plot_env = .e print(ind.plot.season) ind.plot.season.add = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = ok$season, pointsize=3, pointshape=21, palette=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), alpha.ind = 0.6, addEllipses = TRUE, ellipse.level = 0.68,invisible = "quali") +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + theme_pubr(margin = FALSE, base_size = base.size) + scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) + labs(fill = "Season", color = "Season") ind.plot.season.add$plot_env = .e print(ind.plot.season.add) ind.plot.2 = fviz_pca_ind(pcares, axes = c(1,2), label="none", fill.ind = as.character(ok$batch), pointsize=3, pointshape=21, alpha.ind = 0.6, addEllipses =FALSE, ellipse.level = 0.68,invisible = "quali",) +coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + geom_text(aes(label = ok$batch), size=text.size) + scale_fill_d3(palette = "category20")+ theme_pubr(margin = FALSE, base_size = base.size+2) + labs(fill = "Batch", color = "Batch") ind.plot.2$plot_env = .e print(ind.plot.2) chem.special = c("Tricholoma", "Trametes", "Serpula","Psychrobacter", "Fomitopsis", "Pseudogymnoascus") names.features = c(head(rownames(pcares$c1)[order(pcares$c1$CS1)],20),tail(rownames(pcares$c1)[order(pcares$c1$CS1)],20), head(rownames(pcares$c1)[order(pcares$c1$CS2)],7),tail(rownames(pcares$c1)[order(pcares$c1$CS2)],7) ) names.features = unique(names.features) gradient.color = c("#3C5488FF","#4DBBD5FF","#E64B35FF") #varplots var = get_pca_var(pcares) #another way of picking names, based on contributions to axes var$contrib = var$contrib[order(var$contrib[,1]^2,var$contrib[,2]^2, var$contrib[,3]^2, decreasing = TRUE),] #names.features = row.names(var$contrib[1:50,]) mm = corrplot::corrplot(var$contrib[1:50,], is.corr=FALSE, cl.align.text = "l") contrib = fviz_contrib(pcares, choice = "var", axes = 1:3, top = 100,ggtheme = theme_pubr(base_size = 6)) print(contrib) var.plot = fviz_pca_var(pcares, geom=c("text","arrow"), select.var= list(name=c(names.features)), col.var="contrib", labelsize = text.size, #gradient.cols = "npg", gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"), repel = TRUE) + theme_pubr(margin = FALSE, base_size = base.size-4)#plot features(loadings) var.plot$plot_env = .e print(var.plot) var.plot2 = fviz_pca_var(pcares, geom=c("text"), select.var= list(name=c(names.features)), col.var="contrib", labelsize = text.size, #gradient.cols = "npg", gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"), repel = TRUE) + theme_pubr(margin = FALSE, base_size = base.size-4)#plot features(loadings) var.plot2$plot_env = .e print(var.plot2) names.biplot = c(head(rownames(pcares$c1)[order(pcares$c1$CS1)],4),tail(rownames(pcares$c1)[order(pcares$c1$CS1)],8), head(rownames(pcares$c1)[order(pcares$c1$CS2)],3),tail(rownames(pcares$c1)[order(pcares$c1$CS2)],6) ) #biplot bi.plot = fviz_pca_biplot(pcares, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = ok$geo2, #col.ind = fourpeople.select$ownership, select.var= list(name = c(names.biplot,chem.special)), pointshape = 21, pointsize = 5, palette = "npg", #addEllipses = TRUE, # Variables alpha.var =0.3 , alpha.ind = 0.7, repel = TRUE, col.var = "contrib", gradient.cols = gradient.color, label.size = 5, #addEllipses = TRUE, #ellipse.type = "euclid", #ellipse.level = 0.60, invisible = "quali", #mean.point = FALSE )+ scale_fill_d3(palette = c("category20")) + labs(fill = "Geo", color = "Contrib", alpha = "Contrib") + coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + theme_pubr(base_size = base.size-2) bi.plot$plot_env = .e print(bi.plot) #biplot bi.plot2 = fviz_pca_biplot(pcares, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = ok$season, #col.ind = ok$season, select.var= list(name = c(names.biplot,chem.special)), pointshape = 21, pointsize = 5, palette = c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), #addEllipses = TRUE, # Variables alpha.var =0.3 , alpha.ind = 0.7, repel = TRUE, col.var = "contrib", gradient.cols = gradient.color, label.size = 5, #addEllipses = TRUE, #ellipse.type = "euclid", #ellipse.level = 0.60, invisible = "quali", #mean.point = FALSE )+ labs(fill = "Season", color = "Contrib", alpha = "Contrib") + coord_fixed(sqrt(pcares$eig[2] / pcares$eig[1])) + scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) + #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4) + theme_pubr(base_size = base.size) bi.plot2$plot_env = .e print(bi.plot2) plots = list(scree=scree, ind.plot.geo=ind.plot.geo, ind.plot.geo.add=ind.plot.geo.add, ind.plot.season = ind.plot.season, ind.plot.season.add = ind.plot.season.add, ind.plot.2=ind.plot.2, var.plot=var.plot, var.plot2=var.plot2, contrib = contrib, bi.plot=bi.plot, bi.plot2=bi.plot2) l = mget(names(plots)) #ggsave("~/Documents/Bioinfo/DNAformal/arrange2x2.pdf", marrangeGrob(grobs = l, nrow=1, ncol=1), height = 8, width = 8) return(plots) } plots.all = pcaall(scores_single_complete, text.size=5, base.size = 14) plots.mike = pcaall(Mike_single, text.size=4, base.size = 14) #disable 2nd plot add.ellipse plots.campushome = pcaall(campushome, text.size=4, base.size =14) plots.noncampushome = pcaall(noncampushome, text.size=4, base.size =14) #special interest P2.select = subset(Patient1, location == "P2_work_home" | location == "Campus" | location == "Monterey-CA") plots.p1 = pcaall(P2.select, text.size=5, base.size =14) plots.p3 = pcaall(Patient3, text.size=5, base.size=14) plots.scores.snap = pcaall(scores.snap, text.size=5, base.size =14) plots.chemi2 = pcaall(scores_chemi, text.size=5, base.size=14) ``` ##near - not near location profile comparison ```{r near-not near} distsnapshot = function(profile, text.size=4){ profile = data.frame(profile) master.cpm.snap = master.cpm.single[, profile$samplenames] bray.dist = vegdist(t(asinh(master.cpm.snap)), na.rm = TRUE, method= "bray") bray.dist.matrix = as.matrix(bray.dist) rownames(bray.dist.matrix) = profile$samplenames colnames(bray.dist.matrix) = profile$samplenames similar.dist = c() dissimilar.dist = c() #this is only for mike simterms = c("Campus-home", "Campus", "Hiking_weekend","Hiking _weekend","Mike_office-background", "Mike_home-background", "Mike_background" ) for (i in 1:(nrow(profile)-1)){ if (profile$location[i] %in% simterms & profile$location[i+1] %in% simterms | profile$location[i] == profile$location[i+1]){ #if (profile$location[i] == profile$location[i+1]){ similar.dist = c(similar.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[i+1]]) } else if (profile$location[i] != profile$location[i+1]){ dissimilar.dist = c(dissimilar.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[i+1]]) } else { print(paste0(profile$location[i]," ", profile$location[i+1]))} } same.season.dist = c() different.season.dist = c() control = 0 for (i in 1:(nrow(profile)-1)){ for (j in (i+1):nrow(profile)){ control = control + 1 if (profile$season[i] == profile$season[j] ){ same.season.dist = c(same.season.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } else if (profile$season[i] != profile$season[j] & abs(profile$date.start[i] - profile$date.start[j]) >= 14){ different.season.dist = c(different.season.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } } } print(control) return(list(similar.dist = similar.dist, dissimilar.dist = dissimilar.dist, location.wilcox=wilcox.test(similar.dist, dissimilar.dist), same.season.dist = same.season.dist, different.season.dist = different.season.dist, season.wilcox = wilcox.test(same.season.dist, different.season.dist))) } scores.snap = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2017-06-01", "%Y-%m-%d" )) dist.result = distsnapshot(profile = scores.snap) #less time to get better results for seasonal? #lets do it year by year scores.snap.one = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2014-08-01", "%Y-%m-%d" ) & date.start < as.Date("2015-08-01", "%Y-%m-%d" )) dist.result = distsnapshot(profile = scores.snap.one) scores.snap.one = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2015-08-01", "%Y-%m-%d" ) & date.start < as.Date("2016-08-01", "%Y-%m-%d" )) dist.result = distsnapshot(profile = scores.snap.one) #Patient 1 scores.snap.one = subset(scores_single_complete, ownership == "No1_1636-69-063") dist.result = distsnapshot(profile = scores.snap.one) dist.location.df = data.frame(distance = as.numeric(c(dist.result$similar.dist, dist.result$dissimilar.dist)), group = c(rep("Same location", length(dist.result$similar.dist)), rep("Different locations", length(dist.result$dissimilar.dist)))) #this is statistically significant, but not very cool to look at dist.season.df = data.frame(distance = as.numeric(c(dist.result$same.season.dist, dist.result$different.season.dist)), group = c(rep("Same season", length(dist.result$same.season.dist)), rep("Different season", length(dist.result$different.season.dist)))) ggdensity(dist.location.df, x = "distance", add = "median", #add.params=list(color="red"), rug = TRUE, color = "group", fill = "group", palette = c("#00AFBB", "#E7B800"), ) + labs(x="Bray-Curtis Distance") + theme_pubr(base_size = 16) ggsave("~/Documents/Bioinfo/DNAformal/plots/location.pdf", width=6, height=4) ggdensity(dist.season.df, x = "distance", add = "median", #add.params=list(color="red"), rug = TRUE, color = "group", fill = "group", palette = c("#00AFBB", "#E7B800"), ) + labs(x="Bray-Curtis Distance") + theme_pubr(base_size = 16) ggsave("~/Documents/Bioinfo/DNAformal/plots/season.pdf", width=6, height=4) #ggplot(data.frame(distance = as.numeric(c(dist.result$similar.dist, dist.result$dissimilar.dist)), group = c(rep("Nearby locations", length(dist.result$similar.dist)), rep("Distant locations", length(dist.result$dissimilar.dist)))), aes(x=distance, color=group, fill=group)) + geom_density(alpha=0.4) + scale_color_npg()+ theme_pubr() ``` ##campushome vs noncampushome ```{r campushome vs non-campushome} campushomesnapshot = function(profile, text.size=5, pick=1){ #profile = scores.snap.mike profile = data.frame(profile) master.cpm.snap = master.cpm.single[, profile$samplenames] bray.dist = vegdist(t(asinh(master.cpm.snap)), na.rm = TRUE, method= "bray") bray.dist.matrix = as.matrix(bray.dist) rownames(bray.dist.matrix) = profile$samplenames colnames(bray.dist.matrix) = profile$samplenames campushome.dist = c() dissimilar.dist = c() noncampushome.dist = c() #this is only for mike simterms = c("Campus-home", "Campus", "Hiking_weekend","Hiking _weekend","Mike_office-background", "Mike_home-background", "Mike_background" ) for (i in 1:(nrow(profile)-1)){ for (j in (i+1):nrow(profile)){ if ((profile$location[i] %in% simterms & profile$location[j] %in% simterms) | profile$location[i] == profile$location[j]){ #if (profile$location[i] == profile$location[i+1]){ campushome.dist = c(campushome.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } else if (profile$location[i] != profile$location[j] & (profile$location[i] %in% simterms | profile$location[j] %in% simterms)){ dissimilar.dist = c(dissimilar.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } else {} if (!(profile$location[i] %in% simterms) & !(profile$location[j] %in% simterms) & profile$location[i] != profile$location[j]){ noncampushome.dist = c(noncampushome.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } } } seasons = c("spring","summer", "fall", "winter") seasonpick = seasons[pick] same.season.dist = c() different.season.dist = c() control = 0 for (i in 1:(nrow(profile)-1)){ for (j in (i+1):nrow(profile)){ control = control + 1 #skipping fall because it sucks if (profile$season[i] == profile$season[j] & profile$season[i] == seasonpick ){ same.season.dist = c(same.season.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } else if (profile$season[i] != profile$season[j] & abs(profile$date.start[i] - profile$date.start[j]) >= 0 & (profile$season[i] == seasonpick | profile$season[j] == seasonpick)){ #print(paste0(profile$season[i],"_", profile$season[j])) different.season.dist = c(different.season.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } } } same.season.all.dist = c() different.season.all.dist = c() for (i in 1:(nrow(profile)-1)){ for (j in (i+1):nrow(profile)){ control = control + 1 #skipping fall because it sucks if (profile$season[i] == profile$season[j]){ same.season.all.dist = c(same.season.all.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } else if (profile$season[i] != profile$season[j] & abs(profile$date.start[i] - profile$date.start[j]) >= 14 ){ #print(paste0(profile$season[i],"_", profile$season[j])) different.season.all.dist = c(different.season.all.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } } } #print(control) return(list(campushome.dist = campushome.dist, noncampushome.dist = noncampushome.dist, dissimilar.dist = dissimilar.dist, location.wilcox=wilcox.test(campushome.dist, dissimilar.dist), location2.wilcox = wilcox.test(campushome.dist, noncampushome.dist), location3.wilcox = wilcox.test(noncampushome.dist, dissimilar.dist), same.season.dist = same.season.dist, different.season.dist = different.season.dist, same.season.all.dist = same.season.all.dist, different.season.all.dist = different.season.all.dist, season.wilcox = wilcox.test(same.season.dist, different.season.dist), season.all.wilcox = wilcox.test(same.season.all.dist, different.season.all.dist) )) } #removes season functions, for smallter time-frames campushomesimp = function(profile, text.size=5, pick=1){ #profile = scores.snap.mike profile = data.frame(profile) master.cpm.snap = master.cpm.single[, profile$samplenames] bray.dist = vegdist(t(asinh(master.cpm.snap)), na.rm = TRUE, method= "bray") bray.dist.matrix = as.matrix(bray.dist) rownames(bray.dist.matrix) = profile$samplenames colnames(bray.dist.matrix) = profile$samplenames campushome.dist = c() dissimilar.dist = c() noncampushome.dist = c() #this is only for mike simterms = c("Campus-home", "Campus", "Hiking_weekend","Hiking _weekend","Mike_office-background", "Mike_home-background", "Mike_background" ) for (i in 1:(nrow(profile)-1)){ for (j in (i+1):nrow(profile)){ if ((profile$location[i] %in% simterms & profile$location[j] %in% simterms) | profile$location[i] == profile$location[j]){ #if (profile$location[i] == profile$location[i+1]){ campushome.dist = c(campushome.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } else if (profile$location[i] != profile$location[j] & (profile$location[i] %in% simterms | profile$location[j] %in% simterms)){ dissimilar.dist = c(dissimilar.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } else {} if (!(profile$location[i] %in% simterms) & !(profile$location[j] %in% simterms) & profile$location[i] != profile$location[j]){ noncampushome.dist = c(noncampushome.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } } } return(list(campushome.dist = campushome.dist, noncampushome.dist = noncampushome.dist, dissimilar.dist = dissimilar.dist, location.wilcox=wilcox.test(campushome.dist, dissimilar.dist), location2.wilcox = wilcox.test(campushome.dist, noncampushome.dist), location3.wilcox = wilcox.test(noncampushome.dist, dissimilar.dist) )) } #all time scores.snap.mike = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2017-06-01", "%Y-%m-%d" )) dist.result = campushomesnapshot(profile = scores.snap.mike, pick = 3) #less time to get better results for seasonal? #lets do it year by year, it seems 2nd year's data is better scores.snap.one = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2014-08-01", "%Y-%m-%d" ) & date.start < as.Date("2015-08-01", "%Y-%m-%d" )) dist.result = campushomesnapshot(profile = scores.snap.one) scores.snap.one = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2015-08-01", "%Y-%m-%d" ) & date.start < as.Date("2016-08-01", "%Y-%m-%d" )) dist.result = campushomesnapshot(profile = scores.snap.one, pick=4) #### #cherry picking time.frame scores.snap.one = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2016-06-01", "%Y-%m-%d" ) & date.start < as.Date("2016-08-01", "%Y-%m-%d" )) dist.result = campushomesimp(profile = scores.snap.one) #Patient 1 scores.snap.one = subset(scores_single_complete, ownership == "No1_1636-69-063") dist.result = campushomesnapshot(profile = scores.snap.one[,], pick=4) #purely testing, only campushome, trying to show seasonal effects scores.snap.campushome = subset(campushome, ownership == "Mike" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2017-02-01", "%Y-%m-%d" )) #distances based on four seasons fourseasons = function(profile){ #profile = scores.snap.campushome profile = scores_chemi seasons = factor(unique(profile$season), levels = c("spring", "summer", "fall", "winter")) dist.objects = list() if ("spring" %in% profile$season){ dist.result.spring = campushomesnapshot(profile = profile,pick=1) dist.objects[["spring"]] = dist.result.spring } if ("summer" %in% profile$season){ dist.result.summer = campushomesnapshot(profile = profile,pick=2) dist.objects[["summer"]] = dist.result.summer } if ("fall" %in% profile$season){ dist.result.fall = campushomesnapshot(profile = profile,pick=3) dist.objects[["fall"]] = dist.result.fall } if ("winter" %in% profile$season){ dist.result.winter = campushomesnapshot(profile = profile,pick=4) dist.objects[["winter"]] = dist.result.winter } #dist.objects = list(spring = dist.result.spring, summer = dist.result.summer, fall = dist.result.fall, winter = dist.result.winter) #dist.objects = list(spring = dist.result.spring, fall = dist.result.fall, winter = dist.result.winter) #for Patient1 #seasons = c("spring", "fall", "winter") #for Patient1 distance = c() group = c() season = c() dist.season.df = data.frame(distance = as.numeric(), group = as.character(), season = as.character()) for ( i in 1:length(seasons)){ #print(length(object$same.season.dist)) #print(length(object$different.season.dist)) print(seasons[i]) print(dist.objects[[seasons[i]]]$season.wilcox) dist.season.df = rbind.data.frame(dist.season.df, data.frame(distance = as.numeric(c(dist.objects[[get(seasons[i])]]$same.season.dist, dist.objects[[get(seasons[i])]]$different.season.dist)), group = c(rep("Same season", length(dist.objects[[get(seasons[i])]]$same.season.dist)), rep("Different season", length(dist.objects[[get(seasons[i])]]$different.season.dist))), season = rep(seasons[i], length(dist.objects[[get(seasons[i])]]$same.season.dist)+length(dist.objects[[get(seasons[i])]]$different.season.dist)))) } dist.season.df$group = factor(dist.season.df$group, levels = c("Same season", "Different season")) dist.season.df$season = factor(dist.season.df$season, levels = c("spring","summer", "fall", "winter")) return(dist.season.df) } dist.fourseasons.df = fourseasons(scores.snap.campushome) dist.fourseasons.df = fourseasons(Patient1[-30,]) dist.fourseasons.df = fourseasons(scores.snap.mike) dist.fourseasons.df = fourseasons(scores_single_complete) dist.fourseasons.df = fourseasons(scores_chemi) ggplot(dist.fourseasons.df, aes(x=season, y=distance)) + geom_boxplot(aes(fill=group), position=position_dodge(0.6), width=0.5) + #geom_beeswarm(aes(group=group), dodge.width=0.5, size=0.1, alpha=0.5)+ scale_fill_manual(values=c("#00AFBB", "#E7B800", "#3C5488FF")) + #scale_color_manual(values=c("#00AFBB", "#E7B800", "#3C5488FF")) + labs(y="Bray-Curtis distance") + theme_pubr(base_size = 16, legend = "bottom") dist.location.df = data.frame(distance = as.numeric(c(dist.result$campushome.dist, dist.result$dissimilar.dist)), group = c(rep("Within Campus", length(dist.result$campushome.dist)), rep("Campus vs Others", length(dist.result$dissimilar.dist)))) dist.location.df = data.frame(distance = as.numeric(c(dist.result$campushome.dist, dist.result$dissimilar.dist, dist.result$noncampushome.dist)), group = c(rep("Within Campus", length(dist.result$campushome.dist)), rep("Campus vs others", length(dist.result$dissimilar.dist)), rep("Within others", length(dist.result$noncampushome.dist)))) #this is statistically significant, but not very cool to look at as density plot, boxplot + beeswarm is probably a better choice dist.season.df = data.frame(distance = as.numeric(c(dist.result$same.season.dist, dist.result$different.season.dist)), group = c(rep("Same season", length(dist.result$same.season.dist)), rep("Different season", length(dist.result$different.season.dist)))) dist.season.df$group = factor(dist.season.df$group, levels=c("Same season", "Different season")) ggdensity(dist.location.df, x = "distance", add = "median", #add.params=list(color="red"), rug = TRUE, color = "group", fill = "group", palette = c("#00AFBB", "#E7B800", "#3C5488FF"), ) + labs(x="Bray-Curtis Distance") + theme_pubr(base_size = 16) ggsave("~/Documents/Bioinfo/DNAformal/plots/location.pdf", width=7, height=5) ggdensity(dist.season.df, x = "distance", add = "median", #add.params=list(color="red"), rug = TRUE, color = "group", fill = "group", palette = c("#00AFBB", "#E7B800"), ) + labs(x="Bray-Curtis Distance") + theme_pubr(base_size = 16) ggsave("~/Documents/Bioinfo/DNAformal/plots/season.pdf", width=6, height=4) #testing ggplot(dist.season.df, aes(x=group, y=distance, color=group, fill=group)) + geom_boxplot(alpha=0.4) + #geom_beeswarm(alpha=0.3, size=0.02) + theme_pubr() + scale_color_npg() ``` ##P1 per genus campus vs non-campus comparisons ```{r genus campus vs non-campus} master.asinh.cpm.single.fifty = master.asinh.cpm.single[rowSums(master.asinh.cpm.single > 0) >= 50,] #limited to p1 genus.campusornot = cbind.data.frame(t(master.asinh.cpm.single.fifty[, Mike_single$samplenames]), campus_ornot = Mike_single$campus_ornot) genus.campusornot.melt = melt(genus.campusornot, id.vars = "campus_ornot") genus.campusornot.result = compare_means( formula = value ~ campus_ornot, data = genus.campusornot.melt, group.by = "variable", method = "wilcox.test", p.adjust.method = "fdr") genus.campusornot.sig = genus.campusornot[, c(genus.campusornot.result$p.adj < 0.05, TRUE)] genus.campusornot.result.sig = genus.campusornot.result[genus.campusornot.result$p.adj < 0.05,] genus.campusornot.result.sig$variable = as.character(genus.campusornot.result.sig$variable) genus.campusornot.result.sig = genus.campusornot.result.sig[order(genus.campusornot.result.sig$p.adj),] #rank names by inverse order of p.adjust, first one is the smallest genus.select = as.character(genus.campusornot.result.sig$variable) genus.special.interests = c("Ophiognomonia","Cladosporium", "Preussia", "Penicillium", "Aspergillus", "Gymnopus", "Betula", "Triticum", "Canis", "Felis", "Streptococcus", "Corynebacterium", "Staphylococcus", "Geobacillus", "Segetibacter") genus.campusornot.sig.melt = melt(genus.campusornot.sig[, as.character(c(genus.select, "campus_ornot"))], id.vars = "campus_ornot") #pick and plot genus.campusornot.sig.melt = melt(genus.campusornot.sig[,as.character(c(genus.special.interests, "campus_ornot"))], id.vars = "campus_ornot") #formal plots ggplot(genus.campusornot.sig.melt, aes(x = campus_ornot , y = value, color=campus_ornot, fill=campus_ornot)) + geom_boxplot(size=0.6, alpha=0.5)+ geom_quasirandom(size=0.6, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ #geom_beeswarm(size=0.6, alpha=0.3, color = "black", fill="black", cex=2, width=0.1)+ #beeswarm has issues with 0s stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.4, show.legend = FALSE) + theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ scale_color_manual(values = rev(c("#00AFBB", "#E7B800"))) + scale_fill_manual(values = rev(c("#00AFBB", "#E7B800")))+ theme(strip.background = element_blank()) + rremove("x.text") + labs(color = "Location", fill="Location") + facet_wrap(~variable, scales="free_y", ncol=5) #expa chisq.test(x=rbind(c(73/2,73/2),c(57,16))) #fungi phyla distribution test #plotting all p values genus.campusornot.df = genus.campusornot.sig[, as.character(c(genus.select, "campus_ornot"))] mediandiff= apply(genus.campusornot.df[, 1:100], 2, function(x){mean(x[genus.campusornot.df$campus_ornot == "Campus"]) - mean(x[genus.campusornot.df$campus_ornot != "Campus"])}) #mean difference, to know which one is higher genus.campusornot.result.sig$variable = factor(genus.campusornot.result.sig$variable, levels = c(genus.campusornot.result.sig$variable)) ggplot(cbind.data.frame(genus.campusornot.result.sig, mdiff = mediandiff), aes(x = variable, y=p.adj, color=mdiff)) + geom_point() + theme_pubr(base_size=16, x.text.angle = 60) + theme(axis.text.x = element_text(size = 6)) + scale_color_gradient2(low = muted("red"), mid = "white", high = muted("blue"), midpoint = 0, space = "Lab", na.value = "grey50", guide = "colourbar") ``` ##P2 and P3 per genus home vs non-home comparisons ```{r genus home vs non-home} master.asinh.cpm.single.fifty = master.asinh.cpm.single[rowSums(master.asinh.cpm.single > 0) >= 50,] #limited to p2, no significant stuff were detected Patient1.meta = Patient1 Patient1.meta$home = ifelse(Patient1$location == "P2_work_home", "home", "non-home") genus.homeornot = cbind.data.frame(t(master.asinh.cpm.single.fifty[, Patient1.meta$samplenames]), home_ornot = Patient1.meta$home) genus.homeornot.melt = melt(genus.campusornot, id.vars = "home_ornot") genus.homeornot.result = compare_means( formula = value ~ home_ornot, data = genus.homeornot.melt, group.by = "variable", method = "wilcox.test", p.adjust.method = "fdr") genus.homeornot.sig = genus.homeornot[, c(genus.homeornot.result$p.adj < 0.05, TRUE)] genus.homeornot.result.sig = genus.homeornot.result[genus.homeornot.result$p.adj < 0.05,] genus.homeornot.result.sig$variable = as.character(genus.homeornot.result.sig$variable) genus.homeornot.result.sig = genus.homeornot.result.sig[order(genus.homeornot.result.sig$p.adj),] #rank names by inverse order of p.adjust, first one is the smallest genus.select = as.character(genus.homeornot.result.sig$variable) genus.special.interests = c("Ophiognomonia","Cladosporium", "Preussia", "Penicillium", "Aspergillus", "Gymnopus", "Betula", "Triticum", "Canis", "Felis", "Streptococcus", "Corynebacterium", "Staphylococcus", "Geobacillus", "Segetibacter") genus.homeornot.sig.melt = melt(genus.homeornot.sig[, as.character(c(genus.select, "home_ornot"))], id.vars = "home_ornot") #pick and plot genus.homeornot.sig.melt = melt(genus.homeornot.sig[,as.character(c(genus.special.interests, "home_ornot"))], id.vars = "home_ornot") #formal plots ggplot(genus.homeornot.melt, aes(x = home_ornot , y = value, color=home_ornot, fill=home_ornot)) + geom_boxplot(size=0.6, alpha=0.5)+ geom_quasirandom(size=0.6, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ #geom_beeswarm(size=0.6, alpha=0.3, color = "black", fill="black", cex=2, width=0.1)+ #beeswarm has issues with 0s stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.4, show.legend = FALSE) + theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ scale_color_manual(values = rev(c("#00AFBB", "#E7B800"))) + scale_fill_manual(values = rev(c("#00AFBB", "#E7B800")))+ theme(strip.background = element_blank()) + rremove("x.text") + labs(color = "Location", fill="Location") + facet_wrap(~variable, scales="free_y", ncol=5) #expa ggplot(genus.homeornot, aes(x = home_ornot , y = Raoultella, color=home_ornot, fill=home_ornot, label = Patient1.meta$location)) + geom_boxplot(size=0.6, alpha=0.5)+ geom_quasirandom(size=0.6, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ geom_text_repel()+ #geom_beeswarm(size=0.6, alpha=0.3, color = "black", fill="black", cex=2, width=0.1)+ #beeswarm has issues with 0s stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.4, show.legend = FALSE) + theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ scale_color_manual(values = rev(c("#00AFBB", "#E7B800"))) + scale_fill_manual(values = rev(c("#00AFBB", "#E7B800")))+ theme(strip.background = element_blank()) + rremove("x.text") + labs(color = "Location", fill="Location") #plotting all p values genus.campusornot.df = genus.campusornot.sig[, as.character(c(genus.select, "campus_ornot"))] mediandiff= apply(genus.campusornot.df[, 1:100], 2, function(x){mean(x[genus.campusornot.df$campus_ornot == "Campus"]) - mean(x[genus.campusornot.df$campus_ornot != "Campus"])}) #mean difference, to know which one is higher genus.campusornot.result.sig$variable = factor(genus.campusornot.result.sig$variable, levels = c(genus.campusornot.result.sig$variable)) ggplot(cbind.data.frame(genus.campusornot.result.sig, mdiff = mediandiff), aes(x = variable, y=p.adj, color=mdiff)) + geom_point() + theme_pubr(base_size=16, x.text.angle = 60) + theme(axis.text.x = element_text(size = 6)) + scale_color_gradient2(low = muted("red"), mid = "white", high = muted("blue"), midpoint = 0, space = "Lab", na.value = "grey50", guide = "colourbar") ``` #Four people focus======================== ```{r four people PCA plotting} re = dudi.pca(t(four.master.asinh.cpm[-1, fourpeople.select$samplenames]), scale=FALSE, scannf = FALSE, nf=3) fviz_screeplot(re, addlabels = TRUE ) get_eig(re) #eigenvalues #flipping y if necessary re$li$Axis2 = -re$li$Axis2 re$c1$CS2 = -re$c1$CS2 re$co$Comp2 = -re$co$Comp2 re$l1$RS2 = -re$l1$RS2 gravity.df = cbind(re$li, fourpeople.select$ownership) fourpeople.select$ownership = mapvalues(fourpeople.select$ownership, c("Gw" , "No3_San Mateo", "Guan_SF", "Mike"), c("P5", "P3", "P6", "P1")) fourpeople.select$ownership = factor(fourpeople.select$ownership, levels = c("P1", "P3", "P5", "P6")) fviz_pca_ind(re, label="none", fill.ind = fourpeople.select$ownership, pointsize=3, pointshape=21, palette=c("#00A087FF","#E64B35FF","#4DBBD5FF","#3C5488FF"), alpha.ind = 0.8, addEllipses =TRUE, ellipse.level = 0.58) + theme(legend.position = "bottom")+ coord_fixed(sqrt(re$eig[2] / re$eig[1])) + theme_pubr(margin = FALSE, base_size = 16) + geom_text_repel(aes(label = fourpeople.select$batch)) fviz_pca_ind(re, label="none", axes = c(1,3), fill.ind = fourpeople.select$ownership, pointsize=3, pointshape=21, palette=c("#00A087FF","#E64B35FF","#4DBBD5FF","#3C5488FF"), alpha.ind = 0.8, addEllipses =TRUE, ellipse.level = 0.58) + theme(legend.position = "bottom")+ coord_fixed(sqrt(re$eig[2] / re$eig[1])) + theme_pubr(margin = FALSE, base_size = 16) #+ geom_text_repel(aes(label = fourpeople.select$location)) fviz_pca_ind(re, label="none", fill.ind = fourpeople.select$ownership, pointsize=3, pointshape=21, palette=c("#00A087FF","#E64B35FF","#4DBBD5FF","#3C5488FF"), alpha.ind = 0.8, addEllipses =TRUE, ellipse.level = 0.58) + theme(legend.position = "bottom")+ coord_fixed(sqrt(re$eig[2] / re$eig[1])) + theme_pubr(margin = FALSE, base_size = 16) + geom_text_repel(aes(label = fourpeople.select$location), size =4) fviz_pca_ind(re, label="none", fill.ind = fourpeople.select$ownership, pointsize=3, pointshape=21, palette=c("#00A087FF","#E64B35FF","#4DBBD5FF","#3C5488FF"), alpha.ind = 0.8, addEllipses =TRUE, ellipse.level = 0.58) + theme(legend.position = "bottom")+ coord_fixed(sqrt(re$eig[2] / re$eig[1])) + labs(fill = "Person") + theme_pubr(margin = FALSE, base_size = 16) #+ geom_text_repel(aes(label = fourpeople.select$location)) #loadings plot var <- get_pca_var(re) ind = get_pca_ind(re) head(var$contrib,15) #contributions #select top 15 contributions from axis1 and axis2 names = c(head(rownames(var$contrib)[order(var$contrib[,1], decreasing = TRUE)],15), head(rownames(var$contrib)[order(var$contrib[,2], decreasing = TRUE)],7),head(rownames(var$contrib)[order(var$contrib[,3], decreasing = TRUE)],7)) names =unique(names) #equivalent to contribution of variables names = c(head(rownames(re$c1)[order(re$c1$CS1^2, decreasing = TRUE)],10), head(rownames(re$c1)[order(re$c1$CS2^2, decreasing = TRUE)],10), head(rownames(re$c1)[order(re$c1$CS3^2, decreasing = TRUE)],10)) names = unique(names) #this shows the variable plot better than based on other metrics names = c(head(rownames(re$c1)[order(re$c1$CS1)],8),tail(rownames(re$c1)[order(re$c1$CS1)],8), head(rownames(re$c1)[order(re$c1$CS2)],4),tail(rownames(re$c1)[order(re$c1$CS2)],4)) names = unique(names) #select.var = list(contrib=30) will choose the top 30 contributing varibles automatically, it is however unclear how it was calcuated fviz_pca_var(re, select.var= list(name=names), geom=c("point", "text"),col.var="contrib", gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"), repel = TRUE, labelsize = 5) + #coord_fixed(sqrt(re$eig[2] / re$eig[1])) + theme_pubr(base_size = 16, legend = "top") #plot features(loadings) fviz_pca_var(re, select.var= list(name=names), geom=c("point","arrow", "text"), col.var="contrib", gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"), repel = TRUE, axes=c(1,3)) #plot features(loadings) #biplot prep #biplot.names = c(head(rownames(var$contrib)[order(var$contrib[,1], decreasing = TRUE)],10), #head(rownames(var$contrib)[order(var$contrib[,2], decreasing = TRUE)],10)) #biplot.names = unique(biplot.names) #same as above biplot.names = c(head(rownames(re$c1)[order(re$c1$CS1)],5),tail(rownames(re$c1)[order(re$c1$CS1)],5), head(rownames(re$c1)[order(re$c1$CS2)],3),tail(rownames(re$c1)[order(re$c1$CS2)],3), head(rownames(re$c1)[order(re$c1$CS3)],2),tail(rownames(re$c1)[order(re$c1$CS3)],2) ) ###special plot for locations fviz_pca_biplot(re, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = fourpeople.select$location, #col.ind = fourpeople.select$ownership, select.var= list(name = names), pointshape = 21, pointsize = 3, palette = "npg", #addEllipses = TRUE, # Variables alpha.var =0.2 , repel = TRUE, col.var = "contrib", gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"), label.size = 4, addEllipses = TRUE, ellipse.type = "euclid", ellipse.level = 0.60, invisible = "quali", )+ labs(fill = "Locations", color = "Contribution", alpha = "Contribution") + theme(legend.position = "bottom")# Change legend title #+ scale_shape_manual(values=c(18,17,16,15)) #in development fviz_pca_biplot(re, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = fourpeople.select$ownership, #col.ind = fourpeople.select$ownership, select.var= list(name = names), pointshape = 21, pointsize = 3, palette = c("#00A087FF","#E64B35FF","#4DBBD5FF","#3C5488FF"), axes = c(1,3), #addEllipses = TRUE, # Variables alpha.var =0.2 , repel = TRUE, col.var = "contrib", gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"), label.size = 4, #addEllipses = TRUE, ellipse.type = "euclid", ellipse.level = 0.60 #invisible = "quali" )+ labs(fill = "Person", color = "Contribution", alpha = "Contribution") + theme_pubr(base_size = 14, margin = FALSE) # Change legend title ###other plots concerning these individuals, using the habillage arugment would cause issues for gradient colors fviz_pca_biplot(re, geom.ind =c("point"), geom.var=c("point", "text"), select.var= list(name = names), alpha.var ="contrib", col.var = "contrib", fill.ind = fourpeople.select$ownership, repel=TRUE, palette="npg",pointsize=2, pointshape=21, gradient.cols = "RdBu") + theme(legend.position = "bottom") #+ scale_shape_manual(values=c(18,17,16,15)) fviz_pca_biplot(re, label = "var", select.var= list(name = names),alpha.var ="contrib", habillage = fourpeople.select$ownership, repel=TRUE, palette="npg", addEllipses =TRUE, ellipse.level = 0.68) + scale_shape_manual(values=c(18,17,16,15)) + theme(legend.position = "bottom") + theme(legend.position = "bottom") fviz_pca_biplot(re, axes=c(1,3),label = "var", select.var= list(name = names), alpha.var ="contrib", habillage = fourpeople.select$ownership, repel=TRUE, palette="npg", pointsize = 2) + scale_shape_manual(values=c(18,17,16,15)) + theme(legend.position = "bottom") fviz_pca_biplot(re, axes=c(1,3),label = "var", select.var= list(name = names), alpha.var ="contrib", habillage = fourpeople.select$ownership, repel=TRUE, palette="npg", addEllipses =TRUE, ellipse.level = 0.68) + scale_shape_manual(values=c(18,17,16,15)) + theme(legend.position = "bottom") #fviz_pca_ind(re, label="none", pointsize=2.5, palette="npg", alpha.ind = 0.5, ) + scale_shape_manual(values=c(rep(19,5))) fviz_contrib(re, choice = "var", axes=1,top=20) fviz_contrib(re, choice = "var", axes=2, top=20) fviz_contrib(re, choice = "var", axes=3, top=20) #top contributing variables for each PC head(rownames(re$c1)[order(re$c1$CS1^2, decreasing = TRUE)],20) head(rownames(re$c1)[order(re$c1$CS2^2, decreasing = TRUE)],20) head(rownames(re$c1)[order(re$c1$CS3^2, decreasing = TRUE)],20) #this is for four people heatmap or barplot names2 = c(head(rownames(re$c1)[order(re$c1$CS1^2, decreasing = TRUE)],25), head(rownames(re$c1)[order(re$c1$CS2^2, decreasing = TRUE)],10), head(rownames(re$c1)[order(re$c1$CS3^2, decreasing = TRUE)],5)) names = unique(names) ``` ##Four people c-means ```{r fourpeople c-means clustering} #excluding Mike as he is too divergent three.select.master.asinh.cpm = master.asinh.cpm.single[, fourpeople.select$samplenames[fourpeople.select$aownership!="P1"]] threepeople.select = fourpeople.select[fourpeople.select$aownership!="P1",] threepeople_complete = cbind.data.frame(t(three.select.master.asinh.cpm[rowSums(three.select.master.asinh.cpm)>0,]), owner = threepeople.select$aownership) threepeople.owner = aggregate(. ~ owner, threepeople_complete, median) order= threepeople.owner$owner threepeople.owner = threepeople.owner[, -1] threepeople.owner = threepeople.owner[, colSums(threepeople.owner)>0] weight = c(apply(threepeople.owner, 2, var)) threepeople.owner = threepeople.owner[, weight>=0.2] #variance filter threepeople.scale = scale(threepeople.owner) rownames(threepeople.scale) = c(order) wtf = t(threepeople.scale) #p.test threepeople.melt = melt(threepeople_complete, id.vars = "owner") p.result = compare_means( formula = value ~ owner, data = threepeople.melt, group.by = "variable", method = "kruskal.test", p.adjust.method = "fdr") #determining the number of clusters # Elbow method fviz_nbclust(wtf, kmeans, method = "wss") + geom_vline(xintercept = 4, linetype = 2)+ #this is hardcoded labs(subtitle = "Elbow method") # Silhouette method fviz_nbclust(wtf, kmeans, method = "silhouette")+ labs(subtitle = "Silhouette method") # Gap statistic # nboot = 50 to keep the function speedy. # recommended value: nboot= 500 for your analysis. # Use verbose = FALSE to hide computing progression. set.seed(123) fviz_nbclust(wtf, kmeans, nstart = 25, method = "gap_stat", nboot = 100)+ labs(subtitle = "Gap statistic method") cm = cmeans(wtf, center=8, iter.max=200) table(cm$cluster) cm$cluster = factor(cm$cluster, levels=c(1:length(unique(cm$cluster)))) fviz_cluster(list(data = wtf, cluster=cm$cluster), geom = c("text"), ellipse = TRUE, ellipse.alpha = 0.3, #used to be 0.6 if only points are plotted. ellipse.type = "norm", ellipse.level = 0.68, palette = "npg", repel = TRUE) + theme_pubr(base_size = 16) tempp = cbind.data.frame(wtf, cluster=cm$cluster, membership=apply(cm$membership, 1, max), taxon = row.names(wtf)) df.season.scale.group = tempp[tempp$membership>0.7,] table(df.season.scale.group$cluster) length(df.season.scale.group$cluster) df.season.melt = melt(df.season.scale.group, id.vars = c("cluster","membership", "taxon")) df.season.melt$variable = factor(df.season.melt$variable) df.season.melt$cluster = factor(df.season.melt$cluster) #+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) #green, blue, red, dark blue ggplot(data=df.season.melt, aes(x=variable, y=value, group=taxon, color=cluster, alpha=membership)) + #geom_line(stat="smooth", method="loess", alpha=0.7)+ geom_line(size=1.5)+ geom_hline(yintercept = 0, linetype = 2)+ theme_pubr(base_size = 16, x.text.angle = 45) + labs(y="Scaled Values") + guides(alpha=FALSE, size=FALSE) + scale_color_npg()+ facet_wrap(~cluster, scales="free_y", ncol=4) ``` ##Four people distance calculation ```{r fourpeople distance} #reference dataframe fourpeople.check = subset(fourpeople.select, select=c(ownership, samplenames, date.start, date.end, Filter_No)) #evaluating variance level for each owner res2 = rcorr(fourpeople.select.master.asinh.cpm[-1,], type="pearson") m = corrplot::corrplot(res2$r, order = "hclust", hclust.method = "average", p.mat = res2$P, sig.level = 0.005, insig = "blank", tl.cex=0.6, type="lower", method="pie") four.bray.dist = vegdist(t(fourpeople.select.master.asinh.cpm[-1, ]), method="bray") #can use as.matrix to access #in order to change label names for an dist object, one needs to convert it to matrix four.bray.dist.matrix = as.matrix(four.bray.dist) rownames(four.bray.dist.matrix) = fourpeople.check$ownership colnames(four.bray.dist.matrix) = fourpeople.check$ownership four.bray.dist.new = as.dist(four.bray.dist.matrix) fviz_dist(four.bray.dist.new) plot(hclust(four.bray.dist.new, method = "ward.D"), xlab="Samples") library(qgraph) dist_mi <- 1/four.bray.dist.new # one over, as qgraph takes similarity matrices as input #jpeg('example_forcedraw.jpg', width=1000, height=1000, unit='px') qgraph(dist_mi, layout='spring', esize=3, trans=0.5, alpha=0.05) #output tables for gephi nodes.table = data.frame(Id = rownames(four.bray.dist.matrix), Label=fourpeople.check$ownership, Class=fourpeople.check$ownership, Misc=fourpeople.check$Filter_No) write.csv(nodes.table, "./fourpeople.nodes.table.csv", row.names = FALSE, quote=FALSE) #outputing edges table fourpeoplecombos = combinations(rownames(four.bray.dist.matrix)) #matrix2edges function matrix2edges = function(matrix){ edges.table = tibble(Source = character(), Target = character(), Weight = numeric(), Type=character()) #generating all possible combinations given matrix's rownames!!! combos = combinations(rownames(matrix)) for (i in 1:dim(combos)[1]){ edges.table = bind_rows(edges.table, tibble(Source = combos$a[i], Target = combos$b[i], Weight = 1-matrix[combos$a[i], combos$b[i]], Type = "Undirected")) #Weight is similarity score } return(edges.table) } edges.table = matrix2edges(four.bray.dist.matrix) write.csv(edges.table, "./fourpeople.edges.table.csv", row.names= FALSE, quote=FALSE) #calculating distances between samples using first three PCs four.pca.dist = as.matrix(vegdist(re$li, method="euclidean")) euc.dist.vector = c() for (name in unique(fourpeople.select$ownership)){ print(name) temp.names = fourpeople.select$samplenames[fourpeople.select$ownership==name] temp.combos = as.data.frame(combinations(temp.names)) bray.all.dist = 0 euc.pca.dist = 0 #getting distances for (i in 1:dim(temp.combos)[1]){ #print(i) #print(as.matrix(four.bray.dist)[temp.combos[i,"a"], temp.combos[i,"b"]]) bray.all.dist = bray.all.dist + as.matrix(four.bray.dist)[temp.combos[i,"a"], temp.combos[i,"b"]] euc.pca.dist = euc.pca.dist + four.pca.dist[temp.combos[i,"a"], temp.combos[i,"b"]] } print(bray.all.dist/dim(temp.combos)[1]) print(euc.pca.dist/dim(temp.combos)[1]) #euc.dist.vector = c(euc.dist.vector, name=euc.pca.dist/dim(temp.combos)[1]) euc.dist.vector[name] = euc.pca.dist/dim(temp.combos)[1] #print(temp.names) } euc.dist.vector = sort(euc.dist.vector, decreasing = TRUE) activity.dist.vector = c(470, 50, 5, 1) activity.df = data.frame(euc.dist = euc.dist.vector, actual.dist = activity.dist.vector, Ownership=c("P1", "P3", "P5", "P6")) plot(euc.dist.vector, log(activity.dist.vector)) #elegant graphic output my.formula = y ~ x lmfit = lm(euc.dist ~ actual.dist, data=activity.df) ggplot(activity.df, aes(x=actual.dist, y= euc.dist, color=Ownership)) + geom_point(size=4, alpha=0.8) + geom_smooth(aes(group=1),method = "glm", formula = my.formula, show.legend = TRUE) + # labs(x="Activity region", y="Profile dissimilarities")+ stat_poly_eq(formula = my.formula, #eq.with.lhs = "italic(y)~`=`~", aes(group=1,label = paste(..eq.label.., ..rr.label.., sep = "~~~")), parse = TRUE) + scale_color_manual(values=c("#00A087FF","#E64B35FF","#4DBBD5FF","#3C5488FF"))+ labs(x="Average geographic distance among samples", y="Average euclidean distance among samples")+ theme_pubr(base_size = 14) #plot(hclust(four.bray.dist, method = "ward.D2"), labels = fourpeople.select$ownership) #plot(hclust(four.bray.dist, method = "centroid"), labels = fourpeople.select$ownership) ``` ##Four people species names for exposome network, this is the old setup, using new interactions database now ```{r fourpeople species names} #+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) getname = function(profile){ temp.df = master.cpm.single[, profile$samplenames] names = row.names(temp.df)[rowSums(temp.df)>10] print(head(rowSums(temp.df[names,]), 20)) return(names) } mike.names = getname(fourpeople.select[fourpeople.select$ownership == "Mike",]) gw.names = getname(fourpeople.select[fourpeople.select$ownership == "Gw",]) guan.names = getname(fourpeople.select[fourpeople.select$ownership == "Guan_SF",]) no3.names = getname(fourpeople.select[fourpeople.select$ownership == "No3_San Mateo",]) fournames = list(mike.names = mike.names, gw.names=gw.names, guan.names=guan.names, no3.names =no3.names) for (i in 1:length(fournames)){ print(str(fournames[[i]])) towrite = c("Homo sapiens", fournames[[i]]) write.table(towrite, paste0("~/Documents/Bioinfo/species-location-interaction_data/", i, "-names.txt"), quote = FALSE, row.names = FALSE, col.names = FALSE) } ``` ##Four people within individual vs others distance calculations. ```{r fourpeople distance one vs all} fourpeoplespecial = function(profile, text.size=5, pick=1){ #profile = fourpeople.select profile = data.frame(profile) master.cpm.snap = master.cpm.single[, profile$samplenames] bray.dist = vegdist(t(asinh(master.cpm.snap)), na.rm = TRUE, method= "bray") bray.dist.matrix = as.matrix(bray.dist) rownames(bray.dist.matrix) = profile$samplenames colnames(bray.dist.matrix) = profile$samplenames same.dist = c() different.dist = c() #same group vs different group implementation for (i in 1:(nrow(profile)-1)){ for (j in (i+1):nrow(profile)){ if (profile$aownership[i] == profile$aownership[j]){ #if (profile$location[i] == profile$location[i+1]){ same.dist = c(same.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } else if (profile$aownership[i] != profile$aownership[j]){ different.dist = c(different.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } else {} } } #one vs all implementation owners = c(unique(profile$aownership)) ownerpick = owners[pick] print(ownerpick) onegroup.dist = c() others.dist = c() control = 0 for (i in 1:(nrow(profile)-1)){ for (j in (i+1):nrow(profile)){ control = control + 1 if (profile$aownership[i] == profile$aownership[j] & profile$aownership[i] == ownerpick ){ onegroup.dist = c(onegroup.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } else if (profile$aownership[i] != profile$aownership[j] & (profile$aownership[i] == ownerpick | profile$aownership[j] == ownerpick)){ others.dist = c(others.dist, bray.dist.matrix[profile$samplenames[i], profile$samplenames[j]]) } } } #print(control) return(list(same.dist = same.dist, different.dist = different.dist, samedifferent.wilcox=wilcox.test(same.dist, different.dist), onegroup.dist = onegroup.dist, others.dist = others.dist, ownerpick = wilcox.test(onegroup.dist, others.dist) )) } fourpeople.select.nomike = fourpeople.select[fourpeople.select$aownership!="P1",] fourpeople.result = fourpeoplespecial(fourpeople.select, pick=1) fourpeople.result = fourpeoplespecial(fourpeople.select, pick=2) fourpeople.result = fourpeoplespecial(fourpeople.select, pick=3) fourpeople.result = fourpeoplespecial(fourpeople.select, pick=4) fourpeople.result = fourpeoplespecial(fourpeople.select.nomike, pick=1) fourpeople.result = fourpeoplespecial(fourpeople.select.nomike, pick=2) fourpeople.result = fourpeoplespecial(fourpeople.select.nomike, pick=3) dist.fourpeople.df = data.frame(distance = as.numeric(c(fourpeople.result$same.dist, fourpeople.result$different.dist)), group = c(rep("Same individual", length(fourpeople.result$same.dist)), rep("Different individual", length(fourpeople.result$different.dist)))) ggdensity(dist.fourpeople.df, x = "distance", add = "median", #add.params=list(color="red"), rug = TRUE, color = "group", fill = "group", palette = c("#00AFBB", "#E7B800"), ) + labs(x="Bray-Curtis Distance") + theme_pubr(base_size = 16) #ggsave("~/Documents/Bioinfo/DNAformal/plots/location.pdf", width=6, height=4) ``` #tSNE clustering===============================tSNE clustering ```{r tSNE clustering} library(Rtsne) #similar to PCA, the dataframe needs to be transposed. Columns should be traits, rows should be samples #even with set seed, the reproduciblity seems to be an issue here #will need to save the object for tsne for publication set.seed(42) distinct.colors = c("#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26") #testing perplexity for (k in seq(30,50,10)){ set.seed(42123) k=30 tsne = Rtsne(t(master.cpm.single), dims=2, perplexity=k, verbose=TRUE, max_iter =5000, initial_dims=100, theta=0.4) #tsne = Rtsne(t(master.asinh.cpm.single), dims=2, perplexity=k, verbose=TRUE, max_iter =5000, initial_dims=100, theta=0.4) #tsne = Rtsne(master.chisquare.cpm.single, dims=2, perplexity=k, verbose=FALSE, max_iter =5000) test.df = data.frame(tsne$Y) if (FALSE){ if (dim(master.cpm.single)[1] < 50){ test.df = data.frame(test.df, t(master.cpm.single[1:dim(master.cpm.single)[1], ]), scores_single_complete[, 4:length(names(scores_single_complete)) ]) } else { test.df = data.frame(test.df, t(master.cpm.single[1:50,]), scores_single_complete[, 4:length(names(scores_single_complete))]) } } testplot = ggplot(test.df, aes(x = X1,y = X2)) testplot + geom_point(alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="#F38400") print(testplot) # + geom_text(nudge_y=0.2, size=2.5) # ggsave(paste0("./pdfs/",taxonlevel, "-", k , "perplexity plot-all.pdf")) #saveRDS(test.df, file = paste0("./data/DNA-", taxonlevel, "-", k, ".rds")) } #tSNE in 3d if (FALSE){ tsne.3d = Rtsne(master.chisquare.cpm, dims=3, perplexity=k, verbos=TRUE, max_iter =5000) test.df.3d = data.frame(tsne.3d$Y, Bacteria=master.cpm.single["Bacteria",], Fungi=master.cpm.single["Fungi",], Viridiplantae=master.cpm.single["Viridiplantae",], Metazoa=master.cpm.single["Metazoa",], Viruses=master.cpm.single["Viruses",], Archaea = master.cpm.single["Archaea",], scores_single_complete[, 4:length(names(scores_single_complete))]) colnames(test.df.3d)[1:3]=c("x",'y','z') p = plot_ly(test.df.3d, x = ~x, y = ~y, z = ~z, type ="scatter3d", mode ="markers", color = ~Metazoa) p test.df.melt = melt(test.df, id.vars=c("X1", "X2", colnames(test.df)[46:61]), measure.vars = c(colnames(test.df)[3:15])) testplot = ggplot(test.df.melt, aes(x=X1, y=X2, color=value, label=weekend)) testplot + geom_point()+ geom_text_repel()+ facet_wrap(~ variable, ) #for details at superkingdom level if (taxonlevel == "SUPERKINGDOM"){ test.df = data.frame(test.df, Bacteria=master.cpm.single["Bacteria",], Fungi=master.cpm.single["Fungi",], Viridiplantae=master.cpm.single["Viridiplantae",], Metazoa=master.cpm.single["Metazoa",], Viruses=master.cpm.single["Viruses",], Archaea = master.cpm.single["Archaea",], scores_single_complete[, 4:length(names(scores_single_complete))]) testplot = ggplot(test.df, aes(x = X1,y = X2, label=location)) testplot + geom_point(size =4, alpha=0.75) + ggtitle(paste0("DNA-", taxonlevel, "-perplexity=",k)) + theme_gray(base_size=15) testplot = ggplot(test.df, aes(x = X1,y = X2, color=ownership)) testplot + geom_point(size = 4, alpha=0.75) + ggtitle(paste0("DNA-", taxonlevel, "-perplexity=",k)) + theme_gray(base_size=15) testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=Viridiplantae)) testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="dark green") + theme_gray(base_size=15) testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=Bacteria)) testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="red") + theme_gray(base_size=15) testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=Fungi)) testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="blue") + theme_gray(base_size=15) testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=Metazoa)) testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="purple") + theme_gray(base_size=15) testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=Viruses)) testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="brown") + theme_gray(base_size=15) testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=Archaea, label=filter_list)) testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="orange") + theme_gray(base_size=15) #+ geom_text_repel(aes(label=filter_list),size=2.5)+ theme_gray(base_size=15) testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=season)) testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + theme_classic(base_size=15) #+ geom_text_repel(aes(label = location), size=2.5)+ theme_gray(base_size=15) } testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=drywet)) testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + theme_gray(base_size=15) testplot = ggplot(test.df, aes(x = X1,y = X2, label=location, color=humid)) testplot + geom_point(size = 4,alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_continuous(low="white", high="blue") + theme_gray(base_size=15) test.df.two = test.df[grep("Mike|No1_1636", test.df$ownership),] testplot = ggplot(test.df.two, aes(x = X1,y = X2, label=location, color=ownership)) testplot + geom_point(size =4, alpha=0.75) + ggtitle(paste0("Mike and patient1, perplexity=",k)) + theme_gray(base_size=15)#+ scale_color_manual(values=c( "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26")) environ.sp = environ[match(test.df$filter_list, environ$Filter_No), ] if (FALSE){ #for other projects for (k in seq(10,50,5)){ print(k) set.seed(42) tsne = Rtsne(t(mouse.table[,-1]), dims=2, perplexity=k, verbose=TRUE, max_iter =5000) test.df = data.frame(tsne$Y, sample= names(mouse.table[-1])) testplot = ggplot(test.df, aes(x = X1,y = X2, label=sample)) testplot + geom_point(alpha=0.75) + ggtitle(paste0("perplexity=",k)) + #scale_color_manual(values=c("#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", #"#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26")) + geom_text(nudge_y=0.2, size=2.5) ggsave(paste0("./pdfs/mouse", "-", k , "perplexity plot-all.pdf")) } } k=25 set.seed(42) tsne = Rtsne(t(Mike.master.cpm), dims=2, perplexity=k, verbose=TRUE, max_iter =5000) test.df = data.frame(tsne$Y) test.df = data.frame(test.df, Mike_single) testplot = ggplot(test.df, aes(x = X1,y = X2, color=season, label=location)) testplot + geom_point(alpha=0.75) + ggtitle(paste0("perplexity=",k)) + scale_color_manual(values=c("#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26")) + geom_text(nudge_y=0.3, size=2.5) ggsave(paste0("./pdfs/",taxonlevel, "-", k , "perplexity plot-mike.pdf")) # + geom_text(nudge_y=0.4, size=2.5) # + geom_text(nudge_y=0.4, size=2.5) } ``` #Cmeans clustering-season ```{r cmeans clustering} library(cluster) library(factoextra) set.seed(42) master.asinh.cpm.season = cbind.data.frame(t(master.asinh.cpm.single), season=scores_single_complete$season) df.season = aggregate(. ~ season, master.asinh.cpm.season, mean) df.season = df.season[, -1] df.season = df.season[, colSums(df.season)>0] season.var = apply(df.season,2,var) var.thre = quantile(season.var, 0.5) #df.season = df.season[, season.var >= 0.05] #variance filter df.season.sel = df.season[, season.var >= var.thre] weight = c(apply(df.season.sel, 2, var)) df.season.scale = scale(df.season.sel) rownames(df.season.scale) = c("spring","summer","fall","winter") wtf = t(df.season.scale) #determining the number of clusters # Elbow method fviz_nbclust(wtf, kmeans, method = "wss") + geom_vline(xintercept = 4, linetype = 2)+ #this is hardcoded labs(subtitle = "Elbow method") # Silhouette method fviz_nbclust(wtf, kmeans, method = "silhouette")+ labs(subtitle = "Silhouette method") # Gap statistic # nboot = 50 to keep the function speedy. # recommended value: nboot= 500 for your analysis. # Use verbose = FALSE to hide computing progression. set.seed(123) fviz_nbclust(wtf, kmeans, nstart = 25, method = "gap_stat", nboot = 100)+ labs(subtitle = "Gap statistic method") set.seed(123) cm = cmeans(wtf, center=4, iter.max=500) #cm$cluster = factor(cm$cluster, levels=c(1,2,3,4)) table(cm$cluster) fviz_cluster(list(data = wtf, cluster=cm$cluster), geom = c("point"), ellipse = TRUE, ellipse.alpha = 0.3, #used to be 0.6 if only points are plotted. ellipse.type = "norm", ellipse.level = 0.68, palette = "npg", repel = TRUE) + theme_pubr(base_size = 14) + scale_color_manual(values=c("3" = "#00A087FF","2" = "#4DBBD5FF","4" = "#E64B35FF","1"= "#3C5488FF")) + scale_fill_manual(values=c("3" = "#00A087FF","2" = "#4DBBD5FF","4" = "#E64B35FF","1"= "#3C5488FF")) tempp = cbind.data.frame(wtf, weight=weight, cluster=cm$cluster, membership=apply(cm$membership, 1, max), taxon = row.names(wtf)) df.season.scale.group = tempp[tempp$membership>=0.65,] table(df.season.scale.group$cluster) df.season.melt = melt(df.season.scale.group, id.vars = c("cluster","membership", "taxon","weight"), variable.name = "season") #df.season.scale.group = df.season.scale[, cm$cluster==3 & cm$membership[,3]>=0.7] #df.season.melt = melt(df.season.scale.group) #colnames(df.season.melt) = c("season", "taxon", "value") df.season.melt$season = factor(df.season.melt$season, levels=c("spring", "summer", "fall", "winter")) df.season.melt$cluster = factor(df.season.melt$cluster) #+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) #green, blue, red, dark blue ggplot(data=df.season.melt, aes(x=season, y=value, group=taxon, color=cluster, size=weight, alpha=membership)) + geom_line()+ theme_pubr(base_size = 20) + scale_color_manual(values=c("3" = "#00A087FF","2" = "#4DBBD5FF","4" = "#E64B35FF","1"= "#3C5488FF")) + scale_x_discrete(expand=c(0,0)) #expand c(0,0) eliminates the sapcing #season p test, p.adj were calculated master_combined_season = cbind.data.frame(t(master.asinh.cpm.single[rowSums(master.asinh.cpm.single)>0,]), season = scores_single_complete$season) master_season.melt =melt(master_combined_season, id.vars = "season") p.result.season = compare_means( formula = value ~ season, data = master_season.melt, group.by = "variable", method = "kruskal.test", p.adjust.method = "fdr") ``` ##Cmeans clustering-month ```{r cmeans clustering-month} #not working great library(cluster) library(factoextra) set.seed(42) master.asinh.cpm.month = cbind.data.frame(t(master.asinh.cpm.single[,]), date.month=scores_single_complete$date.month) df.season = aggregate(. ~ date.month, master.asinh.cpm.month, mean) df.season = df.season[, -1] df.season = df.season[, colSums(df.season)>0] df.season = df.season[, apply(df.season,2,var)>=0.05] #variance filter weight = c(apply(df.season, 2, var)) df.season.scale = scale(df.season) rownames(df.season.scale) = c(1:12) wtf = t(df.season.scale) #determining the number of clusters # Elbow method fviz_nbclust(wtf, kmeans, method = "wss") + geom_vline(xintercept = 4, linetype = 2)+ #this is hardcoded labs(subtitle = "Elbow method") # Silhouette method fviz_nbclust(wtf, kmeans, method = "silhouette")+ labs(subtitle = "Silhouette method") # Gap statistic # nboot = 50 to keep the function speedy. # recommended value: nboot= 500 for your analysis. # Use verbose = FALSE to hide computing progression. set.seed(123) fviz_nbclust(wtf, kmeans, nstart = 25, method = "gap_stat", nboot = 100)+ labs(subtitle = "Gap statistic method") cm = cmeans(wtf, center=2, iter.max=200) cm$cluster = factor(cm$cluster, levels=c(1:8)) fviz_cluster(list(data = wtf, cluster=cm$cluster), geom = c("point"), ellipse = TRUE, ellipse.alpha = 0.6, #used to be 0.6 if only points are plotted. ellipse.type = "norm", ellipse.level = 0.68, palette = "npg", repel = TRUE) + theme_pubr(base_size = 14) tempp = cbind.data.frame(wtf, weight=weight, cluster=cm$cluster, membership=apply(cm$membership, 1, max), taxon = row.names(wtf)) df.season.scale.group = tempp[tempp$membership>0.5 ,] df.season.melt = melt(df.season.scale.group, id.vars = c("cluster","membership", "taxon","weight"), variable.name = "season") #df.season.scale.group = df.season.scale[, cm$cluster==3 & cm$membership[,3]>=0.7] #df.season.melt = melt(df.season.scale.group) #colnames(df.season.melt) = c("season", "taxon", "value") df.season.melt$season = factor(df.season.melt$season, levels=c(1:12)) df.season.melt$cluster = factor(df.season.melt$cluster) #+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) #green, blue, red, dark blue ggplot(data=df.season.melt, aes(x=season, y=value, group=taxon, color=cluster, size=weight)) + geom_line(alpha=0.5)+ theme_pubr(base_size = 14) + scale_color_npg() ``` #seasonal p test ```{r season p test} #season p test on genus using all samples and P1's samples, p.adj were calculated #all samples master.asinh.cpm.single.fifty = master.asinh.cpm.single[rowSums(master.asinh.cpm.single > 0) >= 50,] master_combined_season = cbind.data.frame(t(master.asinh.cpm.single.fifty), season = scores_single_complete$season) master_season.melt =melt(master_combined_season, id.vars = "season") p.result.season = compare_means( formula = value ~ season, data = master_season.melt, group.by = "variable", method = "kruskal.test", p.adjust.method = "fdr") rownames(p.result.season) = p.result.season$variable #the 9 phyla from figure season.phyla.select = c("Basidiomycota","Ascomycota","Streptophyta","Chordata","Acidobacteria", "Firmicutes", "Proteobacteria","Actinobacteria", "Bacteroidetes") p.season.select.df = master_combined_season[, c(as.character(season.phyla.select), "season")] p.season.select.df.melt = melt(p.season.select.df, id.vars = "season") #ranking genera by adjusted pvalues comp = function(master_combined, compare_result){ master_combined.sig= master_combined[, c(compare_result$p.adj < 0.05, TRUE)] compare_result.sig = compare_result[compare_result$p.adj < 0.05,] compare_result.sig$variable = as.character(compare_result.sig$variable) compare_result.sig = compare_result.sig[order(compare_result.sig$p.adj),] #rank names by inverse order of p.adjust, first one is the smallest genus.select = as.character(compare_result.sig$variable) master_combined.sig.melt = melt(master_combined.sig[, as.character(c(genus.select, "season"))], id.vars = "season") return(list(df = master_combined.sig, df.melt = master_combined.sig.melt, compare = compare_result.sig, genera = genus.select)) } p.season.results = comp(master_combined_season, p.result.season) #all samples, all significant taxa (use 90 x 20 to start for saving pdfs) ggplot(p.season.results$df.melt, aes(x = season , y = value, color=season, fill=season)) + geom_boxplot(size=0.6, alpha=0.5)+ geom_quasirandom(size=0.6, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ stat_summary(fun.y=median, geom="line", aes(group=1), color = "black", alpha=0.6) + stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.5, show.legend = FALSE) + theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) + scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))+ theme(strip.background = element_blank()) + rremove("x.text") + labs(color = "Season", fill="Season") + facet_wrap(~variable, scales="free_y", ncol=5) #expa #select phyla, all samples ggplot(p.season.select.df.melt, aes(x = season , y = value, color=season, fill=season)) + geom_boxplot(size=0.6, alpha=0.5)+ geom_quasirandom(size=0.4, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ stat_summary(fun.y=median, geom="line", aes(group=1), color = "black", alpha=0.6) + stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.4, show.legend = FALSE) + theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) + scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))+ theme(strip.background = element_blank()) + rremove("x.text") + labs(color = "Season", fill="Season") + facet_wrap(~variable, scales="free_y", ncol=3) #P1 samples, all significant taxa p1_combined_season = cbind.data.frame(t(master.asinh.cpm.single.fifty[,Mike_single$samplenames]), season = Mike_single$season) p1_combined_season.melt =melt(p1_combined_season, id.vars = "season") p1.result.season = compare_means( formula = value ~ season, data = p1_combined_season.melt, group.by = "variable", method = "kruskal.test", p.adjust.method = "fdr") p1.season.results = comp(p1_combined_season, p1.result.season) #this is a list p1.season.select.df = p1_combined_season[, c(as.character(season.phyla.select), "season")] p1.season.select.df.melt = melt(p1.season.select.df, id.vars = "season") ggplot(p1.season.results$df.melt, aes(x = season , y = value, color=season, fill=season)) + geom_boxplot(size=0.6, alpha=0.5)+ geom_quasirandom(size=0.6, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ stat_summary(fun.y=median, geom="line", aes(group=1), color = "black", alpha=0.6) + stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.4, show.legend = FALSE) + theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) + scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))+ theme(strip.background = element_blank()) + rremove("x.text") + labs(color = "Season", fill="Season") + facet_wrap(~variable, scales="free_y", ncol=5) #p1 samples, selected phyla ggplot(p1.season.select.df.melt, aes(x = season , y = value, color=season, fill=season)) + geom_boxplot(size=0.6, alpha=0.5)+ geom_quasirandom(size=0.4, alpha=0.3, color = "black", fill="black", width=0.15, bandwidth= 0.2, method = "quasirandom")+ stat_summary(fun.y=median, geom="line", aes(group=1), color = "black", alpha=0.6) + stat_compare_means(aes(label = ..p.signif..), label.x.npc = 0.4, show.legend = FALSE) + theme_pubr(base_size = 16, x.text.angle = 45, margin = FALSE)+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) + scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))+ theme(strip.background = element_blank()) + rremove("x.text") + labs(color = "Season", fill="Season") + facet_wrap(~variable, scales="free_y", ncol=3) #p2 samples, all phyla p2_combined_season = cbind.data.frame(t(master.asinh.cpm.single.fifty[,Patient1$samplenames]), season = Patient1$season) p2_combined_season.melt =melt(p2_combined_season, id.vars = "season") p2.result.season = compare_means( formula = value ~ season, data = p2_combined_season.melt, group.by = "variable", method = "kruskal.test", p.adjust.method = "fdr") p2.season.results = comp(p2_combined_season, p2.result.season) #this is a list, will produce an error if no genus has a adj.p < 0.05, P2 is such a case #plot all pvalues for all samples p.season.results$compare$variable = factor(p.season.results$compare$variable, levels = c(p.season.results$compare$variable)) ggplot(p.season.results$compare, aes(x = variable, y=p.adj, color=p.adj)) + geom_point() + theme_pubr(base_size=10, x.text.angle = 60) + theme(axis.text.x = element_text(size = 6)) chisq.test(x=rbind(c(79/2,79/2),c(61,18))) ``` #Spatial variables construct ```{r spatial variables MEM construct} library(adespatial) library(spacemakeR) #some missing values in original coordinates data, run imputation first from the RDA section expo.xy = subset(rda.env, select = c(latitude, longitude)) expo.xy.name = cbind(expo.xy, name = row.names(expo.xy)) colnames(expo.xy.name) = c("lat", "lon", "name") #taking coordinates and calculate geographic distance between sites, unit is in m ReplaceLowerOrUpperTriangle <- function(m, triangle.to.replace){ # If triangle.to.replace="lower", replaces the lower triangle of a square matrix with its upper triangle. # If triangle.to.replace="upper", replaces the upper triangle of a square matrix with its lower triangle. if (nrow(m) != ncol(m)) stop("Supplied matrix must be square.") if (tolower(triangle.to.replace) == "lower") tri <- lower.tri(m) else if (tolower(triangle.to.replace) == "upper") tri <- upper.tri(m) else stop("triangle.to.replace must be set to 'lower' or 'upper'.") m[tri] <- t(m)[tri] return(m) } GeoDistanceInMetresMatrix <- function(df.geopoints){ # Returns a matrix (M) of distances between geographic points. # M[i,j] = M[j,i] = Distance between (df.geopoints$lat[i], df.geopoints$lon[i]) and # (df.geopoints$lat[j], df.geopoints$lon[j]). # The row and column names are given by df.geopoints$name. GeoDistanceInMetres <- function(g1, g2){ # Returns a vector of distances. (But if g1$index > g2$index, returns zero.) # The 1st value in the returned vector is the distance between g1[[1]] and g2[[1]]. # The 2nd value in the returned vector is the distance between g1[[2]] and g2[[2]]. Etc. # Each g1[[x]] or g2[[x]] must be a list with named elements "index", "lat" and "lon". # E.g. g1 <- list(list("index"=1, "lat"=12.1, "lon"=10.1), list("index"=3, "lat"=12.1, "lon"=13.2)) DistM <- function(g1, g2){ require("Imap") return(ifelse(g1$index > g2$index, 0, gdist(lat.1=g1$lat, lon.1=g1$lon, lat.2=g2$lat, lon.2=g2$lon, units="m"))) } return(mapply(DistM, g1, g2)) } n.geopoints <- nrow(df.geopoints) # The index column is used to ensure we only do calculations for the upper triangle of points df.geopoints$index <- 1:n.geopoints # Create a list of lists list.geopoints <- by(df.geopoints[,c("index", "lat", "lon")], 1:n.geopoints, function(x){return(list(x))}) # Get a matrix of distances (in metres) mat.distances <- ReplaceLowerOrUpperTriangle(outer(list.geopoints, list.geopoints, GeoDistanceInMetres), "lower") # Set the row and column names rownames(mat.distances) <- df.geopoints$name colnames(mat.distances) <- df.geopoints$name return(mat.distances) } expo.geo.xy = round(GeoDistanceInMetresMatrix(expo.xy.name) / 1000) #distance in km expo.pcnm2 = vegan::pcnm(expo.geo.xy) expo.vario = variogmultiv(Y = rda.data, xy = expo.xy, nclass =20 ) plot(expo.vario$d, expo.vario$var, ty='b', pch=20, xlab="Distance", ylab="C(distance)") #210 is the maxmium #first peak is expo.pcnm = vegan::pcnm(dist(expo.xy)) min.d = expo.pcnm$threshold #minimum distance thresh10 = seq(give.thresh(dist(expo.xy)), give.thresh(dist(expo.xy))*4, le=10) list10nb = lapply(thresh10, dnearneigh, x=as.matrix(expo.xy), d1=0) f2 = function(D, dmax, y) {1- (D/dmax)^y} expo.thresh.f2 = lapply(list10nb, function(x) test.W(x, Y=rda.data, f=f2, y=2:10, dmax=max(unlist(nbdists(x, as.matrix(expo.xy)))), xy=as.matrix(expo.xy))) expo.f2.minAIC = sapply(expo.thresh.f2, function(x) min(x$best$AICc, na.rm=TRUE)) min(expo.f2.minAIC) nb.bestmod = which.min(expo.f2.minAIC) #6 variables dmax.best = expo.thresh.f2[nb.bestmod][[1]]$all[1,2] expo.MEM.champ = unlist(expo.thresh.f2[which.min(expo.f2.minAIC)], recursive = FALSE) summary(expo.MEM.champ) expo.MEM.champ$best$values expo.MEM.champ$best$ord MEMid = expo.MEM.champ$best$ord[1:which.min(expo.MEM.champ$best$AICc)] sort(MEMid) expo.MEM.champ$best$R2 MEM.select = expo.MEM.champ$best$vectors[, sort(c(MEMid))] colnames(MEM.select) = sort(MEMid) #best R2 R2.MEMbest = expo.MEM.champ$best$R2[which.min(expo.MEM.champ$best$AICc)] #adjusted best R2 RsquareAdj(R2.MEMbest, nrow(rda.data), length(MEMid)) par(mfrow=c(2,4)) for(i in 1:ncol(MEM.select)){ #visualizing selected MEM factors s.value(expo.xy, MEM.select[,i], sub=sort(MEMid)[i], csub=2) } expo.MEM.rda = rda(rda.data ~., as.data.frame(MEM.select)) expo.MEM.R2a = RsquareAdj(expo.MEM.rda)$adj.r.squared anova(expo.MEM.rda) axes.MEM.test = anova(expo.MEM.rda, by="axis") nb.ax = length(which(axes.MEM.test[,5] <= 0.05)) #plot maps of the significant canonocial axes expo.MEM.axes = scores(expo.MEM.rda, choices=c(1,2), display="lc", scaling=1) par(mfrow = c(1,2)) s.value(expo.xy, expo.MEM.axes[,1]) s.value(expo.xy, expo.MEM.axes[,2]) #visualizing top two axes (tested by axes.MEM.test) for (i in 1:2){ print(ggplot(expo.xy, aes(y=latitude, x=longitude, size = expo.MEM.axes[,i])) + geom_point(shape=21, alpha=0.6, fill="steel blue") + labs(size = paste0("RDA.MEM.axis ", i))) } #visualizing MEM.select using ggplot for (i in 1:ncol(MEM.select)){ print(ggplot(expo.xy, aes(y=latitude, x=longitude, size = MEM.select[,i])) + geom_point(shape=21, alpha=0.6, fill="steel blue") + labs(size = paste0("MEM variable ", i))) } #testing location~season randomness library(nnet) locaseason = cbind(as.data.frame(plot.MEM.select), season = scores_single_complete$season) test <- multinom(season ~ ., data = locaseason) summary(test) ``` #Time variables construction AEM ```{r AEM construction of time-series} library(PCNM) library(AEM) #adjusting date.mid so samples don't have the exact same date.mid, this is for AEM variables construction AEM.in = scores_single_complete$date.mid for (i in 1:282){ print(AEM.in[i+1]-AEM.in[i]) delta = AEM.in[i+1]-AEM.in[i] if (delta == 0){ AEM.in[i+1] = AEM.in[i+1] + 0.5 } if (delta <0){ AEM.in[i+1] = AEM.in[i+1] + 1 } } #choosing the max.d for weights calculating in non-equal-distanced sampling points #mantel correlation is the multivariate version of pearson correlation correlog.expo <- mantel.correlog(dist(t(master.asinh.cpm.single)), XY=AEM.in, mult = "fdr") #this can be used for spatial as well #correlog.expo <- mantel.correlog(dist(t(Mike.master.asinh.cpm)), XY=Mike_single$date.start, mult = "fdr") #the n.class has some impact on the class.index, but not much plot(correlog.expo) #looks like 131 is the way to go, smallest non-significant index value weights.expo = weight.time(AEM.in, max.d = 131) # each unit is one day 1 = 1 day aem.expo.out = aem.time(length(AEM.in), w = weights.expo, moran=TRUE, plot.moran=TRUE) aem.expo.out$Moran #select significant positive variables, not forward selected aem.expo.out.pos = aem.expo.out$aem[,aem.expo.out$Moran$p.value < 0.05 & aem.expo.out$Moran$Positive] rda.data = t(master.asinh.cpm.single) aem.expo.rda.pos <- rda(rda.data ~ ., as.data.frame(aem.expo.out.pos)) #the second arugment needs to be data frame in formula mode anova(aem.expo.rda.pos) anova(aem.expo.rda.pos, by="axis") RsquareAdj(aem.expo.rda.pos) plot(AEM.in, scores(aem.expo.rda.pos, display="lc", choice=3), type="b", pch=19, main = paste("RDA axis", i, ", positive temporal correlation model"), xlab="Date", ylab="RDA axis") #select significant negative variables, not forward selected aem.expo.out.neg = aem.expo.out$aem[,aem.expo.out$Moran$p.value < 0.05 & !aem.expo.out$Moran$Positive] aem.expo.rda.neg <- rda(t(master.asinh.cpm.single), aem.expo.out.neg) anova(aem.expo.rda.neg) RsquareAdj(aem.expo.rda.neg) aem.expo.out.select = aem.expo.out$aem[,aem.expo.out$Moran$p.value < 0.05] #both positive and negative #this step takes a lot of time, so the result was saved #expo.aem.selectvar <- forward.sel(t(master.asinh.cpm.single), aem.expo.out.select, nperm=999, alpha=0.05) expo.aem.selectvar #saveRDS(object = expo.aem.selectvar, file="~/Documents/Bioinfo/RDS/expo.aem.selectvar.rds") expo.aem.selectvar = readRDS("~/Documents/Bioinfo/RDS/expo.aem.selectvar.rds") aem.expo.rda.fp <- rda(t(master.asinh.cpm.single) ~., data.frame(aem.expo.out.select[, expo.aem.selectvar$order])) #only the forward-picked ones RsquareAdj(aem.expo.rda.fp) aem.expo.rda.anova = anova(aem.expo.rda.fp) #this step also takes alot of time, and the first two axes are significant, 8% and 2%, respectively #aem.expo.rda.anova.byaxis = anova(aem.expo.rda.fp, by="axis") #saveRDS(object = aem.expo.rda.anova.byaxis, file = "~/Documents/Bioinfo/RDS/aem.expo.rda.byaxis.rds") aem.expo.rda.anova.byaxis = readRDS("~/Documents/Bioinfo/RDS/aem.expo.rda.byaxis.rds") #only first two axes are significant RsquareAdj(aem.expo.rda.fp) #only the first two axes are significant, axis 2 looks cyclic, axis 1 is very noisy, axis 3 looks interesting plot(AEM.in, scores(aem.expo.rda.fp, display="lc", choice=1), type="b", pch=19, main = paste("RDA axis", 1, ", positive temporal correlation model"), xlab="Date", ylab="RDA axis") plot(AEM.in, scores(aem.expo.rda.fp, display="lc", choice=2), type="b", pch=19, main = paste("RDA axis", 2, ", positive temporal correlation model"), xlab="Date", ylab="RDA axis") plot(AEM.in, scores(aem.expo.rda.fp, display="lc", choice=3), type="b", pch=19, main = paste("RDA axis", 2, ", positive temporal correlation model"), xlab="Date", ylab="RDA axis") ggplot(data.frame(time = AEM.in, score = c(scores(aem.expo.rda.fp, display="lc", choice=1))), aes(x=time, y=score))+ geom_point() + geom_line() + theme_pubr() ggplot(data.frame(time = AEM.in, score = c(scores(aem.expo.rda.fp, display="lc", choice=2))), aes(x=time, y=score))+ geom_point() + geom_line() + theme_pubr() ggplot(data.frame(time = AEM.in, score = c(scores(aem.expo.rda.fp, display="lc", choice=1))), aes(x=time, y=score))+ geom_point() + geom_smooth(span=0.3) + theme_pubr() ggplot(data.frame(time = AEM.in, score = c(scores(aem.expo.rda.fp, display="lc", choice=2))), aes(x=time, y=score))+ geom_point() + geom_smooth(span=0.5) + theme_pubr() ts1 = scores(aem.expo.rda.fp, display="lc", choice=1)[,1] #RDA axis 1 ts2 = scores(aem.expo.rda.fp, display="lc", choice=2)[,1] #RDA axis 2 AEMS = aem.expo.out.select[, expo.aem.selectvar$order] #all selected 38 AEM variables. colnames(AEMS) = paste0("AEM", expo.aem.selectvar$order) #assign original names to these variables ``` ##RDA========================RDA, redundancy analysis ```{r RDA analysis} env.select = c(6:12, 15:27, 30:58, 105:117, 119:135) # this is for scores_single_complete #do not select any clinical variables #rda.data = t(master.cpm.single) rda.data = t(master.asinh.cpm.single) rda.var = apply(rda.data, 2, var) #rda.select = apply(rda.data,2, function(x){ sum(x>0)>=50}) #rda.data= rda.data[, rda.select] rda.env = scores_single_complete[, env.select] names(rda.env)[apply(rda.env, 2, function(x){ sum(!is.na(x)) >= 280})] #this tests to see how complete each feature is #dbRDA #CRUDE DATA imputation items = c("MeanDew_PointC", "median.particle", "durationOut", "elevation.m.", "mNDVI", "Overall.AQI.Value", "latitude", "longitude", "Population_density_people_per_sqmi", "X_Mean_Wind_SpeedKm.h", "mFPAR", "dPM2.5", "dPM10", "mLAI", "X_Mean_VisibilityKm" ,"X_Mean_Sea_Level_PressurehPa", "population2", "dSO2", "dNO2", "dOzone", "dCO", "daily_NO2", "In_in_a_day", "X_Max_Sea_Level_PressurehPa", "X_Min_Sea_Level_PressurehPa", "X_Max_Wind_SpeedKm.h", "mNO2", "mSO2") for (x in items){ print(x) rda.env[,x][is.na(rda.env[, x])] = as.numeric(median(rda.env[, x], na.rm=TRUE)) } rda.env$urban_rural[is.na(rda.env$urban_rural)] = "rural" rda.env$batch = as.numeric(rda.env$batch) names(rda.env)[apply(rda.env, 2, function(x){ sum(!is.na(x)) >= 280})] names(rda.env)[apply(rda.env, 2, function(x){ sum(!is.na(x)) <= 250})] rda.env = rda.env[, names(rda.env)[apply(rda.env, 2, function(x){ sum(!is.na(x)) >= 280})]] #eleminating some obvious redudant variables rda.env.location = subset(rda.env, select=-c(Type, geo, date.mid, Precipitationmm, ownership, country, population2, city)) rda.env = subset(rda.env, select=-c(Type, geo, date.mid, Precipitationmm, ownership, country, population2, city, location ))#Type only has one class #Recoding variables location.bi = model.matrix(~rda.env.location[,"location"]+0) geo2.bi = model.matrix(~rda.env[, "geo2"]+0) season.bi = model.matrix(~rda.env[, "season"]+0) weekend.bi = model.matrix(~rda.env[, "weekend"]+0)[,-1] hiking.bi = model.matrix(~rda.env[, "hiking"]+0)[,-1] geo3.bi = model.matrix(~rda.env[, "geo3"]+0) aownership.bi = model.matrix(~rda.env[, "aownership"]+0) urban.bi = model.matrix(~rda.env[, "urban_rural"]+0)[, -1] #all numeric ones rda.env.numeric = Filter(is.numeric, rda.env) #MAKING env great againt #====================== #using location.bi, instead of MEM.select if (FALSE){ rda.env.coding.location = cbind(rda.env.numeric, location.bi, geo2.bi, season.bi, weekend.bi, hiking.bi, geo3.bi, aownership.bi, urban.bi) rda.env.coding.location = subset(rda.env.coding.location, select= -c(cDNA_conc)) #removing these variables #cDNA_conc, while a good variable for variation analysis, cannot be easily grouped into existing groups, such as environment/location/lifestyle #with raw location expo.RDA.location = rda(rda.data ~., data = rda.env.coding.location, add=TRUE, na.action = na.exclude) expo.dbRDA.location = capscale(rda.data ~ ., dist="bray", data=rda.env.coding.location, add=TRUE, na.action = na.exclude) RDA.location.radj = RsquareAdj(expo.RDA.location)$adj.r.squared dbRDA.location.radj = RsquareAdj(expo.dbRDA.location)$adj.r.squared #anova #raw R2 are about 46% and 48.2% for RDA and dbRDA. anova.expo.location.RDA = anova(expo.RDA.location) anova.expo.dbRDA.location = anova(expo.dbRDA.location) } #==================== #attaching MEM variables to the rda.env data.frame, USING MEM instead of locations! also attaching time-series variables, which are the two axes from AEM constructions rda.env.coding = cbind(rda.env.numeric, MEM.select, geo2.bi, season.bi, weekend.bi, hiking.bi, geo3.bi, aownership.bi, urban.bi, ts1, ts2) #with MEMs rda.env.coding2 = subset(rda.env.coding, select =-c(cDNA_conc)) #cDNA_conc apparently accout for a lot of variances.. expo.RDA=rda(rda.data ~ ., data=rda.env.coding2, add=TRUE, na.action = na.exclude) #33.88% explained expo.dbRDA = capscale(rda.data ~ ., dist="bray", data=rda.env.coding2, add=TRUE, na.action = na.exclude) #35.16% RDA.radj = RsquareAdj(expo.RDA)$adj.r.squared dbRDA.radj = RsquareAdj(expo.dbRDA)$adj.r.squared #anova #anova(expo.RDA, by="axis") #raw R2 are about 32.7% for RDA and dbRDA when using MEM anova.expo.RDA = anova(expo.RDA) anova.expo.dbRDA = anova(expo.dbRDA) #using adespatial for forward selection RDA.fwd = forward.sel(rda.data, rda.env.coding2, adjR2thresh = RDA.radj, alpha = 0.05) RDA.location.fwd = forward.sel(rda.data, rda.env.coding.location, adjR2thresh = RDA.location.radj, alpha = 0.05) #RDA with all selected variables expo.RDA.select=rda(rda.data ~ ., data=rda.env.coding2[, RDA.fwd$variables], add=TRUE, na.action = na.exclude) #22.2% explained expo.dbRDA.select = capscale(rda.data ~ ., dist="bray", data=rda.env.coding2[, RDA.fwd$variables], add=TRUE, na.action = na.exclude) #22.7% #vegan variable selection procedure, not divided into group yet, for rda #using MEM variables, this should be the same as forward.sel mod0 = rda(rda.data ~ 1, rda.env.coding2) #model with intercept only mod1 = rda(rda.data ~., rda.env.coding2) #model with all variables step.res = ordiR2step(mod0, mod1, perm.max =1000) #about 22% explained #vegan variable selection procedure overall, for dbRDA #using MEM variables mod0.db = capscale(rda.data ~ 1, dist="bray", data=rda.env.coding2, add=TRUE) mod1.db = capscale(rda.data ~., dist="bray", data=rda.env.coding2, add=TRUE) step.res.db = ordiR2step(mod0.db, mod1.db, perm.max = 1000) #about 18% explained if(FALSE){ #raw locations are no longer used #vegan variable selection procedure, not divided into group yet, for rda #NOT using MEM variables, RAW LOCATION labels mod0 = rda(rda.data ~ 1, rda.env.coding.location) #model with intercept only mod1 = rda(rda.data ~., rda.env.coding.location) #model with all variables step.res.location = ordiR2step(mod0, mod1, perm.max =1000) #vegan variable selection procedure overall, for dbRDA #NOT using MEM variables, RAW LOCATION labels mod0.db = capscale(rda.data ~ 1, dist="bray", data=rda.env.coding.location, add=TRUE) mod1.db = capscale(rda.data ~., dist="bray", data=rda.env.coding.location, add=TRUE) step.res.db.location = ordiR2step(mod0.db, mod1.db, perm.max = 1000) } #rda.env.coding2 does not have cDNA conc. #five groups defined initially: technical, environmental, location, life style, temporal groups #forward selection within each group, and then in combined groups #1 technical = data.frame(rda.env.coding[,"batch"]) #2 env = cbind.data.frame(rda.env.coding[, c(3:14, 22:34,48,49,51,50)]) colnames(env)[26:29] = c("spring", "summer", "winter", "fall") #3 loca = cbind.data.frame(rda.env.coding[, c(18:21, 35:40, 41:47, 54:55, 65)]) colnames(loca)[3] = "popdensity" colnames(loca)[11:19] = c("Asia", "Europe", "Northeast", "Oceania", "Southeast", "Southwest", "Westcoast", "Coastal", "Inland") colnames(loca)[5:10] = c("MEM1", "MEM69", "MEM83", "MEM89", "MEM91", "MEM94") #numbers were causing issues #4 behav = cbind.data.frame(rda.env.coding[, c(15:16, 52:53, 56:64)]) colnames(behav)[5:13] = c("others", "P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8") #5 temporal = cbind.data.frame(rda.env.coding[, c(2, 66:67)]) #2 is the "date.month" #rda forward selections of every group, using MEM and AEM variables (through RDA axes) mod0 = rda(rda.data ~ 1, technical) #model with intercept only mod1 = rda(rda.data ~., technical) #model with all variables step.res.tech = ordiR2step(mod0, mod1, perm.max =1000) #about % explained mod0 = rda(rda.data ~ 1, env) #model with intercept only mod1 = rda(rda.data ~., env) #model with all variables step.res.env = ordiR2step(mod0, mod1, perm.max =1000) #about % explained step.forward.env = forward.sel(rda.data, env) mod0 = rda(rda.data ~ 1, loca) #model with intercept only mod1 = rda(rda.data ~., loca) #model with all variables step.res.loca = ordiR2step(mod0, mod1, perm.max =1000) #about % explained step.forward.loca = forward.sel(rda.data, loca) mod0 = rda(rda.data ~ 1, behav) #model with intercept only mod1 = rda(rda.data ~., behav) #model with all variables step.res.behav = ordiR2step(mod0, mod1, perm.max =1000) #about % explained behav.R2a = RsquareAdj(mod1)$adj.r.squared step.forward.behav = forward.sel(rda.data, behav, adjR2thresh = behav.R2a) mod0.db = rda(rda.data ~ 1, data=temporal, add=TRUE) mod1.db = rda(rda.data ~., data=temporal, add=TRUE) step.res.temporal = ordiR2step(mod0.db, mod1.db, perm.max = 1000) #temporal.select = temporal[, attributes(step.res.db.temporal$terms)$term.labels] #vegan variable selection procedure for every group, for dbRDA (used in paper) #using MEM variables mod0.db = capscale(rda.data ~ 1, dist="bray", data=technical, add=TRUE) mod1.db = capscale(rda.data ~., dist="bray", data=technical, add=TRUE) step.res.db.tech = ordiR2step(mod0.db, mod1.db, perm.max = 1000) #about % explained colnames(technical) = "batch" technical.select.db = technical mod0.db = capscale(rda.data ~ 1, dist="bray", data=env, add=TRUE) mod1.db = capscale(rda.data ~., dist="bray", data=env, add=TRUE) step.res.db.env = ordiR2step(mod0.db, mod1.db, perm.max = 1000) #about % explained env.select.db = env[, attributes(step.res.db.env$terms)$term.labels] mod0.db = capscale(rda.data ~ 1, dist="bray", data=loca, add=TRUE) mod1.db = capscale(rda.data ~., dist="bray", data=loca, add=TRUE) step.res.db.loca = ordiR2step(mod0.db, mod1.db, perm.max = 1000) #about % explained loca.select.db = loca[, attributes(step.res.db.loca$terms)$term.labels] mod0.db = capscale(rda.data ~ 1, dist="bray", data=behav, add=TRUE) mod1.db = capscale(rda.data ~., dist="bray", data=behav, add=TRUE) step.res.db.behav = ordiR2step(mod0.db, mod1.db, perm.max = 1000) #about % explained behav.select.db = behav[, attributes(step.res.db.behav$terms)$term.labels] mod0.db = capscale(rda.data ~ 1, dist="bray", data=temporal, add=TRUE) mod1.db = capscale(rda.data ~., dist="bray", data=temporal, add=TRUE) step.res.db.temporal = ordiR2step(mod0.db, mod1.db, perm.max = 1000) temporal.select.db = temporal[, attributes(step.res.db.temporal$terms)$term.labels] #combining loca and behav for variable selection, because varpart deals with 4 groups locabehav = cbind(loca, behav) mod0.db = capscale(rda.data ~ 1, dist = "bray", data = locabehav, add = TRUE) mod1.db = capscale(rda.data ~., dist = "bray", data = locabehav, add = TRUE) step.res.db.locabehav = ordiR2step(mod0.db, mod1.db, perm.max = 1000) #about % explained locabehav.select.db = cbind.data.frame(loca,behav)[, attributes(step.res.db.locabehav$terms)$term.labels] #combining all for variable selection allfactors = cbind(env, loca, behav, technical,temporal) #when including TS1 and TS2, a lot more variables will be selected, indicating that these two variables explain data variances well, this is expected because they were constructed by contraining the dataset on temporal variablesl allfactors = cbind(env, loca, behav, technical, date.month=temporal[,1]) mod0.db = capscale(rda.data ~ 1, dist = "bray", data = allfactors) #using add = TRUE option will give different results, more variables mod1.db = capscale(rda.data ~., dist = "bray", data = allfactors) step.res.db.all = ordiR2step(mod0.db, mod1.db, perm.max = 1000, pin = 0.05) #about % explained all.select.db = allfactors[, attributes(step.res.db.all$terms)$term.labels] #calculating distance between samples rda.data.dist = vegdist(rda.data,method = "bray",na.rm = TRUE, diag = FALSE) #dbRDA varpart, assuming four groups dbRDA.MEM.varpart = varpart(rda.data.dist, technical.select.db, env.select.db, loca.select.db, behav.select.db) #lumping spatial and behavior together, not adding temporal group for now dbRDA.MEM.varpart = varpart(rda.data.dist, technical.select.db, env.select.db, locabehav.select.db) #lumping location and behavior together, so three groups + temporal group dbRDA.MEM.varpart = varpart(rda.data.dist, technical.select.db, env.select.db, locabehav.select.db, temporal.select.db) #the techinical is 2.6%, the enviromental, locabehav, and the temporal all account for 10% each. par(mfrow = c(1,2)) showvarparts(4) plot(dbRDA.MEM.varpart) #crude varpart without forward selection of each group dbRDA.MEM.crude.varpart = varpart(rda.data.dist, technical, env, loca, behav) dbRDA.MEM.crude.varpart = varpart(rda.data.dist, technical, env, cbind(loca, behav)) #lumping location and behavior together par(mfrow = c(1,2)) showvarparts(4) plot(dbRDA.MEM.crude.varpart) #testing individual parameters env.dbrda.select= envfit(rda.data.dist, cbind(technical.select.db, env.select.db, loca.select.db, behav.select.db), na.rm=TRUE) #testin env.dbrda= envfit(rda.data.dist, cbind(technical, env, loca, behav), na.rm=TRUE) #testing individual parameters #new adonis test if (FALSE){ adonis.result.db = adonis(rda.data ~., cbind(env.select.db, loca.select.db, behav.select.db, technical.select.db), perm = 1999, method = "bray") write.table(adonis.result.db$aov.tab, "~/Documents/Bioinfo/DNAformal/plots/supplementary/allsamples.adonis.db.csv", sep = ",", col.names = TRUE, row.names = TRUE, quote = FALSE) adonis.crude.result.db = adonis(rda.data ~., cbind( env, loca, behav, technical), perm = 1999, method = "bray") write.table(adonis.crude.result.db$aov.tab, "~/Documents/Bioinfo/DNAformal/plots/supplementary/allsamples.adonis.crude.db.csv", sep = ",", col.names = TRUE, row.names = TRUE, quote = FALSE) } if (FALSE){ #total correlation env.recoding = cbind(technical, env, loca, behav) res2 = rcorr(as.matrix(env.recoding), type = "spearman") #takes output from rcorr function, adjusting p value matrix accroding to fdr principles matrixPadjust = function(res, select = c(1:nrow(res$P))){ print(select) temp.p = c(res$P) temp.p[is.na(temp.p)] = 0 temp.p.adj = p.adjust(temp.p, method = "fdr") p.adj.matrix = matrix(temp.p.adj, nrow = ncol(res$P), ncol=ncol(res$P), dimnames = list(rownames = colnames(res$P), colnames = colnames(res$P))) return(p.adj.matrix) } adjusted.res2.p = matrixPadjust(res2) #pls package also has a function called corrplot corrplot::corrplot(res2$r, order = "hclust", p.mat = adjusted.res2.p, sig.level = 0.01, insig = "blank", diag = FALSE, tl.cex=0.5, addrect = 12, method="color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black") } #only mike #rda.data.mike = t(Mike.master.cpm) rda.data.mike = t(Mike.master.asinh.cpm) rda.env.mike = rda.env[rda.env$ownership=="Mike",] #rda.data.campus = t(campushome.master.cpm) #rda.data.campus = t(campushome.master.asinh.cpm) #rda.env.campus = rda.env[rda.env$ownership=="Mike" & rda.env$location == "Campus",] #only p2 #rda.data.patient1 = t(patient1.master.cpm) rda.data.patient1 = t(patient1.master.asinh.cpm) rda.env.patient1 = rda.env[rda.env$aownership=="P2",] names(environ)[apply(environ, 2, function(x){ sum(!is.na(x)) >= 250})] #this tests to see how complete each feature is #use this to justify the use of RDA PCA instead of CCA and CA dca.result = decorana(rda.data, iweigh=1, ira=0) dca.result plot(dca.result, display="sites") #dbRDA #condition out seems not as useful as the var partition function rda.data.dist = vegdist(rda.data,method = "bray",na.rm = TRUE, diag = FALSE) #stress plot rda.data.mds0 = isoMDS(rda.data.dist) stressplot(rda.data.mds0, rda.data.dist) #items = c("MeanDew_PointC", "median.particle", "durationOut", "elevation.m.", "mNDVI", "Overall.AQI.Value", "latitude", "longitude", "Population_density_people_per_sqmi", "X_Mean_Wind_SpeedKm.h", "mFPAR", "dPM2.5", "dPM10", "mLAI", "X_Mean_VisibilityKm" ,"X_Mean_Sea_Level_PressurehPa", "population2") if (FALSE) (#old codes #adonis test adonis.result = adonis(rda.data ~ season + median.particle + Mean_TemperatureC + MeanDew_PointC + X_Mean_Humidity + X_Mean_VisibilityKm + X_Mean_Sea_Level_PressurehPa + X_Mean_Wind_SpeedKm.h + is_there_rain + Precipitationmm + Overall.AQI.Value + dPM2.5 +dPM10 + mLAI + mFPAR + mNDVI + aownership + location + geo2 + latitude + longitude + durationOut + weekend + hiking + Population_density_people_per_sqmi + urban_rural + elevation.m. + population2 + batch, rda.env, perm = 1999, method = "bray") write.table(adonis.result$aov.tab, "~/Documents/Bioinfo/DNAformal/plots/supplementary/allsamples.adonis.csv", sep = ",", col.names = TRUE, row.names = TRUE, quote = FALSE) #adonis test for mike adonis.result.p1 = adonis(rda.data.mike ~ season + median.particle + Mean_TemperatureC + MeanDew_PointC + X_Mean_Humidity + X_Mean_VisibilityKm + X_Mean_Sea_Level_PressurehPa + X_Mean_Wind_SpeedKm.h + is_there_rain + Precipitationmm + Overall.AQI.Value + dPM2.5 +dPM10 + mLAI + mFPAR + mNDVI + location + geo2 + latitude + longitude + durationOut + weekend + hiking + Population_density_people_per_sqmi + urban_rural + elevation.m. + population2 + batch, rda.env.mike, perm = 999, method = "bray") #for p2 adonis.result.p2 = adonis(rda.data.patient1 ~ season + median.particle + Mean_TemperatureC + MeanDew_PointC + X_Mean_Humidity + X_Mean_VisibilityKm + X_Mean_Sea_Level_PressurehPa + X_Mean_Wind_SpeedKm.h + is_there_rain + Precipitationmm + Overall.AQI.Value + dPM2.5 +dPM10 + mLAI + mFPAR + mNDVI + location + geo2 + latitude + longitude + durationOut + Population_density_people_per_sqmi + urban_rural + elevation.m. + population2 + batch, rda.env.patient1, perm = 999, method = "bray") #betadisper test, test of variances homogeinity. Similar to Levene's test for anova. #Its best if groups have homogenous dispersion (variances), so their differences if significant can be attributed to their centroid (mean) beta = with(rda.env, betadisper(rda.data.dist, season)) anova(beta) TukeyHSD(beta) } ``` ```{R RDA temp} RDA = rda(rda.data ~ geo2 + location + ownership + duration + season + median.particle + Mean_TemperatureC + X_Mean_Humidity + batch + durationOut + weekend + MeanDew_PointC + X_Mean_VisibilityKm , rda.env, na.action = na.exclude) RDA.mike = rda(rda.data.mike ~ geo2 + location + duration + season + median.particle + Mean_TemperatureC + X_Mean_Humidity + batch + durationOut + weekend + MeanDew_PointC + X_Mean_VisibilityKm , rda.env.mike, na.action = na.exclude) RDA.mike = rda(rda.data.mike) #without env this is just like PCA #selecting model or RDA #envfit analysis env.mike = envfit(RDA.mike, rda.env.mike, na.rm=TRUE) plot(RDA.mike, display = "sites") plot(env.mike, p.max=0.1) tmp = with(rda.env.mike, ordisurf(RDA.mike, season, add =TRUE)) with(rda.env.mike, ordisurf(RDA.mike, seasonsummer, add = TRUE, col="green4")) mod.mike = varpart(rda.data.mike, ~ season + weekend + Mean_TemperatureC+ X_Mean_Humidity+ is_there_rain + Precipitationmm , ~ location, ~batch, data = rda.env.mike, na.rm=TRUE) mod = varpart(rda.data, ~ season + weekend + Mean_TemperatureC+ X_Mean_Humidity+ is_there_rain + Precipitationmm, ~ location+ownership, ~batch, data = rda.env) RDA.nolocation = rda(rda.data ~ geo2 + ownership + duration + season + total.particle + temperature + humid + batch, rda.env, na.action = na.exclude) RDA.noseason = rda(rda.data ~ geo2 + ownership + duration + total.particle + temperature + humid + batch, rda.env, na.action = na.exclude) #season contributes 61.35 variances explained, total variances about 2417.97 RDA = rda(rda.data.campus ~ location + duration + season + total.particle + temperature + humid + batch, rda.env.campus,na.action = na.exclude) RDA = rda(rda.data.patient1 ~ location + duration + season + total.particle + temperature + humid + batch, rda.env.patient1,na.action = na.exclude) RDA.2 = rda(rda.data ~ duration + temperature + season + total.particle + humid + batch, rda.env, na.action=na.exclude) RDA.2 = rda(rda.data.campus ~ duration + temperature + season + total.particle + humid + batch, rda.env.campus, na.action=na.exclude) RDA.2 = rda(rda.data.patient1 ~ duration + temperature + season + total.particle + humid + batch, rda.env.patient1, na.action=na.exclude) anova(RDA) plot(RDA) anova(RDA.2) plot(RDA.2) #meta plots for environmental data #reoder levels for legend rda.env$season = factor(rda.env$season, levels=c("spring", "summer", "fall", "winter")) rda.env.mike$season = factor(rda.env.mike$season, levels=c("spring", "summer", "fall", "winter")) rda.env.campus$season = factor(rda.env.campus$season, levels=c("spring", "summer", "fall", "winter")) rda.env.patient1$season = factor(rda.env.patient1$season, levels=c("spring", "summer", "fall", "winter")) #function that takes in vector of data and a coefficient, #returns boolean vector if a certain point is an outlier or not check_outlier <- function(v, coef=1.5){ quantiles <- quantile(v,probs=c(0.25,0.75)) IQR <- quantiles[2]-quantiles[1] res <- v < (quantiles[1]-coef*IQR)|v > (quantiles[2]+coef*IQR) return(res) } label = rep("", dim(rda.env)[1]) label[check_outlier(rda.env$temperature)] = as.character(rda.env$location[check_outlier(rda.env$temperature)]) ####this plot is kinda tricky, pay attention to parent-daughter function inheritence p = ggplot(rda.env.patient1, aes(x = date.month, y = temperature)) p + stat_smooth() + geom_boxplot(aes(group = date.month, color=season)) + geom_point(aes(color=season)) +scale_x_discrete(limits=c(1:12)) p = ggplot(rda.env.patient1, aes(x = date.month, y = temperature)) p + stat_smooth() + geom_point(aes(color=season)) +scale_x_discrete(limits=c(1:12)) label = rep("", dim(rda.env)[1]) label[check_outlier(log(rda.env$particle))] = as.character(rda.env$location[check_outlier(log(rda.env$particle))]) p = ggplot(rda.env, aes(x = season, y = log(particle), color=season, size = sd.particle, group = season)) p + geom_boxplot() + geom_jitter(aes(color=season), width=0.2) + geom_text_repel(aes(label=label), na.rm=TRUE, size=3) + theme_grey(base_size=15) label = rep("", dim(rda.env)[1]) label[check_outlier(rda.env$humid)] = as.character(rda.env$location[check_outlier(rda.env$humid)]) p = ggplot(rda.env.patient1, aes(x = date.month, y = humid)) p + stat_smooth() + geom_boxplot(aes(group=date.month, color=season)) + geom_point(aes(color=season)) +scale_x_discrete(limits=c(1:12)) #+ geom_text_repel(aes(label=label), na.rm=TRUE, size=3) p = ggplot(rda.env.patient1, aes(x = date.month, y = humid)) p + stat_smooth() + geom_point(aes(color=season)) +scale_x_discrete(limits=c(1:12)) #+ geom_text_repel(aes(label=label), na.rm=TRUE, size=3) #apply this to our data check_outlier(rda.env$duration)#this checks the outlier of the vector label = rep("", dim(rda.env)[1]) label[check_outlier(rda.env$duration)] = as.character(rda.env$ownership[check_outlier(rda.env$duration)]) p = ggplot(rda.env, aes(x = season, y = duration, color=season)) p + geom_boxplot() + geom_jitter() + geom_text_repel(aes(label=label), na.rm=TRUE) ``` ##GLM for every feature ```{r GLM for every feature} #library(hier.part) #can only take 13 variables, TRASH library(MuMIn) library(hier.part) library(relaimpo) library(ggtern) library(ggalt) if(FALSE){ #direct test based only on owner for four people data sss = cbind.data.frame(t(fourpeople.select.master.asinh.cpm), owner = fourpeople.select$aownership, location = fourpeople.select$location) sss.melt = melt(sss, id.vars = c("owner", "location")) sss.results = compare_means(formula = value ~ location, data = sss.melt, group.by = "variable", p.adjust.method = "fdr", method= "kruskal.test" ) #examining patterns for the four people study ggplot(sss, aes(x=location, y=Alkanindiges, label=owner)) + geom_boxplot() + geom_beeswarm() + geom_text_repel() + stat_compare_means() } ### Three groups technical, env, locabehav #all.select.db imp.data = rda.data[ , order(colSums(rda.data),decreasing=TRUE)] imp.data = imp.data[, apply(imp.data, 2, function(x){ sum(x>0)>=100})] #occuring in more than 100 samples imp.df = data.frame(matrix(nrow=12)) imp.rsq = c() imp.adjrsq = c() imp.p = c() #can only work on 12 variables or less so far, hence cutoff set at 12, the last few are the least important anyway. if(FALSE){ for (name in colnames(imp.data)){ print(name) hier.results = hier.part(rda.data[,name], all.select.db[,1:12], barplot = "FALSE", gof ="Rsqu") lm.fit = lm(rda.data[,name]~., data=all.select.db[,1:12]) lm.summary = summary(lm.fit) imp.rsq = c(imp.rsq, lm.summary$r.squared) imp.adjrsq = c(imp.adjrsq, lm.summary$adj.r.squared) imp.df = cbind(imp.df, hier.results$I.perc) print(hier.results$I.perc) imp.p = c(imp.p, pf(lm.summary$fstatistic[1], lm.summary$fstatistic[2], lm.summary$fstatistic[3], lower.tail=F)) } imp.df = imp.df[, -1] colnames(imp.df) = colnames(imp.data) imp.df.final = rbind(imp.df, R2 = imp.rsq, adjR2 = imp.adjrsq, p = imp.p) imp.df.final = rbind(imp.df.final, padjust = p.adjust(imp.df.final["p",], method="fdr")) imp.df.fin = t.data.frame(imp.df.final) write.csv(file = "~/Documents/Expo_paper/fvarpart.csv", x = imp.df.fin) } #using calc.relimp from the relaimpo package, no limitation on the number of testing variables imposed relaimp.df = data.frame(matrix(nrow=ncol(all.select.db))) relaimp.rsq = c() relaimp.adjrsq = c() relaimp.p = c() relaimp.var.y = c() for (name in colnames(imp.data)){ name="Stereum" print(name) lm.fit = lm(rda.data[,name]~., data=all.select.db) lm.rela = calc.relimp(lm.fit, type = c("lmg"), rela = TRUE) #this function actually has a groups option to combine variables into groups relaimp.df = cbind(relaimp.df, lm.rela@lmg) relaimp.var.y = c(relaimp.var.y, lm.rela@var.y) lm.summary = summary(lm.fit) relaimp.rsq = c(relaimp.rsq, lm.summary$r.squared) relaimp.adjrsq = c(relaimp.adjrsq, lm.summary$adj.r.squared) relaimp.p = c(relaimp.p, pf(lm.summary$fstatistic[1], lm.summary$fstatistic[2], lm.summary$fstatistic[3], lower.tail=F)) } relaimp.df = relaimp.df[, -1] colnames(relaimp.df) = colnames(imp.data) relaimp.df = relaimp.df * 100 relaimp.df.final = rbind(relaimp.df, R2 = relaimp.rsq, adjR2 = relaimp.adjrsq, p = relaimp.p, yvar = relaimp.var.y) relaimp.df.final = rbind(relaimp.df.final, padjust = p.adjust(relaimp.df.final["p",], method="fdr")) relaimp.df.fin = t.data.frame(relaimp.df.final) relaimp.df.fin = as.data.frame(relaimp.df.fin) write.csv(file = "~/Documents/Expo_paper/fvarpart_final_2.csv", x = relaimp.df.fin) env.index = c(3,4,6,7,9,11,14,17) locabehav.index = c(2,5,8,10,12,13,15,16) tech.index = c(1) env.imp = apply(relaimp.df.fin, 1, function(x){sum(x[env.index])}) locabehav.imp = apply(relaimp.df.fin, 1, function(x){sum(x[locabehav.index])}) tech.imp = apply(relaimp.df.fin, 1, function(x){sum(x[tech.index])}) r2 = as.numeric(relaimp.df.fin$R2) input = read.csv("~/Documents/Expo_paper/fvarpart_final.csv") #contains taxonomy info, otherwsie same to fvarpart_final_2.csv taxonomy = input$tax tern.df.raw = cbind.data.frame(Env = as.numeric(env.imp), Spa = as.numeric(locabehav.imp), Tec = as.numeric(tech.imp), R2 = r2, Padj = relaimp.df.fin$padjust, taxonomy = taxonomy) rownames(tern.df.raw) = rownames(relaimp.df.fin) tern.df.raw = tern.df.raw[as.numeric(tern.df.raw$Padj)< 0.05,] tern.df = cbind(Env= as.numeric(env.imp)/100*r2, Spa=as.numeric(locabehav.imp)/100*r2, Tec=as.numeric(tech.imp)/100*r2, R2 = r2, Padj=relaimp.df.fin$padjust, taxonomy = taxonomy) rownames(tern.df) = rownames(relaimp.df.fin) tern.df = as.data.frame(tern.df) tern.df = tern.df[as.numeric(tern.df$Padj)< 0.05,] tern.df[,1:5] = apply(tern.df[,1:5], 2, as.numeric) tern.df$taxonomy = factor(tern.df$taxonomy, levels = c("bacteria", "fungi", "plant", "animal")) tern.df$dominating = rep("norm", nrow(tern.df)) tern.df$dominating[tern.df$Env/tern.df$Spa>=1.5] = "Env" tern.df$dominating[tern.df$Spa/tern.df$Env>=1.5] = "Spa" tern.melt = melt(tern.df, id.vars=c("Env", "Spa", "Tec", "R2","Padj", "dominating")) tern.melt$value = factor(tern.melt$value, levels = c("bacteria", "fungi", "plant", "animal")) tern.melt$dominating = factor(tern.melt$dominating, levels = c("norm", "Env", "Spa")) #original ggtern(tern.melt,aes(Env,Spa,Tec, size=R2, color=value, group=value, fill=value)) + geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) + geom_point(alpha=0.5) + #geom_encircle(alpha=0.1,size=1) + theme_bw(base_size = 20) + theme_showarrows() + theme_notitles() + guides(shape=FALSE) + scale_color_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4"))+ scale_fill_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4"))+ facet_wrap( ~ value,scales="free", ncol=4) #introducing dominating influences ggtern(tern.melt,aes(Env,Spa,Tec, size=R2, color=dominating, group=value)) + geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) + geom_point(alpha=0.5) + #geom_encircle(alpha=0.1,size=1) + theme_bw(base_size = 20) + theme_showarrows() + theme_notitles() + guides(shape=FALSE) + scale_color_manual(values=c("#F8766D","#00AFBB", "#E7B800", "#00BFC4"))+ scale_fill_manual(values=c("#F8766D", "#00AFBB", "#E7B800", "#00BFC4"))+ facet_wrap( ~ value,scales="free", ncol=4) #plot all ggtern(tern.melt,aes(Env,Spa,Tec, color=dominating, size=R2, shape = value)) + #geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) + geom_point(alpha=0.5) + #geom_encircle(alpha=0.1,size=1) + theme_bw(base_size = 20) + theme_showarrows() + theme_notitles() + scale_color_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4"))+ scale_fill_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4")) #facet_wrap( ~ value,scales="free", ncol=4) #seems that have to do variance partition the hardcore way, planning to use relative of importance per variables as the value to calculate variance partitioning #individual testing fungi.thre = quantile(tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$R2, 0.5) sum(tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$Spa/tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$Env <= 1 & tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$R2>=fungi.thre) #Spa < env View(subset(tern.df.raw, taxonomy == "fungi" & Env/Spa >= 1.5 & R2>= fungi.thre)) sum(tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$Spa/tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$Env >= 1 & tern.df.raw[tern.df.raw$taxonomy=="fungi", ]$R2>=fungi.thre) #Spa > env View(subset(tern.df.raw, taxonomy == "fungi" & Spa/Env >= 1.5 & R2>= fungi.thre)) chisq.test(x=rbind(c(72/2,72/2),c(50,22))) #testing bac.thre = quantile(tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$R2, 0.5) sum(tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$Spa/tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$Env <= 1 & tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$R2>=bac.thre) #Spa < env View(subset(tern.df.raw, taxonomy == "bacteria" & Env/Spa >= 1.5 & R2>= bac.thre)) sum(tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$Spa/tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$Env >= 1 & tern.df.raw[tern.df.raw$taxonomy=="bacteria", ]$R2>=bac.thre) #Spa > env View(subset(tern.df.raw, taxonomy == "bacteria" & Spa/Env >= 1.5 & R2>= bac.thre)) chisq.test(x=rbind(c(21/2,21/2),c(13,8))) plant.thre = quantile(tern.df.raw[tern.df.raw$taxonomy=="plant", ]$R2, 0.5) sum(tern.df.raw[tern.df.raw$taxonomy=="plant", ]$Spa/tern.df.raw[tern.df.raw$taxonomy=="plant", ]$Env <= 1 & tern.df.raw[tern.df.raw$taxonomy=="plant", ]$R2>=plant.thre) #Spa < env View(subset(tern.df.raw, taxonomy == "plant" & Env/Spa >= 1.5)) sum(tern.df.raw[tern.df.raw$taxonomy=="plant", ]$Spa/tern.df.raw[tern.df.raw$taxonomy=="plant", ]$Env >= 1 & tern.df.raw[tern.df.raw$taxonomy=="plant", ]$R2>=plant.thre) #Spa > env View(subset(tern.df.raw, taxonomy == "plant" & Spa/Env >= 1.5)) ``` ##GLM test bootstrapping ```{r GLM for every feature} #library(hier.part) #can only take 13 variables, TRASH library(MuMIn) library(hier.part) library(relaimpo) library(ggtern) library(ggalt) ### Three groups technical, env, locabehav #all.select.db imp.data = rda.data[ , order(colSums(rda.data),decreasing=TRUE)] imp.data = imp.data[, apply(imp.data, 2, function(x){ sum(x>0)>=100})] #occuring in more than 100 samples #using calc.relimp from the relaimpo package, no limitation on the number of testing variables imposed cicalc = function(delta, ref){ #90% confidence interval delta = sort(delta) range = quantile(delta, c(0.05, 0.95)) ratio.ci = as.numeric(ref) - as.numeric(c(range[2], range[1])) names(ratio.ci) = c("5%", "95%") names(ref) = "reference" return(c(ref, ratio.ci)) } ecdf_fun = function(x,perc) {ecdf(x)(perc)} #retrun percentile given value in a distribution #This part takes a lot of time, about 20 hours, result is saved as a rds object relaimp.df = data.frame(matrix(nrow=3)) relaimp.rsq = c() relaimp.adjrsq = c() relaimp.p = c() relaimp.var.y = c() ratio.master = list() for (name in as.character(input$X[input$padjust<0.05])){ #only on genera that occured in more than 100 samples #name="Stereum" print(name) lm.fit = lm(rda.data[,name]~., data=all.select.db) lm.rela = calc.relimp(lm.fit, type = c("lmg"), rela = TRUE, groups = list(env= c(3,4,6,7,9,11,14,17)+1, spa=c(2,5,8,10,12,13,15,16)+1, tech=c(1)+1)) #this function actually has a groups option to combine variables into groups #group 1 is env, group 2 is spa, group 3 is tech reference.temp = lm.rela@lmg #relaimp.df = cbind(relaimp.df, lm.rela@lmg) relaimp.var.y = c(relaimp.var.y, lm.rela@var.y) lm.summary = summary(lm.fit) relaimp.rsq = c(relaimp.rsq, lm.summary$r.squared) relaimp.adjrsq = c(relaimp.adjrsq, lm.summary$adj.r.squared) refratio = reference.temp[1]/reference.temp[2] refenv = reference.temp[1] refloc = reference.temp[2] reftech = reference.temp[3] data.slice = rda.data[,name] set.seed = 123 bootstrapping.matrix = replicate(9999, sample(1:283, 283, replace = TRUE)) #each COLUMN is a sampling relaimp.df.temp = data.frame(matrix(nrow=3)) #relaimp.var.y.temp = c() for (i in 1:ncol(bootstrapping.matrix)){ #print(i) rda.data.temp = data.slice[bootstrapping.matrix[,i]] all.select.db.temp = all.select.db[bootstrapping.matrix[,i],] lm.fit.temp = lm(rda.data.temp~., data=all.select.db.temp) lm.rela.temp = calc.relimp(lm.fit.temp, type = c("lmg"), rela = TRUE, groups = list(env= c(3,4,6,7,9,11,14,17)+1, spa=c(2,5,8,10,12,13,15,16)+1, tech=c(1)+1)) #this function actually has a groups option to combine variables into groups relaimp.df.temp = cbind(relaimp.df.temp, lm.rela.temp@lmg) #relaimp.var.y.temp = c(relaimp.var.y.temp, lm.rela.temp@var.y) } relaimp.df.temp = relaimp.df.temp[,-1] resample.ratio = unlist(relaimp.df.temp[1,]/relaimp.df.temp[2,]) resample.env = unlist(relaimp.df.temp[1,]) resample.loc = unlist(relaimp.df.temp[2,]) resample.tech = unlist(relaimp.df.temp[3,]) deltaratio = refratio - resample.ratio deltaenv =refenv - resample.env deltaloc = refloc - resample.loc deltatech = reftech - resample.tech #range = quantile(deltaratio.sort, c(0.05, 0.95)) #ratio.ci = as.numeric(refratio) - as.numeric(c(range[2], range[1])) #names(ratio.ci) = c("5%", "95%") #names(refratio) = "reference" #ratio.ci ratio.master[[name]] = data.frame(ratio = cicalc(deltaratio, refratio), env = cicalc(deltaenv, refenv), loc = cicalc(deltaloc, refloc), tech = cicalc(deltatech, reftech), onequantile = ecdf_fun(resample.ratio,1) #quantile of probability of env/loc <= 1 ) } #saveRDS(object = ratio.master, file = "~/Documents/Revision2/bootstrapping.rds") ratio.master = readRDS(file = "~/Documents/Revision2/bootstrapping.rds") #extracting all information pertaining different group of variables moreenv = c() moreloc = c() rest = c() allquantile = c() allratio = c() allratioupper = c() allratiolower = c() allenv = c() allenvupper = c() allenvlower = c() allloc = c() alllocupper = c() allloclower = c() alltech = c() alltechupper = c() alltechlower = c() #chance of observing env/loc <= 1 in bootstrapping samples for(speciesname in names(ratio.master)){ allquantile = c(allquantile, ratio.master[[speciesname]]$onequantile[1]) allratio = c(allratio, ratio.master[[speciesname]]$ratio[1]) allratiolower = c(allratiolower, ratio.master[[speciesname]]$ratio[2]) allratioupper = c(allratioupper, ratio.master[[speciesname]]$ratio[3]) allenv = c(allenv, ratio.master[[speciesname]]$env[1]) allenvlower =c(allenvlower, ratio.master[[speciesname]]$env[2]) allenvupper = c(allenvupper, ratio.master[[speciesname]]$env[3]) allloc = c(allloc, ratio.master[[speciesname]]$loc[1]) allloclower =c(allloclower, ratio.master[[speciesname]]$loc[2]) alllocupper = c(alllocupper, ratio.master[[speciesname]]$loc[3]) alltech = c(alltech, ratio.master[[speciesname]]$tech[1]) alltechlower =c(alltechlower, ratio.master[[speciesname]]$tech[2]) alltechupper = c(alltechupper, ratio.master[[speciesname]]$tech[3]) temp.value = ratio.master[[speciesname]]$onequantile[1] if (temp.value >= 0.8) { print(temp.value) moreloc = c(moreloc, speciesname)} else if (temp.value <= 0.2) { print(temp.value) moreenv = c(moreenv, speciesname)} else {rest = c(rest, speciesname)} } #relaimp.rsq = c() #relaimp.adjrsq = c() #relaimp.p = c() #relaimp.var.y = c() taxinfo = read.csv("~/Documents/Expo_paper/fvarpart_final_v3.csv") #contains phylum info quantile.df.report = data.frame(name = names(ratio.master), #for paper tables Prob_of_spa_dominating = allquantile,#chance of observing env/loc <= 1 in bootstrapping samples env_spa_ratio = allratio, ratioupper = allratioupper, ratiolower = allratiolower, env = allenv, envupper = allenvupper, envlower = allenvlower, spa = allloc, spaupper = alllocupper, spalower = allloclower, tech = alltech, techupper = alltechupper, techlower = alltechlower, rsq = relaimp.rsq, adjrsq = relaimp.adjrsq, taxon = input$tax[match(names(ratio.master),input$X)] ) quantile.df.report$phylum = taxinfo$phylum[match(quantile.df.report$name, taxinfo$X)] write.csv(file = "~/Documents/Revision2/alldataCI.csv", x = quantile.df.report) #contains all relevant information for confidence interval estimation, dominating influence definition, and ternary plots quantile.df = data.frame(name = names(ratio.master), onequantile = allquantile,#chance of observing env/loc <= 1 in bootstrapping samples ratio = allratio, ratioupper = allratioupper, ratiolower = allratiolower, rsq = relaimp.rsq, adjrsq = relaimp.adjrsq, vary = relaimp.var.y, taxon = input$tax[match(names(ratio.master),input$X)] ) #saveRDS(quantile.df, file= "~/Documents/Revision2/quantile.df.rds") # simplied format of the quantile.df.report taxinfo = read.csv("~/Documents/Expo_paper/fvarpart_final_v3.csv") #contains phylum info quantile.df$phylum = taxinfo$phylum[match(quantile.df$name, taxinfo$X)] quantile.df = quantile.df[order(quantile.df$adjrsq,decreasing =TRUE),] quantile.df.select = quantile.df[1:199,] #taking all significant models #View(quantile.df.select) View(subset(quantile.df.select, taxon == "fungi")) #chance of observing env/loc <= 1 in bootstrapping samples View(subset(quantile.df.select, onequantile >= 0.9 & taxon == "fungi")) #new definition of dominating envriomnetal View(subset(quantile.df.select, onequantile <= 0.1 & taxon == "fungi")) #new definition of dominating spatial #find quantile of certain value View(subset(quantile.df.select, onequantile >= 0.9 & taxon == "bacteria")) #new definition of dominating envriomnetal View(subset(quantile.df.select, onequantile <= 0.1 & taxon == "bacteria")) View(subset(quantile.df.select, onequantile >= 0.9 & taxon == "plant")) #new definition of dominating envriomnetal View(subset(quantile.df.select, onequantile <= 0.1 & taxon == "plant")) View(subset(quantile.df.select, onequantile >= 0.9 & taxon == "animal")) #new definition of dominating envriomnetal View(subset(quantile.df.select, onequantile <= 0.1 & taxon == "animal")) #View(quantile.df[quantile.df$ratio >= 2 | quantile.df$ratio <= 1/2, ]) #View(quantile.df[quantile.df$onequantile >= 0.9 | quantile.df$onequantile <=0.1, ]) #View(quantile.df[quantile.df$onequantile >= 0.95 | quantile.df$onequantile <=0.05, ]) quantile.df.report$taxon = factor(quantile.df.report$taxon, levels = c("bacteria", "fungi", "plant", "animal")) #coordinating color #original #changing names of columns, only for plotting reasons colnames(quantile.df.report)[c(6,9,12,15,16,17,18)] = c("Env", "Spa", "Tec", "R2", "Adj.R2", "Taxon", "Phylum") ggtern(quantile.df.report,aes(Env,Spa,Tec, size=R2, color=Taxon, group=Taxon, fill=Taxon)) + geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) + geom_point(alpha=0.5) + #geom_encircle(alpha=0.1,size=1) + theme_bw(base_size = 20) + theme_showarrows() + theme_notitles() + guides(shape=FALSE) + scale_color_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4"))+ scale_fill_manual(values=c("#F8766D", "#B79F00", "#00BA38", "#00BFC4"))+ facet_wrap( ~ Taxon,scales="free", ncol=4) #preparing dominating variable quantile.df.report$Dominating = "Neither" quantile.df.report$Dominating[quantile.df.report$Prob_of_spa_dominating>=0.9] = "Spa" quantile.df.report$Dominating[quantile.df.report$Prob_of_spa_dominating<=0.1] = "Env" quantile.df.report$Dominating = factor(quantile.df.report$Dominating, levels = c("Neither", "Env", "Spa")) #introducing dominating influences ggtern(quantile.df.report,aes(Env,Spa,Tec, size=R2, color=Dominating, group=Taxon)) + geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) + geom_point(alpha=0.5) + #geom_encircle(alpha=0.1,size=1) + theme_bw(base_size = 20) + theme_showarrows() + theme_notitles() + guides(shape=FALSE) + scale_color_manual(values=c("#F8766D","#00AFBB", "#E7B800", "#00BFC4"))+ scale_fill_manual(values=c("#F8766D", "#00AFBB", "#E7B800", "#00BFC4"))+ facet_wrap( ~ Taxon,scales="free", ncol=4) #plotting each domain of life separately for better control ggtern(subset(quantile.df.report, Taxon == "fungi"),aes(Env,Spa,Tec, size=R2, color=Dominating, group=Taxon, shape = Phylum)) + geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) + geom_point(alpha=0.5) + #geom_encircle(alpha=0.1,size=1) + theme_bw(base_size = 20) + theme_showarrows() + theme_notitles() + guides(shape=FALSE) + scale_color_manual(values=c("#778899","#00AFBB", "#E7B800", "#00BFC4"))+ scale_fill_manual(values=c("#778899", "#00AFBB", "#E7B800", "#00BFC4"))+ scale_shape_manual(values = c(16, 18)) + scale_size_continuous(limits = c(0.1, 0.4)) #facet_wrap( ~ Taxon,scales="free", ncol=4) #bacteria ggtern(subset(quantile.df.report, Taxon == "bacteria"),aes(Env,Spa,Tec, size=R2, color=Dominating, group=Taxon)) + geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) + geom_point(alpha=0.5) + #geom_encircle(alpha=0.1,size=1) + theme_bw(base_size = 20) + theme_showarrows() + theme_notitles() + guides(shape=FALSE) + scale_color_manual(values=c("#778899","#00AFBB", "#E7B800", "#00BFC4"))+ scale_fill_manual(values=c("#778899", "#00AFBB", "#E7B800", "#00BFC4"))+ scale_shape_manual(values = c(16, 18)) + scale_size_continuous(limits = c(0.1, 0.4)) #make the smaller dots bigger when sample size is smaller #plant ggtern(subset(quantile.df.report, Taxon == "plant"),aes(Env,Spa,Tec, size=R2, color=Dominating, group=Taxon, shape = Phylum)) + geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) + geom_point(alpha=0.5) + #geom_encircle(alpha=0.1,size=1) + theme_bw(base_size = 20) + theme_showarrows() + theme_notitles() + guides(shape=FALSE) + scale_color_manual(values=c("#778899","#00AFBB", "#E7B800", "#00BFC4"))+ scale_fill_manual(values=c("#778899", "#00AFBB", "#E7B800", "#00BFC4"))+ scale_shape_manual(values = c(16, 18)) + scale_size_continuous(limits = c(0.1, 0.3)) #ANIAMLS ggtern(subset(quantile.df.report, Taxon == "animal"),aes(Env,Spa,Tec, size=R2, color=Dominating, group=Taxon, shape = Phylum)) + geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) + geom_point(alpha=0.5) + #geom_encircle(alpha=0.1,size=1) + theme_bw(base_size = 20) + theme_showarrows() + theme_notitles() + guides(shape=FALSE) + scale_color_manual(values=c("#778899","#00AFBB", "#E7B800", "#00BFC4"))+ scale_fill_manual(values=c("#778899", "#00AFBB", "#E7B800", "#00BFC4"))+ scale_shape_manual(values = c(16, 18)) + scale_size_continuous(limits = c(0.10, 0.35)) #facet_wrap( ~ Taxon,scales="free", ncol=4) #plot all ggtern(quantile.df.report,aes(Env,Spa,Tec, size=R2, color=dominating, shape=Taxon)) + #geom_confidence_tern(breaks = c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9), contour=TRUE, alpha=0.8, size=0.2) + geom_point(alpha=0.5) + #geom_encircle(alpha=0.1,size=1) + theme_bw(base_size = 20) + theme_showarrows() + theme_notitles() + scale_color_manual(values=c("#778899","#00AFBB", "#E7B800", "#00BFC4"))+ scale_fill_manual(values=c("#778899","#00AFBB", "#E7B800", "#00BFC4")) #facet_wrap( ~ value,scales="free", ncol=4) #chi-sq test #143 fungi, 60 ascomycota, 83 basidiomycota #%80 #dominating environmental chisq.test(x=rbind(c(29*60/143,29*83/143),c(0,29))) #dominating spatial chisq.test(x=rbind(c(35*60/143,35*83/143),c(26,9))) #%90 #dominating env 20/20 chisq.test(x=rbind(c(20*60/143,20*83/143),c(0,20))) #dominating spatial 19/21 chisq.test(x=rbind(c(21*60/143,21*83/143),c(19,2))) ``` ##GLM test permutation test, this is to calculate P values of every measure statistics, noted the p value is 1-calculated value ```{r GLM for every feature} #library(hier.part) #can only take 13 variables, TRASH library(MuMIn) library(hier.part) library(relaimpo) library(ggtern) library(ggalt) ### Three groups technical, env, locabehav #all.select.db imp.data = rda.data[ , order(colSums(rda.data),decreasing=TRUE)] imp.data = imp.data[, apply(imp.data, 2, function(x){ sum(x>0)>=100})] #occuring in more than 100 samples #using calc.relimp from the relaimpo package, no limitation on the number of testing variables imposed ecdf_fun = function(x,perc) {ecdf(x)(perc)} #retrun percentile given value in a distribution #This part takes a lot of time, about 20 hours, result is saved as a rds object #relaimp.df = data.frame(matrix(nrow=3)) #relaimp.rsq = c() #relaimp.adjrsq = c() #relaimp.p = c() #relaimp.var.y = c() perm.master = list() for (name in as.character(input$X[input$padjust<0.05])){ #only on genera that occured in more than 100 samples #name="Stereum" print(name) lm.fit = lm(rda.data[,name]~., data=all.select.db) lm.rela = calc.relimp(lm.fit, type = c("lmg"), rela = FALSE, groups = list(env= c(3,4,6,7,9,11,14,17)+1, spa=c(2,5,8,10,12,13,15,16)+1, tech=c(1)+1)) #this function actually has a groups option to combine variables into groups #group 1 is env, group 2 is spa, group 3 is tech reference.temp = lm.rela@lmg #relaimp.df = cbind(relaimp.df, lm.rela@lmg) refratio = reference.temp[1]/reference.temp[2] refenv = reference.temp[1] refloc = reference.temp[2] reftech = reference.temp[3] data.slice = rda.data[,name] set.seed = 123 bootstrapping.matrix = replicate(10000, sample(1:283, 283, replace = FALSE)) #each COLUMN is a sampling relaimp.df.temp = data.frame(matrix(nrow=3)) #relaimp.var.y.temp = c() for (i in 1:ncol(bootstrapping.matrix)){ #print(i) rda.data.temp = data.slice #only permutates data labels all.select.db.temp = all.select.db[bootstrapping.matrix[,i],] #only permutate data labels lm.fit.temp = lm(rda.data.temp~., data=all.select.db.temp) lm.rela.temp = calc.relimp(lm.fit.temp, type = c("lmg"), rela = FALSE, groups = list(env= c(3,4,6,7,9,11,14,17)+1, spa=c(2,5,8,10,12,13,15,16)+1, tech=c(1)+1)) #this function actually has a groups option to combine variables into groups relaimp.df.temp = cbind(relaimp.df.temp, lm.rela.temp@lmg) #relaimp.var.y.temp = c(relaimp.var.y.temp, lm.rela.temp@var.y) } relaimp.df.temp = relaimp.df.temp[,-1] #first column was artifical #permutation results resample.ratio = unlist(relaimp.df.temp[1,]/relaimp.df.temp[2,]) resample.env = unlist(relaimp.df.temp[1,]) resample.loc = unlist(relaimp.df.temp[2,]) resample.tech = unlist(relaimp.df.temp[3,]) ratiopercentile = ecdf_fun(resample.ratio, refratio) envpercentile =ecdf_fun(resample.env, refenv) #essentially 1, so lower than minimum possible p value locpercentile = ecdf_fun(resample.loc, refloc) #essentially 1, so lower than minimum possible p value techpercentile = ecdf_fun(resample.tech, reftech) #range = quantile(deltaratio.sort, c(0.05, 0.95)) #ratio.ci = as.numeric(refratio) - as.numeric(c(range[2], range[1])) #names(ratio.ci) = c("5%", "95%") #names(refratio) = "reference" #ratio.ci perm.master[[name]] = data.frame(ratioperc = ratiopercentile, env = envpercentile, loc = locpercentile, tech = techpercentile ) } saveRDS(object = perm.master, file = "~/Documents/Revision2/permutation.rds") perm.master.matrix = matrix(unlist(perm.master), ncol=4) rownames(perm.master.matrix) = names(perm.master) colnames(perm.master.matrix) = c("ratioperc", "env", "loc", "tech") ``` A remarkable feature of the `glmnet` function is that it fits the model not only for one choice of $\lambda$, but for all possible $\lambda$s at once. For now, let's look at the prediction performance for, say, $\lambda=0.04$. The name of the function parameter is `s`: ```{general_nested_cv_function} #data.frame has features as rows and samples as columns cvnested = function(data.frame, k, p){ #data.frame to train on, k is for outter fold, p is for inner fold print(ncol(data.frame)) flds <- createFolds(1:ncol(data.frame), k = k, list = TRUE, returnTrain = FALSE) #print(length(flds)) for (i in 1:length(flds)){#outter test.outter = data.frame[, c(flds[[i]])] #print(flds[[i]]) train.outter = data.frame[, -c(flds[[i]])] #print(dim(train.outter)) #the following part is needed for models without CV builtin to evaluate model parameter, such like lambda in glmnet, glmnet for example would not need this step flds.inner = createFolds(1:ncol(train.outter), k = p, list = TRUE, returnTrain = FALSE) for (j in 1:length(flds.inner)){#inner test.inner = train.outter[, c(flds.inner[[j]])] #print(flds.inner[[j]]) train.inner = train.outter[, -c(flds[[j]])] #print(dim(train.inner)) }#inner #test model selected by inner on the outter test set }#outter #fit the whole dataset to modle chosen, final model } cvnested(campushome.master.asinh.cpm, 10,10) ``` ##ML for season======================Machine learning for season ```{r formal_glmnet_model_validation} #data.frame has features as rows and samples as columns #check model performance function, arguments are vectors of predicted and actual labels check.model.performance = function(predicted.class, actual.class){ result.tbl = as.data.frame(table(as.character(predicted.class),as.character(actual.class))) colnames(result.tbl)[1:2] = c("Pred","Truth") #print(result.tbl) control = 0 ##MACRO averaging here for (class in unique(result.tbl$Truth) ){ control = control + 1 #print(class) tp = sum(result.tbl[result.tbl$Pred==class & result.tbl$Truth==class, "Freq"]) tpandfp = sum(result.tbl[result.tbl$Pred == class , "Freq" ]) #TP plus FP fp = tpandfp - tp tpandfn = sum(result.tbl[result.tbl$Truth == class , "Freq" ]) #TP plus FN fn = tpandfn - tp tn = sum(result.tbl[result.tbl$Truth != class & result.tbl$Pred != class, "Freq"]) #print(tn) presi = tp/tpandfp rec = tp/tpandfn accuracy = (tp + tn)/(tp+tn+fp+fn) spec = tn/(tn + fp) F.score = 2*presi*rec/(presi+rec) MCC = ((tp*tn)-(fn*fp))/sqrt((tp+fn)*(tn+fp)*(tp+fp)*(tn+fn)) if (control == 1 ) metrics = tibble(class = as.character(class), presi = as.numeric(presi), recall = as.numeric(rec), accuracy = as.numeric(accuracy), spec = as.numeric(spec), F.score = as.numeric(F.score), MCC = as.numeric(MCC)) if (control > 1 ) metrics = rbind(metrics,c(as.character(class), as.numeric(presi), as.numeric(rec), as.numeric(accuracy), as.numeric(spec), as.numeric(F.score), as.numeric(MCC))) } return(metrics) } #cvnest function, p is not used for glmnet predictions cvnested = function(data.frame, truth.frame, external.frame, external.truth, k, p){ #data.frame to train on, k is for outter fold, p is for inner fold print(ncol(data.frame)) flds <- createFolds(1:ncol(data.frame), k = k, list = TRUE, returnTrain = FALSE) auc = c() predict.values.df = data.frame(spring=c(), summer=c(), fall=c(), winter=c()) truth.values = c() #print(length(flds)) for (i in 1:length(flds)){#outter test.outter = data.frame[, c(flds[[i]])] truth.test.outter = truth.frame$season[c(flds[[i]])] #print(truth.test.outter) #print(flds[[i]]) train.outter = data.frame[, -c(flds[[i]])] truth.train.outter = truth.frame$season[-c(flds[[i]])] fraction = table(truth.train.outter)/length(truth.train.outter) weights = 1 - fraction[as.character(truth.train.outter)] #print(truth.train.outter) #print(dim(train.outter)) #cv.glmnet takes care of inner loop training cvglmfit = cv.glmnet(x = t(train.outter), y = factor(truth.train.outter), family = "multinomial", weights = weights) s0 = cvglmfit$lambda.1se #hyper-parameter #fit train dataset with given hyper-parameter glmfit = glmnet(x = t(train.outter), y = factor(truth.train.outter), family = "multinomial") #predict on testing dataset pred = predict(glmfit, t(test.outter), type = "class", s = s0) pred.values = predict(glmfit, t(test.outter), type = "response", s = s0) pred.values.df = data.frame(pred.values) colnames(pred.values.df) = c("spring", "summer", "fall", "winter") predict.values.df = rbind(predict.values.df, pred.values.df) truth.values=c(truth.values, truth.test.outter) confusion_table = table(predicted = pred, truth = factor(truth.test.outter)) #print(confusion_table) mROC = multiclass.roc(as.numeric(factor(truth.test.outter)), as.numeric(factor(pred[,1]))) #print(auc(mROC)) auc = c(auc, mROC$auc) #test model selected by inner to the outter test set }#outter #print(dim(predict.values.df)) #print(rownames(predict.values.df)) #print(truth.values) #this is not necessary as the truth remains to be the truth #reodering the dataframe to be consistent between resamplings. predict.values.df = predict.values.df[order(rownames(predict.values.df)),] #fit build model on whole dataset and testing on external dataset fraction = table(truth.frame$season)/length(truth.frame$season) weights = 1 - fraction[as.character(truth.frame$season)] cvglmfit.final = cv.glmnet(x = t(data.frame), y = factor(truth.frame$season), family = "multinomial", weights = weights) s0.model = cvglmfit.final$lambda.1se glmfit.final = glmnet(x = t(data.frame), y = factor(truth.frame$season), family = "multinomial") #use on external dataset print(paste0("the s0 used is ", s0.model)) pred.external = predict(glmfit.final, t(external.frame), type = "class", s=s0.model) confusion_table.external = table(predicted = factor(pred.external, levels=c("spring", "summer", "fall","winter")), truth = factor(external.truth$season)) print(confusion_table.external) metrics = check.model.performance(factor(pred.external, levels=c("spring", "summer", "fall","winter")),factor(external.truth$season)) #print(metrics) mROC.external = multiclass.roc(as.numeric(factor(external.truth$season)), as.numeric(factor(pred.external[,1]))) print(mROC.external$auc) return(list(auc = auc, metrics=metrics, predict.values=predict.values.df)) } #auc.result = cvnested(campushome.master.asinh.cpm, campushome, 10,5) performance.result.cv = replicate(10,cvnested(Mike_NAmerica.master.asinh.cpm, Mike_NAmerica, patient1.master.asinh.cpm, Patient1, 10,5)) performance.result.cv = replicate(2,cvnested(Mike_NAmerica.master.asinh.cpm, Mike_NAmerica, patient1.master.asinh.cpm, Patient1, 10,5)) #saveRDS(performance.result.cv, "performance.result.rds") #this combination seems to make sense so far, with mAUC around 0.7 for internal CV and external datasets. q = ggplot(data.frame(auc=unlist(c(performance.result.cv[1,]))), aes(x="glmnet", y=auc))+ geom_violin(width=0.5, fill="light blue",draw_quantiles = c(0.25, 0.5, 0.75))+ geom_quasirandom(alpha=0.6, width = 0.1 ) + ylim(c(0,1)) + annotate("text", x=1, y=0.3, label = paste0("mean mAUC = ", signif(mean(unlist(c(performance.result.cv[1,]))), 3))) + annotate("text", x=1, y=0.25, label = paste0("median mAUC = ", signif(median(unlist(c(performance.result.cv[1,]))), 3))) +scale_color_npg()+ ylab("mAUC") + theme_pubr() + theme( axis.text.x=element_blank()) + xlab("Glmnet") q = q + theme(axis.text=element_text(size=12), axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14)) #plotting other metrics, by MACRO average over classes wtf =lapply(performance.result.cv[2,], function(x){apply(x[,-1], 2, function(x) { mean(as.numeric(x))})}) wtf.matrix = matrix(unlist(wtf), nrow=10, byrow=TRUE, dimnames = list(c(1:10),c("Precison","Sensitivity","Accuracy", "Specificity", "F.score", "MCC"))) wtf.melt = melt(wtf.matrix) p = ggplot(wtf.melt, aes(x=Var2, y=value, color=Var2)) + geom_quasirandom(alpha=0.6, groupOnX=TRUE, bandwidth = 0.1, width=0.1, size=3) + theme_pubr() + theme(legend.title = element_blank()) + ylim(c(0,1))+ylab("Macro average value") +xlab("Performance metrics") + scale_color_npg() #+ geom_violin( draw_quantiles = c(0.25, 0.5, 0.75)) p = p +theme(axis.text=element_text(size=10), axis.text.x = element_blank(), axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14)) +theme(legend.position="top") ggplot_build(p)$data ggarrange(q, p, ncol = 2, widths=(c(2,2))) ``` ##ML for control/blank======== ```{r ML for control/blank} scores_single = scores[-grep("double|I15_2|I15_3", scores$samplenames, ignore.case=TRUE), ] master.asinh.cpm.cb = master.asinh.cpm[, scores_single$samplenames] #fit build model on whole dataset and testing on external dataset fraction = table(scores_single$Type)/length(scores_single$Type) weights = 1 - fraction[as.character(scores_single$Type)] cvglmfit.final = cv.glmnet(x = t(master.asinh.cpm.cb), y = factor(scores_single$Type), family = "binomial", weights = weights) s0.model = cvglmfit.final$lambda.1se glmfit.final = glmnet(x = t(master.asinh.cpm.cb), y = factor(scores_single$Type), family = "binomial") pred.cb = predict(glmfit.final, t(master.asinh.cpm.cb), type = "class", s=s0.model-0.08) confusionMatrix(data=pred.cb, reference=scores_single$Type, mode = "everything") ``` ##getting mean prediction values from the pooling prediction results of the testing data, which is generated by combining prediction results using model constructed by internal CV on each testing slice, each resampling will generate a new matrix. ```{r getting the mean prediction values} #initiating empty tibbles with rownames. spring=tibble(row.names = rownames(performance.result.cv[3,1]$predict.values)) summer=tibble(row.names = rownames(performance.result.cv[3,1]$predict.values)) fall=tibble(row.names = rownames(performance.result.cv[3,1]$predict.values)) winter=tibble(row.names = rownames(performance.result.cv[3,1]$predict.values)) for (i in 1:length(performance.result.cv[3,])){ temp.df = as.tibble(performance.result.cv[3,i]$predict.values) spring = cbind(spring, temp.df$spring) summer = cbind(summer, temp.df$summer) fall = cbind(fall, temp.df$fall) winter = cbind(winter, temp.df$winter) } #plot mean var relationship for probabilities par(mfrow=c(2,2)) plot(apply(spring[,-1], 1, mean), apply(spring[,-1], 1, var), xlab="mean", ylab="var", main="mean-variance relationship for spring predictions") plot(apply(summer[,-1], 1, mean), apply(summer[,-1], 1, var),xlab="mean", ylab="var",main="mean-variance relationship for summer predictions") plot(apply(fall[,-1], 1, mean), apply(fall[,-1], 1, var),xlab="mean", ylab="var",main="mean-variance relationship for fall predictions") plot(apply(winter[,-1], 1, mean), apply(winter[,-1], 1, var),xlab="mean", ylab="var",main="mean-variance relationship for winter predictions") par(mfrow=c(2,2)) plot(apply(spring[,-1], 1, mean), apply(spring[,-1], 1, median),xlab="mean", ylab="median",main="mean-median relationship for spring predictions") plot(apply(summer[,-1], 1, mean), apply(summer[,-1], 1, median), xlab="mean", ylab="median",main="mean-median relationship for summer predictions") plot(apply(fall[,-1], 1, mean), apply(fall[,-1], 1, median), xlab="mean", ylab="median",main="mean-median relationship for fall predictions") plot(apply(winter[,-1], 1, mean), apply(winter[,-1], 1, median), xlab="mean", ylab="median",main="mean-median relationship for winter predictions") #taking mean across the number of resamplings for each sample finalvalues = tibble(spring = apply(spring[,-1], 1, mean), summer = apply(summer[,-1], 1, mean), fall = apply(fall[,-1], 1, mean), winter = apply(winter[,-1], 1, mean)) #rename the row for the tibble row.names(finalvalues) = spring$row.names #generate truth vector according to the sample names finaltruth = Mike_NAmerica$season[match(row.names(finalvalues), Mike_NAmerica$samplenames)] #at this point, final values store a tibble for mean prediction scores for each sample across 4 seasons, finaltruth store their respective truth values ``` ##formal fitting, this step prepares for further steps ```{r formal fitting} #finding the best lambda set.seed(1230123) set.seed(seed = NULL) fraction = table(Mike_NAmerica$season)/length(Mike_NAmerica$season) weights = 1 - fraction[as.character(Mike_NAmerica$season)] #results = replicate(3, cv.glmnet(x = t(Mike_NAmerica.master.asinh.cpm), # y = factor(Mike_NAmerica$season), # family = "multinomial", weights = weights)) set.seed(seed = NULL) cvglmfit.final = cv.glmnet(x = t(Mike_NAmerica.master.asinh.cpm), y = factor(Mike_NAmerica$season), family = "multinomial", weights = c(weights)) #plot(cvglmfit) s0 = cvglmfit.final$lambda.1se print(s0) #good at 0.053 glmfit = glmnet(x = t(Mike_NAmerica.master.asinh.cpm), y = factor(Mike_NAmerica$season), family = "multinomial", intercept=FALSE) #pred.internal.rawvalues = predict(glmfit, t(Mike_NAmerica.master.asinh.cpm), # type = "response", s = s0) #provides original values of predictions for each category pred.internal = predict(glmfit, t(Mike_NAmerica.master.asinh.cpm), type = "class", s = s0) pred.internal.values = predict(glmfit, t(Mike_NAmerica.master.asinh.cpm), type = "response", s = s0) pred.external = predict(glmfit, t(patient1.master.asinh.cpm), type = "class", s = s0) pred.external.values = predict(glmfit, t(patient1.master.asinh.cpm), type = "response", s = s0) pred.fourpeople = predict(glmfit, t(four.master.asinh.cpm), type="class", s=s0) #internal fitting, sanity check confusion_table.internal = table(predicted = pred.internal, truth = factor(Mike_NAmerica$season)) confusion_table.internal confusionMatrix(data=factor(pred.internal, levels=c("spring", "summer", "fall","winter")), reference=factor(Mike_NAmerica$season), mode = "everything") mROC.internal = multiclass.roc(as.numeric(factor(Mike_NAmerica$season)), as.numeric(factor(pred.internal, levels=c("spring", "summer", "fall", "winter")))) mROC.internal$auc #external validation confusion_table.external = table(factor(pred.external, levels=c("spring", "summer", "fall","winter")), truth = factor(Patient1$season)) confusion_table.external ##this fucntion also provides a bunch of performance statistics reference.result = confusionMatrix(data=factor(pred.external, levels=c("spring", "summer", "fall","winter")), reference=factor(Patient1$season), mode = "everything") reference.result #uses the performance function defined in last section, self-written function performance.result = check.model.performance(factor(pred.external, levels=c("spring", "summer", "fall","winter")),factor(Patient1$season)) apply(performance.result[,-1], 2, function(x) {mean(as.numeric(x))}) mROC = multiclass.roc(as.numeric(factor(Patient1$season)), as.numeric(factor(pred.external[,1]))) mROC$auc #four people data confusion_table.fourpeople = table(predicted = pred.fourpeople, truth = factor(fourpeople$season)) confusion_table.fourpeople confusionMatrix(data=factor(pred.fourpeople, levels=c( "fall","winter")), reference=factor(fourpeople$season)) mROC.fourpeople = multiclass.roc(as.numeric(factor(fourpeople$season)), as.numeric(factor(pred.fourpeople[,1]))) mROC.fourpeople #RF for fun model = randomForest(x=t(Mike_NAmerica.master.asinh.cpm),y=factor(Mike_NAmerica$season)) pred.rf = predict(model,newdata=t(Mike_NAmerica.master.asinh.cpm),type='class')#[,'TRUE',drop=TRUE] confusion_table.rf = table(predicted = pred.rf, truth = factor(Mike_NAmerica$season)) confusion_table.rf pred.rf.external = predict(model,newdata=t(patient1.master.asinh.cpm),type='class') confusion_table.rf.external = table(predicted = pred.rf.external, truth = factor(Patient1$season)) confusion_table.rf.external #### ##this is to fine tune the model on external dataset, it seems s0=0.055 is great for BOTH internal and EXTERNAL datasets. This practice is over-optimistic if(FALSE){ for (i in seq(0.01, 0.06, 0.003)){ print(i) pred.external = predict(glmfit, t(patient1.master.asinh.cpm), type = "class", s = i) confusion_table.external = table(predicted = pred.external, truth = factor(Patient1$season)) mROC = multiclass.roc(as.numeric(factor(Patient1$season)), as.numeric(factor(pred.external[,1]))) print(mROC$auc) } } plot(glmfit, xvar = "norm", col = RColorBrewer::brewer.pal(12, "Set3"), lwd = sqrt(3)) # standardize coefficients, is this necessary?? #if X is the input matrix of the glmnet function, # and cv.result is your glmnet object: sds <- apply(t(Mike_NAmerica.master.asinh.cpm), 2, sd) coefs <- as.matrix(coef(glmfit, s = s0)[["spring"]]) std_coefs <- coefs[-1, 1] / sds ``` ##multiclass auc ```{r multiclass auc} #use bind_cols and bind_rows for tibbles cal_auc = function(finalvalues, finaltruth, zero_label, one_label){ #zero_label="spring" #one_label="summer" colnames(finalvalues) = c("spring", "summer", "fall", "winter") final.df = bind_cols(truth=finaltruth, finalvalues) points = tibble(label=character(), prob=double()) for (i in 1:dim(final.df)[1]){ if (final.df$truth[i] == zero_label | final.df$truth[i] == one_label){ #print(as.character(final.df$truth[i]))# #print(as.double(final.df[i, zero_label])) points = bind_rows(points, tibble(label = as.character(final.df$truth[i]), prob = as.numeric(final.df[i, zero_label]))) } } #print(dim(points)) colnames(points) = c("label", "zero_label_prob") #sorted by prob values, then we get ranks points = points[order(points$zero_label_prob),] #print(points) #class(points) n0 = 0 n1 = 0 sum_ranks = 0 #i is rank here for (i in 1:dim(points)[1]){ if (points$label[i] == zero_label) { n0 = n0 + 1 sum_ranks = sum_ranks + i } if (points$label[i] == one_label){ n1 = n1 + 1 } } #print(n0) #print(n1) #print(sum_ranks) return((sum_ranks - (n0*(n0+1)/2.0)) / (n0 * n1)) } cal_auc(finalvalues, finaltruth, "spring", "summer") #function generating all possible combinations from a vector, store in data.frame combinations = function(vector){ combos = tibble(a=character(), b=character()) for (i in 1:(length(vector)-1)){ for (j in (i+1):length(vector)){ #slice = c(as.character(vector[i]), as.character(vector[j])) #names(slice) = c("a", "b") #print(vector[j]) combos = bind_rows(combos, c(a = as.character(vector[i]), b= as.character(vector[j]))) } } #print(combos) return(combos) } combos = combinations(seasons) #calculating mAUC values #combos is the combinations between two labels cal_mAUC = function(finalvalues, finaltruth, combos){ colnames(finalvalues) = c("spring", "summer", "fall", "winter") sums = 0 mAUC = 0 for (i in 1:dim(combos)[1]){ sums = sums + (cal_auc(finalvalues, finaltruth, combos$a[i], combos$b[i]) + cal_auc(finalvalues, finaltruth, combos$b[i], combos$a[i]))/2 } mAUC = sums * 2 / (length(seasons) * (length(seasons)-1)) return(mAUC) } #this mAUC is calcualted from the pooling result of internal cv model on testing data, no external data is used #this is a relatively conservative measurement of the model performance on the training dataset cal_mAUC(finalvalues, finaltruth, combos) #for external validation dataset cal_mAUC(data.frame(pred.external.values), factor(Patient1$season), combos) #internal test cal_mAUC(data.frame(pred.internal.values), factor(Mike_NAmerica$season), combos) ``` ##multiclass roc plotting one vs all ```{r multiclass roc plotting one vs all} #one vs all method cal_roc_ova = function(finalvalues, finaltruth, zero_label){ #zero_label="spring" #one_label="summer" colnames(finalvalues)=c("spring","summer","fall","winter") final.df = bind_cols(truth=finaltruth, finalvalues) points = data.frame(label=character(), prob=double()) for (i in 1:dim(final.df)[1]){ if (final.df$truth[i] == zero_label){ #print(as.character(final.df$truth[i]))# #print(as.numeric(final.df[i, zero_label])) points = rbind(points, c(label = as.character(final.df$truth[i]), prob = as.numeric(final.df[i, zero_label]))) } else { points = rbind(points, c(0, prob = as.numeric(final.df[i, zero_label]))) }} #print(dim(points)) colnames(points) = c("label", "zero_label_prob") #sorted by prob values, then we get ranks points = points[order(points$zero_label_prob),] points$label=factor(points$label, levels =c(zero_label, 0)) points$zero_label_prob = as.numeric(points$zero_label_prob) return(points) } #points.ova = cal_roc_ova(finalvalues, finaltruth, "spring") #ggroc(roc(points.ova$label, points.ova$zero_label_prob, auc=TRUE, ci=TRUE, levels=c(0, "spring"))) #ggroc(roc(points.ova$label, points.ova$zero_label_prob, auc=TRUE, ci=TRUE, levels=c("spring", 0))) plots.ova=list() seasons = factor(seasons, levels=c("spring", "summer", "fall", "winter")) for (i in seasons){ points.ova = cal_roc_ova(finalvalues, finaltruth, i) #print(ggroc(roc(points.ova$label, points.ova$zero_label_prob, auc=TRUE, ci=TRUE, levels=c(0, i)))) print(roc(points.ova$label, points.ova$zero_label_prob, auc=TRUE, ci=TRUE, levels=c(0, i))) plots.ova[[i]] = roc(points.ova$label, points.ova$zero_label_prob, auc=TRUE, ci=TRUE, ci.type="shape", levels=c(0, i)) } ggroc(plots.ova, linetype=7, size=1.2, alpha=0.8) + geom_abline(slope=1, intercept=1,color="light grey") + theme_bw() + scale_colour_manual(values = c('spring' = "#00A087FF", 'summer' = "#4DBBD5FF", 'fall' = "#E64B35FF", 'winter' = "#3C5488FF" ), name = "Seasons") + annotate("text", label=paste0("AUC=",signif(plots.ova$fall$auc,3), " 95%CI: 0.487-0.661"), color="#E64B35FF", x= 0.22, y=0.15, size=4) + annotate("text", label=paste0("AUC=",signif(plots.ova$spring$auc,3), " 95%CI: 0.747-0.889"), color="#00A087FF", x= 0.22, y=0.10, size =4) + annotate("text", label=paste0("AUC=",signif(plots.ova$summer$auc,3), " 95%CI: 0.712-0.860"), color="#4DBBD5FF", x= 0.22, y=0.05, size =4)+ annotate("text", label=paste0("AUC=",signif(plots.ova$winter$auc,3), " 95%CI: 0.810-0.930"), color="#3C5488FF", x= 0.22, y=0.00, size =4)+ labs( x = "Specificity (1-FPR)", y = "Sensitivity (TPR)", size =5) + theme(axis.text=element_text(size=12), axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14)) #ROC plots for validation dataset plots.ova.external=list() for (i in seasons){ points.ova.external = cal_roc_ova(data.frame(pred.external.values), factor(Patient1$season), i) #print(ggroc(roc(points.ova$label, points.ova$zero_label_prob, auc=TRUE, ci=TRUE, levels=c(0, i)))) print(roc(points.ova.external$label, points.ova.external$zero_label_prob, auc=TRUE, ci=TRUE, levels=c(0, i))) plots.ova.external[[i]] = roc(points.ova.external$label, points.ova.external$zero_label_prob, auc=TRUE, ci=TRUE, ci.type="shape", levels=c(0, i)) } ggroc(plots.ova.external, linetype=7, size=1.2, alpha=0.8) + geom_abline(slope=1, intercept=1,color="light grey") + theme_bw() + scale_colour_manual(values = c('spring' = "#00A087FF", 'summer' = "#4DBBD5FF", 'fall' = "#E64B35FF", 'winter' = "#3C5488FF" ), name = "Seasons") + annotate("text", label=paste0("AUC=",signif(plots.ova$fall$auc,3), " 95%CI: 0.5681-0.930"), color="#E64B35FF", x= 0.22, y=0.15, size=4) + annotate("text", label=paste0("AUC=",signif(plots.ova$spring$auc,3), " 95%CI: NaN-NaN"), color="#00A087FF", x= 0.22, y=0.10, size =4) + annotate("text", label=paste0("AUC=",signif(plots.ova$summer$auc,3), " 95%CI: 0.712-0.860"), color="#4DBBD5FF", x= 0.22, y=0.05, size =4)+ annotate("text", label=paste0("AUC=",signif(plots.ova$winter$auc,3), " 95%CI: 0.676-0.975"), color="#3C5488FF", x= 0.22, y=0.00, size =4)+ labs( x = "Specificity (1-FPR)", y = "Sensitivity (TPR)", size =5) + theme(axis.text=element_text(size=12), axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14)) ``` ##multiclass roc plotting pairs ```{r multiclass roc plotting pairs} #every pair method cal_roc = function(finalvalues, finaltruth, zero_label, one_label){ #zero_label="spring" #one_label="summer" final.df = bind_cols(truth=finaltruth, finalvalues) points = data.frame(label=character(), prob=double()) for (i in 1:dim(final.df)[1]){ if (final.df$truth[i] == zero_label | final.df$truth[i] == one_label){ #print(as.character(final.df$truth[i]))# #print(as.numeric(final.df[i, zero_label])) points = rbind(points, c(label = as.character(final.df$truth[i]), prob = as.numeric(final.df[i, zero_label]))) } } #print(dim(points)) colnames(points) = c("label", "zero_label_prob") #sorted by prob values, then we get ranks points = points[order(points$zero_label_prob),] points$label=factor(points$label, levels =c(zero_label, one_label)) points$zero_label_prob = as.numeric(points$zero_label_prob) return(points) } plots=list() for (i in 1:dim(combos)[1]){ #combos is defined in last section i=2 points.a = cal_roc(finalvalues, finaltruth, combos$a[i], combos$b[i]) points.b = cal_roc(finalvalues, finaltruth, combos$b[i], combos$a[i]) points.roc.a = roc(points.a$label, points.a$zero_label_prob, auc=TRUE, ci=TRUE) points.roc.b = roc(points.b$label, points.b$zero_label_prob, auc=TRUE, ci=TRUE) print(ggroc(list(plota = points.roc.a, plotb=points.roc.b))+ggtitle(paste0(combos$a[i],"-", combos$b[i]))) } ``` ##feature extraction================ ```{r glmnet final fitting and feature extraction} #for further plotting of features #taxons with a arbitrary absolute value threshold #use names(selecttaxons$season) to gain access to the names of taxons for respective seasons selecttaxons = list(spring =c(), summer=c(), fall=c(), winter=c()) #weights calcualted by percentage of sum of absolute coefficients(weight) weights = list(spring =c(), summer=c(), fall=c(), winter=c()) for (se in c("spring", "summer", "fall", "winter")){ se = "summer" coefs = coef(glmfit)[[se]][, which.min(abs(glmfit$lambda - s0))] coefs = sort(coefs, decreasing= TRUE) print(se) print(head(coefs,4)) print(tail(coefs,4)) weights[[se]] = c(abs(coefs[coefs != 0])/sum(abs(coefs[coefs != 0]))*100) selecttaxons[[se]] = c(coefs[abs(coefs) >0.08]) if(TRUE){ for (i in 1:4){ if (names(coefs)[i]!=""){ m = ggplot(as.data.frame(t(Mike_NAmerica.master.asinh.cpm)), aes_string(x=Mike_NAmerica$date.mid, y=Mike_NAmerica.master.asinh.cpm[names(coefs)[i],])) m1 = m + geom_point(aes(color=Mike_NAmerica$season)) + ylab(paste0(names(coefs)[i], " (normalized abundance)")) + theme_bw() + theme(legend.position="bottom", legend.title = element_blank()) + xlab("Date") + ggtitle(paste0(se, " positively contributing taxon, ", "coefficient = ", signif(coefs[i],3))) + scale_color_npg() print(m1) } } for (i in 1:4){ if (names(coefs)[length(coefs)+1-i]!=""){ n = ggplot(as.data.frame(t(Mike_NAmerica.master.asinh.cpm)), aes_string(x=Mike_NAmerica$date.mid, y=Mike_NAmerica.master.asinh.cpm[names(coefs)[length(coefs)+1-i],])) n1 = n + geom_point(aes(color=Mike_NAmerica$season)) + ylab(paste0(names(coefs)[length(coefs)+1-i], " (normalized abundance)")) + theme_bw() + theme(legend.position="bottom", legend.title = element_blank()) + xlab("Date") + ggtitle(paste0(se, " negatively contributing taxon, ", "coefficient = ", signif(coefs[length(coefs)+1-i],3)))+ scale_color_npg() print(n1) } } } } ``` ##heatmap for features==================== ```{r feature-heatmap} setwd("~/Documents/Bioinfo/DNAformal/heatmap") nameslist = c() for (i in 1:4){ nameslist = c(nameslist, selecttaxons[[i]]) } Mike_NA.heatmap.asinh.cpm = Mike_NAmerica.master.asinh.cpm[names(nameslist),] Mike_NA.heatmap.asinh.cpm[Mike_NA.heatmap.asinh.cpm==0] =NA Mike_NAmerica.season = Mike_NAmerica[order(Mike_NAmerica$season),] Mike_NA.heatmap.asinh.cpm.season = Mike_NA.heatmap.asinh.cpm[, Mike_NAmerica.season$samplenames] #df= data.frame(t(Mike_NA.heatmap.asinh.cpm), season=Mike_NAmerica$season) #df.melt = melt(df, id.vars = "season") #df.melt.scaled <- ddply(df.melt, .(variable), transform, rescale = rescale(value)) #tile = ggplot(df.melt.scaled, aes(x=season, y=variable, fill=value)) + geom_tile() #tile plot_aheatmap_genus = function(x, annotate.df, prefix="",main=NULL){ #-------------- # goi.obj: ExpressionSet object containg expression data(log cpm) of gene of interests and phenoData # main: title of plot #--------------- if (is.null(main)) main = "" wval = 12 hval = 15 # add color to col and row # CellType = rownames(x) # ann.col = data.frame("Humid" = annotate.df$X_Mean_Humidity, "Temperature" = annotate.df$Min_TemperatureC, "Particle" = annotate.df$total.particle, "Weekend" = annotate.df$weekend, "Season" = annotate.df$season, "Batch" = annotate.df$batch, rain = as.factor(annotate.df$is_there_rain), Precipitation = annotate.df$Precipitationmm) ann.col = data.frame(Season = annotate.df$season, "Humid" = annotate.df$median.humid, "Temperature" = annotate.df$median.temperature, "Particle" = annotate.df$total.particle) ann.row = data.frame(Season = c(rep("spring", length(selecttaxons[["spring"]])), rep("summer", length(selecttaxons[["summer"]])),rep("fall", length(selecttaxons[["fall"]])),rep("winter", length(selecttaxons[["winter"]])))) #color=colorRampPalette(c("#ffeda0", "#feb24c", "#f03b20"))(3) color = "-RdBu:50" #color = "-RdYlBu2:100" #annotation color, need to match level names and color value!!! seasoncolor = c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF") names(seasoncolor) = c("spring", "summer", "fall", "winter") #Var2 = c("lightgreen", "navy") ann_colors = list(Season = seasoncolor) # choose dist and hc methods distfunvals = c("euclidean", "pearson", "spearman") hclustfunvals = c("centroid", "average", "complete") distfunval = distfunvals[1] hclustfunval = hclustfunvals[2] #scale="row" # scale by row # NA is different from FALSE for Colv and Rowv, FALSE means clustering is still calcuated just not shown. NA means not doing anything # change Colv = NA for timelapse Colv = NA # add dendrogram to col Rowv = NA # add dendrogram to row filePrefix = paste0(prefix,"_") fileName = paste(filePrefix, distfunval, "_", hclustfunval, ".pdf", sep="") fileName = paste(format(Sys.time(), "%d_%b_%Y"),fileName,sep="_") #pdf(file.path(getwd(), fileName), width=wval, height=hval) #labcol = as.character(annotate.df$date.mid) #aheatmap(x, color = color,distfun = distfunval, hclustfun = hclustfunval,fontsize = 8,main=fileName, filename=fileName) #set cellwidth = 3 for large number of samples aheatmap(x, cellwidth=3, cellheight = 6, distfun = distfunval, hclustfun = hclustfunval, labCol=NA, scale= "row", annCol = ann.col, annRow= ann.row, Colv=Colv,Rowv=Rowv, annColors = ann_colors, color = color, fontsize = 8, filename=fileName, layout = "|.L*") # main = main) } plot_aheatmap_genus(x=Mike_NA.heatmap.asinh.cpm.season,annotate.df = Mike_NAmerica.season, prefix=paste0(mole.type,"-", taxonlevel), main=NULL) #====barplot for features ann.row = data.frame(season = c(rep("spring", length(selecttaxons[["spring"]])), rep("summer", length(selecttaxons[["summer"]])),rep("fall", length(selecttaxons[["fall"]])),rep("winter", length(selecttaxons[["winter"]])))) ann.row$season = factor(ann.row$season, levels=rev(c("spring", "summer", "fall", "winter"))) ann.row$names = names(nameslist) ann.row$coe = nameslist dotchart = ggdotchart(ann.row, x = rev("names"), y = "coe", color = "season", # Color by groups palette = rev(c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")), # Custom color palette sorting = "descending", # Sort value in descending order add = "segments", # Add segments from y = 0 to dots add.params = list(size = 3), rotate = TRUE, # Rotate vertically group = "season", # Order by groups dot.size = 6, # Large dot size xlab=FALSE, ylab="Coefficients", ggtheme = theme_pubr() # ggplot2 theme ) + geom_hline(yintercept = 0, linetype = 2, color = "lightgray") #parsing weights information selectweights = c() for (i in c("spring", "summer", "fall", "winter")){ selectweights = c(selectweights, weights[[i]][names(selecttaxons[[i]])]) } weights.df = data.frame(season = ann.row$season, names = names(selectweights), weights=as.numeric(selectweights), value=as.numeric(c(rep(1, length(selectweights))))) #for some weird reasons ggplot will plot text x-axis in a sorting order even when not asked weights.df$names = factor(weights.df$names, levels = rev(weights.df$names)) weights.plot= ggplot(weights.df, aes(x=names, y=value, label= paste0(round(weights.df$weights, 0),"%"))) + geom_text(aes(color=season)) + ylab("Percentage of total absolute weight") + theme_classic2() + scale_color_manual(values=rev(c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"))) + theme(axis.ticks=element_blank(), axis.line =element_blank(), axis.text.x = element_blank()) + coord_flip() ggplot_build(weights.plot)$data #plots for selected taxa within the model to justfiy feature selection #reset to 0 so violin plots would be useful Mike_NA.heatmap.asinh.cpm.season[is.na(Mike_NA.heatmap.asinh.cpm.season)] = 0 Mike.heatmap.df = cbind(t(Mike_NA.heatmap.asinh.cpm.season), Mike_NAmerica.season) ggplot(Mike.heatmap.df, aes(x=season, y=Azadirachta, color=season)) + geom_violin(draw_quantiles = c(0.25,0.5,0.75)) + geom_jitter(width=0.1) + theme_classic2() + scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) #ggboxplot(Mike.heatmap.df, x = "season", y = "Azadirachta", # color = "season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), # add = "jitter", legend="bottom") #saving all plots to list violin.plots = list() for (name in names(nameslist[-6])){ name violin = ggviolin(Mike.heatmap.df, x = "season", y = as.character(name), fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), add = "boxplot", add.params = list(width=0.1, fill = "white"), quantile(c(0.25, 0.5, 0.75)), legend="bottom", trim=TRUE, xlab=FALSE, tickslab=FALSE) + geom_point(size=0.5) + theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif", hide.ns=TRUE) print(violin) violin.plots[[name]] = ggpar(violin, tickslab =FALSE) } ggarrange(plotlist = violin.plots, ncol=3, nrow=10) ``` ##picking seasonal plots ```{r pickseasonal plots} my_comparisons <- list( c("spring", "summer"), c("spring", "fall"), c("spring", "winter"), c("summer", "fall" ), c("summer", "winter"), c("fall", "winter")) my_comparisons <- list( c("spring", "summer"), c("spring", "fall"), c("spring", "winter")) violin1 = ggviolin(Mike.heatmap.df, x = "season", y = "Azadirachta", fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "horizontal") + geom_point(size=0.5) + theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif") +theme(axis.text=element_text(size=12), axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14)) # Add global the p-value violin1 violin2 = ggviolin(Mike.heatmap.df, x = "season", y = "Fomitiporia", fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "horizontal") + geom_point(size=0.5) + theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif") +theme(axis.text=element_text(size=12), axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14)) violin2 my_comparisons <- list( c("spring", "summer"), c("summer", "fall"), c("summer", "winter")) violin3 = ggviolin(Mike.heatmap.df, x = "season", y = "Sclerotinia", fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "horizontal") + geom_point(size=0.5) + theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif") +theme(axis.text=element_text(size=12), axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14)) # Add global the p-value violin3 my_comparisons <- list( c("spring", "fall"), c("summer", "fall" ), c("fall", "winter")) violin4 = ggviolin(Mike.heatmap.df, x = "season", y = "Tuber", fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "horizontal") + geom_point(size=0.5) + theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif") +theme(axis.text=element_text(size=12), axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14)) # Add global the p-value violin4 my_comparisons <- list( c("spring", "winter"), c("summer", "winter"), c("fall", "winter")) violin5 = ggviolin(Mike.heatmap.df, x = "season", y = "Tricholoma", fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "horizontal") + geom_point(size=0.5) + theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif") +theme(axis.text=element_text(size=12), axis.title=element_text(size=14,face="bold"), legend.text=element_text(size=14)) # Add global the p-value violin5 ggarrange(violin1 + rremove("y.text"), violin3 + rremove("y.text"), violin4 + rremove("y.text"), violin5 + rremove("y.text"), ncol=2, nrow=2) ``` ```{r glmnet} glmfit = glmnet(x = t(Mike.master.asinh.cpm), y = factor(Mike_single$season), family = "multinomial") pred = predict(glmfit, t(Mike.master.asinh.cpm), type = "class", s = 0.107) confusion_table = table(predicted = pred, truth = factor(Mike_single$season)) confusion_table ``` #Specific taxons plot ```{r specific taxon plot} #taxons list is for taxons of special interests. There are no real needs to use it taxonplot = function(profile, taxons=c(), top = 10, text.size=4, base.size=14){ #ok = scores.snap #text.size = 2 #ok = scores_single_complete #profile = scores_single_complete profile = data.frame(profile) master.cpm.snap = log(master.cpm.single[, profile$samplenames]+1) master.cpm.snap[master.cpm.snap == 0] = NA ori.names = rownames(master.cpm.snap) new.names = make.names(ori.names) rownames(master.cpm.snap) = new.names master.combined = cbind.data.frame(t(master.cpm.snap), profile) master.combined$date.month = factor(master.combined$date.month, levels = c(3:12, 1, 2)) #my_comparisons <- list( c("spring", "summer"), c("spring", "fall"), c("spring", "winter"), c("summer", "fall" ), c("summer", "winter"), c("fall", "winter")) my_comparisons <- list( c("spring", "summer"), c("summer", "fall"), c("fall", "winter")) orinames = unique(c(taxons, head(ori.names, top))) #used for annotation taxonlist = unique(c(make.names(taxons), head(new.names,top))) #used for dataframe .e <- environment() plot.list = list() for (i in 1:length(taxonlist)) local({ plot1 = ggviolin(master.combined, x = "season", y = taxonlist[i], fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "vertical") + geom_point(size=0.5) + theme(legend.position="none") + stat_compare_means(comparisons = my_comparisons, label = "p.signif") + stat_compare_means(label.x.npc = "left", label.y = 17) + #, method = "t.test") stat_smooth(aes(group=1)) +theme_pubr(base_size = 14) + rremove("x.text") + rremove("xlab") + ylab(orinames[i]) plot1$plot_env = .e print(plot1) plot2 = ggviolin(master.combined, x = "date.month", y = taxonlist[i], fill ="season", palette =c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF"), add = "boxplot", add.params = list(width=0.1, fill = "white"), legend="bottom", trim=TRUE, ylab=FALSE, orientation = "vertical") + geom_point(size=0.5) + theme(legend.position="none") + stat_smooth(aes(group=1))+ stat_compare_means(label.x.npc = "center", label.y.npc = "top") + theme_pubr(base_size = 14) + rremove("legend") + ylab("") + xlab("") plot2$plot_env = .e print(plot2) plot3 = ggarrange(plot1, plot2, ncol = 2, common.legend = TRUE) #print(plot3) plot.list[[taxonlist[i]]] <<- plot1 plot.list[[paste0(taxonlist[i],".month")]] <<- plot2 plot.list[[paste0(taxonlist[i],".both")]] <<- plot3 }) return(plot.list) } scores.snap = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2016-08-30", "%Y-%m-%d" )) taxon.plot = taxonplot(scores.snap, taxons = c("Stereum", "Pinus", "Propionibacterium", "Aspergillus", "Penicillium", "Bacillus", "Staphylococcus" )) taxon.plot = taxonplot(scores.snap, taxons = c("Stereum hirsutum", "Pinus taeda", "Propionibacterium acnes", "Aspergillus fumigatus", "Trametes versicolor", "Botrytis cinerea", "Melampsora pinitorqua", "Acinetobacter baumannii", "Penicillium citrinum", "Malassezia restricta"), base.size = 18, top=20) taxon.plot = taxonplot(scores.snap, taxons = c("Flavobacteriaceae bacterium 3519-10", "Flavobacterium psychrophilum", "Flavobacterium subsaxonicum"), base.size = 18, top=5) ``` #Taxon seasonal trends plots====================== ```{r misc plots} #loop through top 50 taxonlevels create individual plots if(FALSE){ for (i in 20:70){ #i=20 temp.df = subset(master_combined_complete, master_combined_complete[,i] >0) p = ggplot(temp.df, aes_string(x="season", y=log(temp.df[,i]), group="season", color="season")) p + geom_boxplot(alpha=0.7, outlier.alpha=0) + geom_jitter(size=1, width=0.2) + geom_smooth(aes(group=1),se=FALSE) + ggtitle(colnames(master_combined_complete)[i]) + ylab("log(CPM)") + xlab("Season") #ggsave(paste0("./seasonal/DNA-",taxonlevel,"-",colnames(master_combined_complete)[i],"-season.pdf")) p = ggplot(temp.df, aes(x=date.month, y=log(temp.df[,i]), group=date.month, color=season)) p + geom_boxplot(alpha=0.7, outlier.alpha=0) + geom_jitter(size=1, width=0.2) + geom_smooth(aes(group=1),se=FALSE) + scale_x_continuous(breaks=c(1:12)) + xlab("Month") + ylab("log(CPM)") + ggtitle(colnames(master_combined_complete)[i]) #ggsave(paste0("./seasonal/DNA-",taxonlevel,"-",colnames(master_combined_complete)[i],"-month.pdf")) } } #USE THIS!!!!!!!!!!! #facet_wrap plots, this is the better way to represent them in groups select = c(136:155)#top 20 temp.df = melt(master_combined_complete,id.vars=colnames(master_combined_complete)[1:19],measure.vars=colnames(master_combined_complete)[select]) temp.df = subset(temp.df,value >0) #every 20 p = ggplot(temp.df, aes(x=season, y=log(value), group=season, color=season)) p + geom_boxplot(alpha=0.7, outlier.alpha=0) + geom_beeswarm(size=0.1) + geom_smooth(aes(group=1),se=FALSE) + stat_compare_means(hide.ns = TRUE, label = "p.format") + ggtitle("") + ylab("log(CPM)") + xlab("Season") + facet_wrap( ~ variable,scales="free") + theme_pubr(x.text.angle = 45) + rremove("x.text")+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) select = c(108:116) #these are the column numbers of the first 9 phyla special.select = c("Basidiomycota", "Ascomycota", "Streptophyta","Firmicutes","Proteobacteria","Actinobacteria", "Bacteroidetes", "Chordata", "Arthropoda") temp.df = melt(master_combined_complete,id.vars=colnames(master_combined_complete)[1:19],measure.vars=special.select) #for P1 #temp.df = melt(P1_combined_complete,id.vars=colnames(P1_combined_complete)[1:19],measure.vars=special.select) #temp.df = subset(temp.df,value >0) #top 9 p = ggplot(temp.df, aes(x=season, y=log(value), group=season, fill=season)) p + geom_boxplot(alpha=0.7, outlier.alpha=0.2) + geom_beeswarm(size=0.5, alpha=0.5, show.legend= FALSE, color="black") + geom_smooth(aes(group=1),se=FALSE,show.legend = FALSE) + stat_compare_means(hide.ns = TRUE, label = "p.format", show.legend = FALSE) + ggtitle("") + ylab("log(CPM)") + xlab("") + facet_wrap( ~ variable,scales="free", ncol = 3) + theme_pubr(base_size = 14, x.text.angle = 45) + rremove("x.text")+ scale_fill_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) #formal test p.result = compare_means( formula = value ~ season, data = temp.df, group.by = "variable", method = "kruskal.test") #first separated by seasons, then separate by weekday/weekend p = ggplot(temp.df, aes(x=season, y=log(value), fill=weekend)) #p = ggplot(temp.df, aes(x=season, y=log(value), group=season, color=season)) p + geom_boxplot(alpha=0.7, outlier.alpha=0) + geom_beeswarm(size=0.1) + geom_smooth(aes(group=1),se=FALSE) + ggtitle("") + ylab("log(CPM)") + xlab("Season") + facet_wrap( ~ variable,scales="free") + theme_pubr(x.text.angle = 45) + rremove("x.text")+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) #proving seasons are different fit = lm(log(Streptophyta) ~ season, subset(master_combined_complete, Streptophyta >0)) summary(fit) anova(fit) #p = ggplot(subset(master_combined_complete, Corynebacterium >0), aes(x=drywet, y=log(Corynebacterium), group=drywet, color=season)) #p + geom_boxplot() + geom_jitter(size=1, width=0.2) + geom_smooth(aes(group=1),se=FALSE) ``` #regression model fitting for individual taxons taxons=========================linear regression model fitting ```{r linear regression model} mikespecial = data.frame(t(Mike.master.cpm), Mike) mikespecial = mikespecial[complete.cases(mikespecial),] lm.df = master_combined_complete lm.df = data.frame(t(patient1.master.cpm), Patient1) #for only patient lm.df = data.frame(t(campushome.master.cpm), campushome) #for campushome data lm.df = data.frame(t(Mike.master.cpm), Mike) # for all mike samples lm.df = lm.df[complete.cases(lm.df),] # 0s are not considered #It appears that for different species, the variables that explain the variances the most change as well. #current metadata, temperature, humidity, particle, season, weekend #it seems best model is different for different genus #full model m.viri = lm(log(Viridiplantae) ~ season + particle, data = subset(mikespecial, Viridiplantae>0)) m.Fungi = lm(log(Fungi) ~ season + particle + humid +temperature, data = subset(mikespecial, Fungi>0)) m.Bacteria = lm(log(Bacteria) ~ season + humid + temperature, data = subset(mikespecial, Bacteria>0)) m.Metazoa = lm(log(Metazoa) ~ particle + humid, data = subset(mikespecial, Metazoa>0)) m.Viruses = lm(log(Viruses) ~ season + particle + humid + temperature, data = subset(mikespecial, Viruses>0)) #simplified model m = lm(log(Pinus) ~ humid + particle + season, data = subset(mikespecial, Pinus > 0)) m = lm(log(Pinus.taeda) ~ particle + season, data = subset(mikespecial, Pinus.taeda > 0)) m = lm(log(Propionibacterium) ~ season + particle + humid + temperature + weekend, data = subset(mikespecial, Propionibacterium>0)) # 0s are not considered #It appears that for different species, the variables that explain the variances the most change as well. #current metadata, temperature, humidity, particle, season, weekend #it seems best model is different for different genus #full model totalspecial = data.frame(t(master.cpm.single), scores_single_complete) totalspecial = totalspecial[complete.cases(totalspecial),] #for all samples, everyone pooled m.viri = lm(log(Viridiplantae) ~ season + particle + humid + temperature, data = subset(totalspecial, Viridiplantae>0)) m.Fungi = lm(log(Fungi) ~ season + particle + humid +temperature, data = subset(totalspecial, Fungi>0)) m.Bacteria = lm(log(Bacteria) ~ season + particle + humid + temperature, data = subset(totalspecial, Bacteria>0)) m.Metazoa = lm(log(Metazoa) ~ season + particle + humid + temperature, data = subset(totalspecial, Metazoa>0)) m.Viruses = lm(log(Viruses) ~ season + particle + humid + temperature, data = subset(totalspecial, Viruses>0)) m = lm(log(Bacteria) ~ season + particle + humid + temperature + weekend, data = subset(lm.df, Bacteria>0)) m = lm(log(Fungi) ~ season + particle + humid + temperature, data = subset(lm.df, Fungi>0)) m2 = lm(log(Fungi) ~ humid + season, data = subset(lm.df, Fungi>0)) m = lm(log(Streptophyta) ~ humid + temperature + season + particle + weekend, data = subset(lm.df, Streptophyta>0)) m2 = lm(log(Streptophyta) ~ humid + season + temperature, data = subset(lm.df, Streptophyta>0)) m = lm(log(Firmicutes) ~ season + particle + temperature + humid + weekend, data = subset(lm.df, Firmicutes>0)) m = lm(log(Basidiomycota) ~ season + particle + humid + temperature + weekend, data = subset(lm.df, Basidiomycota>0)) m = lm(log(Ascomycota) ~ season + particle + humid + temperature + weekend, data = subset(lm.df, Ascomycota>0)) m = lm(log(Actinobacteria) ~ season + particle + temperature + humid + weekend, data = subset(lm.df, Actinobacteria>0)) m = lm(log(Proteobacteria) ~ season + particle + temperature + humid + weekend, data = subset(lm.df, Proteobacteria>0)) m = lm(log(Arthropoda) ~ season + particle + temperature + humid + weekend, data = subset(lm.df, Arthropoda>0)) m = lm(log(Bacteroidetes) ~ season + particle + temperature + humid + weekend, data = subset(lm.df, Bacteroidetes>0)) m = lm(log(Chlorophyta) ~ season + particle + temperature + humid + weekend, data = subset(lm.df, Chlorophyta>0)) m = lm(log(Viridiplantae) ~ season + particle + humid + temperature, data = subset(mikespecial, Viridiplantae>0)) m = lm(log(Fungi) ~ season + particle + humid +temperature, data = subset(mikespecial, Fungi>0)) #one can compare the models directly using anova functions, such like anova(m1,m2) to see if they are siginificantly different. ``` #PCA plots with features======================PCA all plots ```{r PCA plots all} pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=ownership)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point(size = 3.5) + geom_text(size = 4, nudge_y = 0.5) +scale_colour_brewer(palette = "Paired") + theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_ownership_all.pdf")) #scale_colour_gradient2 uses two color for diverging color scheme pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=duration)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point(size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient2(midpoint=8, low="blue", mid="white", high="red", space = "Lab", na.value = "grey50", guide = "colourbar") + theme_bw() #scale_colour_gradientn uses multiple colors for gradient color # topo.colors(10) terrain.colors(10) rainbow(10) pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=as.numeric(date.month))) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point(size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient2(midpoint=6, low=muted("red"), high=muted("blue")) + theme_bw() pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=season)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point( size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_brewer(palette = "Set1")+theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_all_seasons.pdf")) pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=humid)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point( size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradientn(colours = topo.colors(10)) +theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_all_humid.pdf")) pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=temperature)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point( size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient2(midpoint=25, low="blue", mid="white", high="red", space = "Lab", na.value = "grey50", guide = "colourbar") +theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_all_temperature.pdf")) pca1.2 = ggplot(scores_single, aes(x=PC1, y=PC2, label=location, color=weekend)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point( size = 3.5) + geom_text(size = 4, nudge_y = 0.5) +theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_all_weekend.pdf")) #========PCA Campushome plots pca1.2 = ggplot(campushome_single, aes(x=PC1, y=PC2, label=location, color=as.numeric(date.month))) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point(size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red") + theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_campushome_month.pdf")) pca1.2 = ggplot(campushome_single, aes(x=PC1, y=PC2, label=location, color=season)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point( size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_brewer(palette = "Set1")+theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_campushome_season.pdf")) pca1.2 = ggplot(campushome_single, aes(x=PC1, y=PC2, label=location, color=humid)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point( size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red") +theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_campushome_humid.pdf")) pca1.2 = ggplot(campushome_single, aes(x=PC1, y=PC2, label=location, color=duration)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point(size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red", space = "Lab") + theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_campushome_duration.pdf")) pca1.2 = ggplot(campushome_single, aes(x=PC1, y=PC2, label=location, color=temperature)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point(size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red", space = "Lab") + theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_campushome_temperature.pdf")) pca1.2 = ggplot(campushome_single, aes(x=PC1, y=PC2, label=location, color=weekend)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point(size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_campushome_weekend.pdf")) #===============PCA Mike plots pca1.2 = ggplot(Mike_single, aes(x=PC1, y=PC2, label=location, color=as.numeric(date.month))) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point(size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red") + theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_Mike_month.pdf")) pca1.2 = ggplot(Mike_single, aes(x=PC1, y=PC2, label=location, color=season)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point( size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_brewer(palette = "Set1")+theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_Mike_season.pdf")) pca1.2 = ggplot(Mike_single, aes(x=PC1, y=PC2, label=location, color=humid)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point( size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red") +theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_Mike_humid.pdf")) pca1.2 = ggplot(Mike_single, aes(x=PC1, y=PC2, label=location, color=duration)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point(size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red", space = "Lab") + theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_Mike_duration.pdf")) pca1.2 = ggplot(Mike_single, aes(x=PC1, y=PC2, label=location, color=temperature)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point(size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + scale_colour_gradient(low="blue", high="red", space = "Lab") + theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_Mike_temperature.pdf")) pca1.2 = ggplot(Mike_single, aes(x=PC1, y=PC2, label=location, color=weekend)) pca1.2 + geom_point(colour = "grey90", size = 5, alpha=0.5) + geom_point(size = 3.5) + geom_text(size = 4, nudge_y = 0.5) + theme_bw() ggsave(paste0("./pdfs/", taxonlevel,"_pca_Mike_weekend.pdf")) ``` ##PCA 3D plots=====================PCA1.3 plots ```{r PCA 3D plots} # 3D PCA library(rgl) plot3d(scores[, 1:3], type='s',radius=0.5, col=as.integer(scores$season)+2, box=FALSE) text3d(scores[, 1:3], texts = location, adj=1.3) legend3d("topright", legend=as.factor(scores$season), pch=16, col=as.integer(scores$season)+2) legend3d("toprgiht", legend=c("spring", "winter", "summer", "fall"), pch=16, col=(as.integer(scores$season)+2)) library(plot3D) with(scores, scatter3D(x = PC1, y = PC2, z = PC3, colvar = as.integer(ownership), ticktype = "detailed", phi=0, bty ="g", pch =20, xlab="PC1" , ylab="PC2", zlab="PC3", cex =1.5, clab = c("Ownership"))) text3D(x = scores$PC1, y = scores$PC2, z = scores$PC3, labels = scores$location, add =TRUE, colkey = FALSE, cex =1.0) ``` # plot heatmap============= ```{r heatmap} setwd("~/Documents/Bioinfo/DNAformal/") plot_aheatmap_genus = function(x, annotate.df, prefix="",main=NULL){ #-------------- # goi.obj: ExpressionSet object containg expression data(log cpm) of gene of interests and phenoData # main: title of plot #--------------- if (is.null(main)) main = "" wval = 12 hval = 15 # add color to col and row # CellType = rownames(x) # ann.col = data.frame("Humid" = annotate.df$X_Mean_Humidity, "Temperature" = annotate.df$Min_TemperatureC, "Particle" = annotate.df$total.particle, "Weekend" = annotate.df$weekend, "Season" = annotate.df$season, "Batch" = annotate.df$batch, rain = as.factor(annotate.df$is_there_rain), Precipitation = annotate.df$Precipitationmm) ann.col = data.frame(Season = annotate.df$season, Geo = annotate.df$geo2, Humid = as.numeric(annotate.df$X_Mean_Humidity), Temperature = annotate.df$Mean_TemperatureC, Particle = annotate.df$total.particle, Duration =annotate.df$duration) if (mole.type == "DNA"){ ann.row = data.frame(Taxon = c("Fungi", "Fungi", "Viridiplantae", "Bacteria","Bacteria", "Bacteria", "Metazoa", "Bacteria", "Metazoa", "Metazoa", "Bacteria", "Metazoa", "Bacteria", "Metazoa", "Viridiplantae", "Alveolata", "Bacteria", "Bacteria", "Archaea", "Bacteria")) ann.row$Taxon = factor(ann.row$Taxon, levels = c("Bacteria", "Fungi", "Viridiplantae", "Metazoa","Alveolata", "Archaea")) } if (mole.type == "RNA"){ ann.row = data.frame(Taxon = c("Bacteria","Bacteria", "Bacteria","Fungi", "Viridiplantae", "Fungi", "Bacteria", "Bacteria","Bacteria","Bacteria", "Metazoa", "Bacteria", "Bacteria", "Metazoa", "Bacteria", "Bacteria", "Bacteria", "Bacteria", "Metazoa", "Bacteria")) ann.row$Taxon = factor(ann.row$Taxon, levels = c("Bacteria", "Fungi", "Viridiplantae", "Metazoa","Alveolata", "Archaea")) } # Specify colors # Var1 = rainbow(length(unique(CellType)),alpha=0.75) # names(Var1) = c(unique(CellType)) # ann.colors = list(CellType = Var1) seasoncolor = c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF") taxoncolor = c("#F8766D", "#B79F00", "#00BA38", "#00BFC4", "#619CFF", "#F564E3") names(seasoncolor) = c("spring", "summer", "fall", "winter") names(taxoncolor) = c("Bacteria", "Fungi", "Viridiplantae", "Metazoa","Alveolata", "Archaea") #Var2 = c("lightgreen", "navy") ann_colors = list(Season = seasoncolor, Taxon = taxoncolor) #color=colorRampPalette(c("#ffeda0", "#feb24c", "#f03b20"))(3) color = "-RdBu:50" #color = "-RdYlBu:80" # choose dist and hc methods distfunvals = c("euclidean", "pearson", "spearman") hclustfunvals = c("centroid", "average", "complete") distfunval = distfunvals[1] hclustfunval = hclustfunvals[2] #scale="row" # scale by row # NA is different from FALSE for Colv and Rowv, FALSE means clustering is still calcuated just not shown. NA means not doing anything # change Colv = NA for timelapse Colv = NA # add dendrogram to col Rowv = NA # add dendrogram to row fileName = paste(prefix,"_", distfunval, "_", hclustfunval, ".pdf", sep="") fileName = paste(format(Sys.time(), "%d_%b_%Y"), fileName, sep="_") #pdf(file.path(getwd(), fileName), width=wval, height=hval) labcol = as.character(annotate.df$date.mid) #aheatmap(x, color = color,distfun = distfunval, hclustfun = hclustfunval,fontsize = 8,main=fileName, filename=fileName) #set cellwidth = 3 for large number of samples aheatmap(x, cellwidth=3, distfun = distfunval, hclustfun = hclustfunval, labCol = NA, annCol = ann.col, annRow = ann.row, annColors = ann_colors, Colv=Colv,Rowv=Rowv, color = color, na.color = "#d9d9d9", fontsize = 19, filename=fileName, width = 20, height =12, layout = c("dmal", "maldL*"), cexAnn = 1.6) # main = main) #"|.L*" } my.keep = TRUE if (taxonlevel != "SUPERKINGDOM" ){ my.keep = 2:21 #this is intended for skipping "unclassified", please check } temp.asinh.df = master.asinh.cpm.single temp.asinh.df[temp.asinh.df==0] = NA #set NA plot_aheatmap_genus(x=temp.asinh.df[my.keep, ], annotate.df = scores_single_complete, prefix=paste0(mole.type,"-", taxonlevel), main=NULL) ``` ##Plot hierarchial cluster=============== ```{r hierarchial cluster} #d = vegdist(t(master.asinh.cpm), na.rm = TRUE) #since I am to plot master.cpm data, should not use master.asinh.spm here) d = vegdist(t(master.cpm.single), na.rm = TRUE) dendro = as.dendrogram(hclust(d, method = "average")) library(ggdendro) #mole.type #taxonlevel ggdendrogram(dendro, rotate=TRUE, theme_dendro = FALSE) + ggtitle(paste0(mole.type, "-", taxonlevel, " data clustering_not normalized"))+ theme_pubr() dend_data = dendro_data(dendro, type = "rectangle") names(dend_data) dendro.labels = dend_data$labels$label dendro.labels = as.character(dendro.labels) #this is to extract location information for dendrogram (need updates) what is this for? dendro.location = scores_single_complete$location[match(dendro.labels, scores_single_complete$samplenames)] dendro.location = rev(dendro.location) ``` ## plot bar plot by the order of hierarichal cluster!======= ```{r barplot} #this is best working with superkingdom data for now, might expand in the future. #profile is supposed to be a derivative of scores_single_complete expobarplot = function(profile, width = 16, height= 4, ratio=1){ #profile = scores_single_complete master.cpm.snap = master.cpm[, profile$samplenames] d = vegdist(t(master.cpm.snap), na.rm = TRUE) dendro = as.dendrogram(hclust(d, method = "average")) library(ggdendro) #ggdendrogram(dendro, rotate=TRUE, theme_dendro = FALSE) + ggtitle(paste0(mole.type, "-", taxonlevel, " data clustering_not normalized"))+ theme_pubr() dend_data = dendro_data(dendro, type = "rectangle") #these are labels and location label for each sample dendro.labels = dend_data$labels$label dendro.labels = as.character(dendro.labels) dendro.location = scores$location[match(dendro.labels, scores$samplenames)] print(dendro.location) dendro.geo2 = scores$geo2[match(dendro.labels, scores$samplenames)] dendro.date = scores$date.mid[match(dendro.labels, scores$samplenames)] sum.all = apply(master.cpm.snap,2,sum) master.perc = master.cpm.snap #the simplest way to create a data frame of same dimension for (i in 1:ncol(master.perc)){ master.perc[,i] = master.cpm.snap[,i]/sum.all[i]*100 } plot.df = data.frame(t(master.perc)) plot.df$sampleID = rownames(plot.df) plot.df$location = dendro.location plot.df$date = dendro.date plot.df$geo = dendro.geo2 plot.df.hcl = plot.df[dendro.labels, ] #clustered dataframe plot.df$step = 1:nrow(plot.df) #must need continous variable for area plots plot.df.hcl$step = 1:nrow(plot.df.hcl) #must need continuous variable for area plots plot2 = melt(plot.df,id.var=c("sampleID","location", "date", "geo", "step")) plot2.hcl = melt(plot.df.hcl, id.var=c("sampleID","location", "date", "geo", "step")) colnames(plot2)[6]=taxonlevel colnames(plot2.hcl)[6]=taxonlevel #change the colors used for each label if (colnames(plot2)[6] == "SUPERKINGDOM") # This is to unify color scheme of DNA and RNA plots { plot2$SUPERKINGDOM <- factor(plot2$SUPERKINGDOM, levels = c("Bacteria", "Fungi", "Viridiplantae", "Metazoa", "Viruses", "Archaea")) plot2.hcl$SUPERKINGDOM <- factor(plot2.hcl$SUPERKINGDOM, levels = c("Bacteria", "Fungi", "Viridiplantae", "Metazoa", "Viruses", "Archaea")) } #dendro.labels are clustered samples #use scale_color_manual for specify defined qualitative color sets! p = ggplot(plot2.hcl,aes(x=sampleID,y=value,fill = get(taxonlevel))) + geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + scale_x_discrete(name = "Samples", limits=c(dendro.labels)) + ylab("Proportion") + theme_pubr(legend = c("top"), base_size = 16, x.text.angle = 45 ) + theme(axis.text.x = element_blank()) + labs(fill = "Taxons") print(p) ggsave(paste0("~/Documents/Bioinfo/DNAformal/", taxonlevel,"_barplot_clustered.pdf"), height=height, width = width*ratio) #good ratio for wide screen p.area = ggplot(plot2.hcl,aes(x=step,y=value,fill = get(taxonlevel))) + geom_area(position="fill", alpha=1, linetype=0) + theme_pubr(legend = c("top"), base_size = 16, x.text.angle = 45 ) + theme(axis.text.x = element_blank()) + scale_x_continuous(expand = c(0, 0))+ labs(x= "Samples", y="Proportion", fill = "Taxons") print(p.area) ggsave(paste0("~/Documents/Bioinfo/DNAformal/", taxonlevel,"_barplot_clustered_area.pdf"), height=height, width = width*ratio) #good ratio for wide screen #timeline q = ggplot(plot2,aes(x=sampleID, y=value,fill = get(taxonlevel))) + geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + theme(axis.text.x = element_text(angle = 45, hjust=1))+ labs(y = "Proportion", fill="Taxons") + scale_x_discrete(name = "Samples", limits=c(profile$samplenames), labels= c(profile$date.mid)) + theme_pubr(legend = c("top"), base_size = 16) + theme(axis.text.x = element_blank()) print(q) ggsave(paste0("~/Documents/Bioinfo/DNAformal/", taxonlevel,"_barplot_timeline.pdf"), height= height, width = width*ratio) q.area = ggplot(plot2,aes(x=step, y=value,fill = get(taxonlevel))) + geom_area(position = "fill") + theme_pubr(legend = c("top"), base_size = 16, x.text.angle = 45 ) + theme(axis.text.x = element_blank()) + scale_x_continuous(expand = c(0, 0))+ labs(x= "Samples", y="Proportion", fill = "Taxons") print(q.area) ggsave(paste0("~/Documents/Bioinfo/DNAformal/", taxonlevel,"_barplot_timeline_area.pdf"), height= height, width = width*ratio) return(list(cluster=p, cluster.area=p.area, timeline=q, timeline.area=q.area, locations= dendro.location)) } mike.snap = subset(scores_single_complete, ownership == "Mike" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2017-06-01", "%Y-%m-%d" )) p1.snap = subset(scores_single_complete, ownership == "No1_1636-69-063" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2017-06-01", "%Y-%m-%d" )) p3.snap = subset(scores_single_complete, ownership == "No3_San Mateo" & date.start >= as.Date("2014-01-01", "%Y-%m-%d" ) & date.start < as.Date("2017-06-01", "%Y-%m-%d" )) scores.snap = scores_single_complete #formal plots expobarplot(profile = scores.snap) mike.snap = Mike_single locations = expobarplot(profile = mike.snap, ratio = nrow(mike.snap)/nrow(scores.snap)) locations1 = expobarplot(profile = p1.snap, ratio = nrow(p1.snap)/nrow(scores.snap)+0.06) locations2 = expobarplot(profile = p3.snap, ratio = nrow(p3.snap)/nrow(scores.snap)+0.06) #old codes p = ggplot(plot2,aes_string(x=colnames(plot2)[1],y=colnames(plot2)[3],fill = taxonlevel)) + geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + theme(axis.text.x = element_text(angle = 90, hjust=1))+ scale_x_discrete(name = "Samples", limits=c(dendro.labels)) + scale_fill_manual(values=c("#F3C300", "#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26")) p ggsave(paste0("./pdfs/", taxonlevel,"_barplot_all_distinct.pdf")) p = ggplot(plot2,aes_string(x=colnames(plot2)[1],y=colnames(plot2)[3],fill = taxonlevel)) + geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + theme(axis.text.x = element_text(angle = 90, hjust=1))+ scale_x_discrete(name = "Samples", limits=c(rownames(scores_single)), labels= c(scores_single$date.mid)) + scale_fill_manual(values=c("#F3C300", "#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26")) p ggsave(paste0("./pdfs/", taxonlevel,"_barplot_all_timeline_distinct.pdf")) #change labels to dates #codes to rescale the dateframe according things looked at check = match(plot2$sampleID, rownames(campushome_single)) plot2.campushome_single = plot2[which(!is.na(check)),] rownames(plot2.campushome_single) = seq(1:dim(plot2.campushome_single)[1]) p = ggplot(plot2.campushome_single,aes_string(x=colnames(plot2)[1],y=colnames(plot2)[3],fill = taxonlevel)) + geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + theme(axis.text.x = element_text(angle = 90, hjust=1))+ scale_x_discrete(name = "Samples", limits=c(rownames(campushome_single)), labels= c(campushome_single$date.mid) ) p ggsave(paste0("./pdfs/", taxonlevel,"_barplot_campushome_timeline.pdf")) p = ggplot(plot2.campushome_single,aes_string(x=colnames(plot2)[1],y=colnames(plot2)[3],fill = taxonlevel)) + geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + theme(axis.text.x = element_text(angle = 90, hjust=1))+ scale_x_discrete(name = "Samples", limits=c(rownames(campushome_single)), labels= c(campushome_single$date.mid) ) + scale_fill_manual(values=c("#F3C300", "#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26")) p ggsave(paste0("./pdfs/", taxonlevel,"_barplot_campushome_timeline_distinct.pdf")) #codes to rescale the dateframe according things looked at check = match(plot2$sampleID, rownames(Mike_single)) plot2.Mike_single = plot2[which(!is.na(check)),] rownames(plot2.Mike_single) = seq(1:dim(plot2.Mike_single)[1]) p = ggplot(plot2.Mike_single,aes_string(x=colnames(plot2)[1],y=colnames(plot2)[3],fill = taxonlevel)) + geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + theme_gray(base_size = 15) + theme(axis.text.x = element_text(angle = 90, hjust=1))+ scale_x_discrete(name = "Samples", limits=c(rownames(Mike_single)), labels= c(Mike_single$date.mid)) + ylab("Proportion") p ggsave(paste0("./", taxonlevel,"_barplot_Mike_timeline.pdf"), height=8, width = 30) p = ggplot(plot2.Mike_single,aes_string(x=colnames(plot2)[1],y=colnames(plot2)[3],fill = taxonlevel)) + geom_bar(position="stack",stat="identity", color="black", size=0.1, width=1) + theme(axis.text.x = element_text(angle = 90, hjust=1))+ scale_x_discrete(name = "Samples", limits=c(rownames(Mike_single)), labels= c(Mike_single$date.mid)) + scale_fill_manual(values=c("#F3C300", "#875692", "#F38400", "#A1CAF1", "#BE0032", "#C2B280", "#848482", "#008856", "#E68FAC", "#0067A5", "#F99379", "#604E97", "#F6A600", "#B3446C", "#DCD300", "#882D17", "#8DB600", "#654522", "#E25822", "#2B3D26")) p ggsave(paste0("./pdfs/", taxonlevel,"_barplot_Mike_timeline_distinct.pdf")) ``` #cheminfo============== ```{r cheminfo} setwd("~/Documents/Bioinfo/DNAformal") #==================parsing chemi-info list, many samples are still missing as of now, needs more sequencing infile = read.csv("../chemi.csv", stringsAsFactors = FALSE) infile$filter.start=as.Date(infile$filter.start, "%m/%d/%y") infile$filter.collect=as.Date(infile$filter.collect, "%m/%d/%y") test.n = c() for (i in 1:length(infile$filter.start)){ for (j in 1:sum(!is.na(environ.mike$date.start))){ if (infile$filter.start[i] == environ.mike$date.start[j]){ print(c(infile$filter.start[i], environ.mike$date.start[j])) print(c(infile$comments[i], environ.mike$location[j], environ.mike$Filter_No[j])) test.n = c(test.n, environ.mike$Filter_No[j]) } } } test.n chemilist = c(test.n[1:2], "205",test.n[3:length(test.n)]) #205 is the background control infile = cbind(infile, Filter_No=chemilist) #these are the confirmed filter number of interests #chemilist %in% scores_single_complete$Filter_No scores_chemi = scores_single_complete[which(scores_single_complete$Filter_No %in% chemilist),] chemiprofile = read.csv("~/Documents/Bioinfo/places/chemiprofile.csv", stringsAsFactors=FALSE) #chemicals and corresponding IDs with checkable names validchem = chemiprofile$MetabID[chemiprofile$MetabID != ""] validids = chemiprofile$Primary.ID[chemiprofile$MetabID != ""] chemi.meta = data.frame(ID=chemiprofile$Primary.ID, mz=chemiprofile$mz, metaID = chemiprofile$MetabID, retention=chemiprofile$rt, fold=chemiprofile$Fold_Enrich) rownames(chemiprofile) = chemiprofile$Primary.ID cal.df = chemiprofile[,6:(dim(chemiprofile)[2]-3)] cal.df = t.data.frame(cal.df) group = cal.df[,1] cal.df = apply(cal.df[,-1], 2, as.numeric) #this makes it matrix again cal.df = data.frame(cal.df) cal.df$group = c(group) result = split(cal.df, f = cal.df$group) #add each vector into a list, then combine them from list into dataframe new.cal.list = list() for (i in 1:length(result)){ #apply function aways covert your data to matrix, therefore foring all data into one class!! temp = apply(result[[i]][,1:(dim(result[[i]])[2]-1)], 2, mean) #print(unique(result[[i]]$group)) #vectors , just like Matrix, CAN ONLY HAVE ONE TYPE! temp = data.frame(temp) colnames(temp) = as.character(unique(result[[i]]$group)) #print(temp[length(temp)]) new.cal.list[[i]] = temp } new.cal.df = as.data.frame(new.cal.list) #all chemialcs being rows, all samples being columns new.cal.check = new.cal.df colnames(new.cal.check) = infile$X #looks ok colnames(new.cal.df) = infile$Filter_No rownames(new.cal.df) = chemiprofile$Primary.ID[-1] chemi.environ = join(scores_chemi, infile, by="Filter_No") #HOTFIXES chemi.environ$location[18] = "Davis-CA" chemi.environ$location[10] = "Montana" #use chemi.environ for meta information #ordering samples in chronic order, because samples are arranged so in chemi.environ chemi.profile = new.cal.df[, chemi.environ$Filter_No] chemi.profile.check = chemi.profile colnames(chemi.profile.check) = chemi.environ$X colnames(chemi.profile) = chemi.environ$samplenames chem.species.profile = master.cpm.single[, chemi.environ$samplenames] #used 100 chem.species.profile = chem.species.profile[apply(chem.species.profile, 1, sd) > 10 , ] #threshold 1000 chemi.profile = chemi.profile[apply(chemi.profile, 1, sd) > 1000, ] #filtering the uninterested ones #chemi.profile.filter = chemi.profile[rowSums(chemi.profile == 0) <= 4, ] #chem.species.profile.filter = chem.species.profile[rowSums(chem.species.profile == 0) <= 8,] chemi.profile.filter = chemi.profile[rowSums(chemi.profile == 0) <= 4, ] chem.species.profile.filter = chem.species.profile[rowSums(chem.species.profile == 0) <= 8,] #lm.test = lm(chem.species.profile.filter[1,] ~., data = data.frame(t(chem.species.profile.filter))) pca1 = dudi.pca(t(chemi.profile.filter), scal = TRUE, scann = FALSE, nf=3) pca2 = dudi.pca(t(chem.species.profile.filter), scal = TRUE, scann = FALSE, nf=3) #this step is slow, try not to run it #set.seed = 12345 set.seed = 1234 rv1 = RV.rtest(pca1$tab, pca2$tab,499) rv1 #mess around if(FALSE){ X = pca1$tab Y = pca2$tab S.xy <- cov(X, Y) S.xx <- var(X) S.yx <- cov(Y, X) S.yy <- var(Y) A <- eigen(solve(S.xx) %*% S.xy %*% solve(S.yy) %*% S.yx)$vectors B <- eigen(solve(S.yy) %*% S.yx %*% solve(S.xx) %*% S.xy)$vectors R <- sqrt(eigen(solve(S.yy) %*% S.yx %*% solve(S.xx) %*% + S.xy)$values) } chemi.profile.filter = log(chemi.profile.filter+1, base=10)#log transformation is typical in chemical world chem.species.profile.filter = asinh(chem.species.profile.filter) ccaRes = PMA::CCA(t(chem.species.profile.filter), t(chemi.profile.filter), penaltyx = 0.20, penaltyz = 0.15) ccaRes combined = cbind(t(chem.species.profile.filter[ccaRes$u != 0, ]), t(chemi.profile.filter[ccaRes$v != 0, ])) #everything, without sCCA selection combined.original = cbind(t(chem.species.profile.filter), t(chemi.profile.filter)) #pcaRes = dudi.pca(combined.original, scannf = FALSE, nf = 3) chem_first_batch = c("chaoexpo22_I12", "chaoexpo51_I6" , "chaoexpo56_I5", "chaoexpo44_I15", "chaoexpo28_I15") #function for drawing all plots) chempca = function(pcaRes=pcaRes,eclispe=0.7, base.size = 14, ratio=2, ind.alpha = 0.8, bi.alpha=0.6){ scree = fviz_screeplot(pcaRes, addlabels = TRUE ) print(scree) #working plots ind.1 = fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$season[6:20], pointsize=5, pointshape=21, palette="npg", alpha.ind = ind.alpha, addEllipses =TRUE, ellipse.level = eclispe ,invisible = "quali") +coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + geom_text_repel(aes(label = chemi.environ$location[6:20]), size=4) + theme_pubr(margin = FALSE, base_size = base.size) + labs(fill = "Season", color = "Season") + scale_fill_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) + scale_color_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) print(ind.1) ind.1.clear = fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$season[6:20], pointsize=5, pointshape=21, palette="npg", alpha.ind = ind.alpha, addEllipses =TRUE, ellipse.level = eclispe ,invisible = "quali" ) +coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + theme_pubr(margin = FALSE, base_size= base.size) + labs(fill = "Season", color = "Season") + scale_fill_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) + scale_color_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) print(ind.1.clear) ind.1.path =fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$season[6:20], pointsize=5, pointshape=21, palette="npg", alpha.ind = ind.alpha-0.2, addEllipses =TRUE, ellipse.level = eclispe ,invisible = "quali" ) + geom_segment(aes(xend=c(tail(pcaRes$li$Axis1, n=-1), NA), yend=c(tail(pcaRes$li$Axis2, n=-1), NA)), arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"), linetype=1, size=1, alpha=0.7,color = c(rep("#3C5488FF",6), rep("#00A087FF",9))) + coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + theme_pubr(margin = FALSE, base_size = base.size) + labs(fill = "Season", color = "Season") + scale_fill_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) + scale_color_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) print(ind.1.path) ind.2 = fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$location[6:20], pointsize=5, pointshape=21, alpha.ind = ind.alpha, addEllipses =TRUE, ellipse.level = eclispe,invisible = "quali" ) +coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + geom_text_repel(aes(label = chemi.environ$comments[6:20]), size=4) + theme_pubr(margin = FALSE, base_size = base.size) + scale_fill_npg() + scale_color_npg() + labs(fill = "Location", color = "Location") print(ind.2) ind.2.clear = fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$location[6:20], pointsize=5, pointshape=21, alpha.ind = ind.alpha, addEllipses =TRUE, ellipse.level = eclispe,invisible = "quali" ) +coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + theme_pubr(margin = FALSE, base_size = base.size) + scale_fill_npg() + scale_color_npg() + labs(fill = "Location", color = "Location") print(ind.2.clear) ind.3 = fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$season[6:20], pointsize=5, pointshape=21, palette="npg", alpha.ind = ind.alpha, addEllipses =TRUE, ellipse.level = eclispe,invisible = "quali" ) +coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + geom_text_repel(aes(label = as.character(chemi.environ$date.start[6:20])), size=4) + theme_pubr(margin = FALSE, base_size = base.size) + labs(fill = "Season", color = "Season") + scale_fill_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) + scale_color_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) print(ind.3) ind.4 = fviz_pca_ind(pcaRes, axes = c(1,2), label="none", fill.ind = chemi.environ$location[6:20], pointsize=5, pointshape=21, palette="npg", alpha.ind = ind.alpha, addEllipses =TRUE, ellipse.level = eclispe,invisible = "quali" ) +coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + geom_text_repel(aes(label = as.character(chemi.environ$date.start[6:20])), size=4) + theme_pubr(margin = FALSE, base_size = base.size) + scale_fill_npg() + scale_color_npg() + labs(fill = "Location", color = "Location") print(ind.4) chem.special = c("PM1888", "PM0594", "PM1527", "PM1013", "PM0006", "PM1180", "PM3175", "PM3177", "NM1772", "Tricholoma", "Trametes", "Serpula","Psychrobacter", "Fomitopsis", "Pseudogymnoascus", "Ophiognomonia", "Phlebiopsis", "Gymnopus", "Lentinus", "Coniosporium") names.features = c(head(rownames(pcaRes$c1)[order(pcaRes$c1$CS1)],10),tail(rownames(pcaRes$c1)[order(pcaRes$c1$CS1)],20), head(rownames(pcaRes$c1)[order(pcaRes$c1$CS2)],7),tail(rownames(pcaRes$c1)[order(pcaRes$c1$CS2)],14) ) names.features = unique(names.features) gradient.color = c("#3C5488FF","#4DBBD5FF","#E64B35FF") var = get_pca_var(pcaRes) #another way of picking names, based on contributions to axes var$contrib = var$contrib[order(var$contrib[,1]^2,var$contrib[,2]^2, var$contrib[,3]^2, decreasing = TRUE),] #names.features = row.names(var$contrib[1:50,]) mm = corrplot::corrplot(var$contrib[1:50,], is.corr=FALSE, cl.align.text = "l") contrib = fviz_contrib(pcares, choice = "var", axes = 1:3, top = 100,ggtheme = theme_pubr(base_size = 6)) print(contrib) #select.var= list(name=names), var.plot = fviz_pca_var(pcaRes, geom=c("text","arrow"), select.var= list(name=c(names.features, chem.special)), col.var="contrib", labelsize = 4, #gradient.cols = "npg", gradient.cols = c("#3C5488FF","#4DBBD5FF","#E64B35FF"), repel = TRUE) + theme_pubr(margin = FALSE, base_size = base.size) #plot features(loadings) print(var.plot) names.biplot = c(head(rownames(pcaRes$c1)[order(pcaRes$c1$CS1)],4),tail(rownames(pcaRes$c1)[order(pcaRes$c1$CS1)],8), head(rownames(pcaRes$c1)[order(pcaRes$c1$CS2)],3),tail(rownames(pcaRes$c1)[order(pcaRes$c1$CS2)],6) ) #biplot bi.plot = fviz_pca_biplot(pcaRes, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = chemi.environ$season[6:20], #col.ind = fourpeople.select$ownership, select.var= list(name = c(names.biplot,chem.special)), pointshape = 21, pointsize = 6, palette = "npg", #addEllipses = TRUE, # Variables alpha.var =0.4 , repel = TRUE, col.var = "contrib", gradient.cols = gradient.color, label.size = 5, addEllipses = FALSE, #ellipse.type = "euclid", ellipse.level = 0.80, ellipse.alpha = 0.4, invisible = "quali", #mean.point = FALSE )+ labs(fill = "Season", color = "Contribution", alpha = "Contribution") + #coord_fixed(sqrt(pcaRes$eig[2] / pcaRes$eig[1])) + coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4) + geom_text_repel(aes(label = as.character(chemi.environ$date.start[6:20])), size=4, nudge_y=1) + geom_segment(aes(xend=c(tail(pcaRes$li$Axis1, n=-1), NA), yend=c(tail(pcaRes$li$Axis2, n=-1), NA)), arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"), linetype=1, size=1, alpha=0.7,color = c(rep("#3C5488FF",6), rep("#E64B35FF",9)))+ theme_pubr(margin = FALSE, base_size = base.size) #use geom_segment to ensure there is an arrow for every segment print(bi.plot) bi.plot2 = fviz_pca_biplot(pcaRes, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = chemi.environ$location[6:20], #col.ind = fourpeople.select$ownership, select.var= list(name = c(names.biplot,chem.special)), pointshape = 21, pointsize = 6, palette = "npg", #addEllipses = TRUE, # Variables alpha.var =0.4 , repel = TRUE, col.var = "contrib", gradient.cols = gradient.color, label.size = 5, #addEllipses = TRUE, #ellipse.type = "euclid", #ellipse.level = 0.60, invisible = "quali", #mean.point = FALSE )+ labs(fill = "Location", color = "Contribution", alpha = "Contribution") + #coord_fixed(sqrt(pcaRes$eig[2] / pcaRes$eig[1])) + coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + #geom_text_repel(aes(label = as.character(chemi.environ$comments[6:20])), size=4) + geom_text_repel(aes(label = as.character(chemi.environ$date.start[6:20])), size=4, nudge_y = 1) + geom_segment(aes(xend=c(tail(pcaRes$li$Axis1, n=-1), NA), yend=c(tail(pcaRes$li$Axis2, n=-1), NA)),color="#3C5488FF", arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"), linetype=1, size=1, alpha=0.7)+ scale_fill_d3(palette = "category20") + theme_pubr(margin = FALSE, base_size = base.size) #use geom_segment to ensure there is an arrow for every segment print(bi.plot2) bi.plot.clear = fviz_pca_biplot(pcaRes, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = chemi.environ$season[6:20], #col.ind = fourpeople.select$ownership, select.var= list(name = c(names.biplot,chem.special)), pointshape = 21, pointsize = 6, palette = "npg", #addEllipses = TRUE, # Variables alpha.var =0.4 , repel = TRUE, col.var = "contrib", gradient.cols = gradient.color, label.size = 5, addEllipses = TRUE, #ellipse.type = "euclid", ellipse.level = 0.80, ellipse.alpha = 0.3, invisible = "quali", #mean.point = FALSE )+ labs(fill = "Season", color = "Contribution", alpha = "Contribution") + #coord_fixed(sqrt(pcaRes$eig[2] / pcaRes$eig[1])) + coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + geom_segment(aes(xend=c(tail(pcaRes$li$Axis1, n=-1), NA), yend=c(tail(pcaRes$li$Axis2, n=-1), NA)), arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"), linetype=1, size=1, alpha=0.7,color = c(rep("#4DBBD5FF",6), rep("#E64B35FF",9)))+ theme_pubr(margin = FALSE, base_size = base.size) #use geom_segment to ensure there is an arrow for every segment print(bi.plot.clear) bi.plot2.clear = fviz_pca_biplot(pcaRes, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = chemi.environ$location[6:20], #col.ind = fourpeople.select$ownership, select.var= list(name = c(names.biplot,chem.special)), pointshape = 21, pointsize = 6, palette = "npg", #addEllipses = TRUE, # Variables alpha.var =0.4 , repel = TRUE, col.var = "contrib", gradient.cols = gradient.color, label.size = 5, addEllipses = TRUE, #ellipse.type = "euclid", ellipse.level = 0.80, ellipse.alpha = 0.3, invisible = "quali", #mean.point = FALSE )+ labs(fill = "Location", color = "Contribution", alpha = "Contribution")+ #coord_fixed(sqrt(pcaRes$eig[2] / pcaRes$eig[1])) + coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + scale_fill_npg() + geom_segment(aes(xend=c(tail(pcaRes$li$Axis1, n=-1), NA), yend=c(tail(pcaRes$li$Axis2, n=-1), NA)),color="#3C5488FF", arrow=arrow(angle = 15,length=unit(0.4,"cm"), type="open"), linetype=1, size=1, alpha=0.7)+ theme_pubr(margin = FALSE, base_size = base.size) #use geom_segment to ensure there is an arrow for every segment print(bi.plot2.clear) bi.plot3.clear = fviz_pca_biplot(pcaRes, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = chemi.environ$location[6:20], #col.ind = fourpeople.select$ownership, select.var= list(name = c(names.biplot,chem.special)), pointshape = 21, pointsize = 6, palette = "npg", #addEllipses = TRUE, # Variables alpha.var =0.4 , repel = TRUE, col.var = "contrib", gradient.cols = gradient.color, label.size = 5, addEllipses = TRUE, #ellipse.type = "euclid", ellipse.level = 0.80, ellipse.alpha = 0.3, invisible = "quali", alpha.ind = 0.8 #mean.point = FALSE )+ labs(fill = "Location", color = "Contribution", alpha = "Contribution")+ #coord_fixed(sqrt(pcaRes$eig[2] / pcaRes$eig[1])) + coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + scale_fill_npg() + theme_pubr(margin = FALSE, base_size = base.size) #use geom_segment to ensure there is an arrow for every segment print(bi.plot3.clear) bi.plot4.clear = fviz_pca_biplot(pcaRes, # Individuals geom.ind = "point", geom.var = c("arrow", "text"), fill.ind = chemi.environ$season[6:20], #col.ind = fourpeople.select$ownership, select.var= list(name = c(names.biplot,chem.special)), pointshape = 21, pointsize = 6, palette = "npg", #addEllipses = TRUE, # Variables alpha.var =0.4 , repel = TRUE, col.var = "contrib", gradient.cols = gradient.color, label.size = 5, addEllipses = TRUE, #ellipse.type = "euclid", ellipse.level = 0.80, ellipse.alpha = 0.3, invisible = "quali", #mean.point = FALSE )+ labs(fill = "Season", color = "Contribution", alpha = "Contribution")+ #coord_fixed(sqrt(pcaRes$eig[2] / pcaRes$eig[1])) + coord_fixed((pcaRes$eig[2] / pcaRes$eig[1])^(1/ratio)) + theme_pubr(margin = FALSE, base_size = base.size) #use geom_segment to ensure there is an arrow for every segment print(bi.plot4.clear) plots = list(scree = scree, contrib = contrib, ind.1=ind.1, ind.1.clear = ind.1.clear, ind.1.path = ind.1.path, ind.2=ind.2, ind.2.clear = ind.2.clear, ind.3=ind.3, ind.4=ind.4, var.plot=var.plot, bi.plot=bi.plot, bi.plot2=bi.plot2, bi.plot.clear=bi.plot.clear, bi.plot2.clear = bi.plot2.clear, bi.plot3.clear = bi.plot3.clear, bi.plot4.clear=bi.plot4.clear) return(plots) } #all features #assign weights based on number of features between biotics and abiotics col.weight = c(rep(sqrt(dim(chemi.profile.filter)[1]/dim(chem.species.profile.filter)[1]), dim(chem.species.profile.filter)[1]), rep(1, dim(chemi.profile.filter)[1])) pcaRes = dudi.pca(combined.original[6:20,], scale = TRUE,scannf = FALSE, nf = 3, col.w = col.weight) #ignore weights pcaRes = dudi.pca(combined.original[6:20,], scale = TRUE,scannf = FALSE, nf = 3) #cca features pcaRes = dudi.pca(combined[6:20,], scale = TRUE,scannf = FALSE, nf = 3) #DNA only pcaRes = dudi.pca(t(chem.species.profile.filter)[6:20,], scale = FALSE,scannf = FALSE, nf = 3) #chemical only pcaRes = dudi.pca(t(chemi.profile.filter)[6:20,], scale = FALSE,scannf = FALSE, nf =3) chem.plots = chempca(pcaRes, eclispe = 0.68, ratio = 6, base.size=14) ``` ##c-means clustering of chemical stuff ```{r c-means cluster of chemicals} #clustering based on season #chemi_complete = cbind.data.frame(t.data.frame(chemi.profile[, 6:20]), season = chemi.environ[6:20,]$season) #chemi.season = aggregate(. ~ season, chemi_complete, mean) #chemi.season = chemi.season[, -1] #clustering all chemi.season = t.data.frame(chemi.profile[, 6:20]) chemi.season = chemi.season[, colSums(chemi.season)>0] chemi.var = apply(chemi.season, 2, var) var.threshold = quantile(chemi.var, 0.10) #quantile 10% variance filter chemi.season = chemi.season[, apply(chemi.season,2,var)>=var.threshold] #variance filter weight = c(apply(chemi.season, 2, var)) chemi.season.scale = scale(chemi.season) rownames(chemi.season.scale) = chemi.environ[6:20,]$location wtf = t(chemi.season.scale) #determining the number of clusters # Elbow method fviz_nbclust(wtf, kmeans, method = "wss") + geom_vline(xintercept = 6, linetype = 2)+ #this is hardcoded labs(subtitle = "Elbow method") # Silhouette method fviz_nbclust(wtf, kmeans, method = "silhouette")+ labs(subtitle = "Silhouette method") # Gap statistic # nboot = 50 to keep the function speedy. # recommended value: nboot= 500 for your analysis. # Use verbose = FALSE to hide computing progression. set.seed(123) fviz_nbclust(wtf, kmeans, nstart = 25, method = "gap_stat", nboot = 100)+ labs(subtitle = "Gap statistic method") set.seed(33) #for 5 clusters cm = cmeans(wtf, center=5, iter.max=500) table(cm$cluster) #cm$cluster = factor(cm$cluster, levels=c(1:length(table(cm$cluster)))) fviz_cluster(list(data = wtf, cluster=cm$cluster), geom = c("point"), ellipse = TRUE, ellipse.alpha = 0.6, #used to be 0.6 if only points are plotted. ellipse.type = "norm", ellipse.level = 0.68, palette = "npg", repel = TRUE) + theme_pubr(base_size = 18) + scale_color_manual(values=c("1"= "#3C5488FF","2"="#4DBBD5FF","5"="#E64B35FF","4"="#00A087FF", "3"="#F39B7FFF"))+ scale_fill_manual(values=c("1"= "#3C5488FF","2"="#4DBBD5FF","5"="#E64B35FF","4"="#00A087FF", "3"="#F39B7FFF")) colnames(wtf) = c(1:15) tempp = cbind.data.frame(wtf, weight=weight, cluster=cm$cluster, membership=apply(cm$membership, 1, max), taxon = row.names(wtf)) df.season.scale.group = tempp[tempp$membership>0.65,] table(df.season.scale.group$cluster) df.season.melt = melt(df.season.scale.group, id.vars = c("cluster","membership", "taxon","weight")) #df.season.scale.group = df.season.scale[, cm$cluster==3 & cm$membership[,3]>=0.7] #df.season.melt = melt(df.season.scale.group) #colnames(df.season.melt) = c("season", "taxon", "value") df.season.melt$variable = factor(df.season.melt$variable) df.season.melt$cluster = factor(df.season.melt$cluster) #+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) #green, blue, red, dark blue ggplot(data=df.season.melt, aes(x=variable, y=value, group=taxon, color=cluster, alpha=membership)) + #geom_line(stat="smooth",method = "loess", alpha=0.7)+ geom_line(size=1)+ geom_hline(yintercept = 0, linetype = 2)+ scale_x_discrete(label = chemi.environ[6:20,]$location)+ theme_pubr(base_size = 16, x.text.angle = 45) + labs(y="Scaled Values") + guides(alpha=FALSE, size=FALSE) + scale_color_manual(values=c("1"= "#3C5488FF","2"="#4DBBD5FF","5"="#E64B35FF","4"="#00A087FF", "3"="#F39B7FFF"))+ #scale_color_npg() + facet_wrap(~cluster, scales="free_y", ncol=5, strip.position = "top") + theme(strip.background = element_blank(), strip.placement = "outside") #============================================================== #plotting chemicals with patterns similar to selected chemicals chemi.season.names = colnames(chemi.res$r)[abs(chemi.res$r["PM1013", ])>=0.85] chemi.season.names = colnames(chemi.res$r)[abs(chemi.res$r["PM0006", ])>=0.85] #plotting chemicals with patterns similar to 3175 and 3177 chemi.season.names = colnames(chemi.res$r)[abs(chemi.res$r["PM3175", ])>=0.85|abs(chemi.res$r["PM3177", ])>=0.80] df.season.scale.group = tempp[chemi.season.names,] table(df.season.scale.group$cluster) df.season.melt = melt(df.season.scale.group, id.vars = c("cluster","membership", "taxon","weight")) df.season.melt$variable = factor(df.season.melt$variable) df.season.melt$cluster = factor(df.season.melt$cluster) ggplot(data=df.season.melt, aes(x=variable, y=value, color=cluster, group = taxon, alpha = membership)) + #geom_line(stat="smooth",method = "loess", size=1.5)+ geom_line(size=1)+ geom_hline(yintercept = 0, linetype = 2)+ #geom_smooth(alpha=0.5, size=0.5, se = FALSE)+ scale_x_discrete(label = chemi.environ[6:20,]$location)+ theme_pubr(base_size = 16, x.text.angle = 45) + labs(y="Scaled Values") + guides(alpha=FALSE, size=FALSE) + scale_color_manual(values=c("1"= "#3C5488FF","2"="#4DBBD5FF","5"="#E64B35FF","4"="#00A087FF", "3"="#F39B7FFF")) #examing cluster # of compounds of interets chem.special = c("PM1888", "PM0594", "PM1527", "PM1013", "PM0006", "PM1180", "PM3175", "PM3177", "NM1772", "PM0664", "NM0522", "NM1897", "PM1081", "NM2866", "PM1038", "PM3170", "PM3173") for (item in chem.special) { print(cm$cluster[item]) } ``` ##MDS section ```{r MDS of bray-curtis distance} #MDS for chemi.species.profile chemi.dist = vegdist(t(asinh(chem.species.profile)), na.rm = TRUE, method= "bray") chemi.mds = cmdscale(chemi.dist,eig= TRUE) out=data.frame(k=1:8,eig=chemi.mds$eig[1:8]) ggplot(data=out, aes(x=k, y=eig/sum(chemi.mds$eig)*100)) + geom_bar(stat="identity",width=0.5,color="orange",fill="pink") + labs( x= "Eigenfactors", y = "Percentage of variations explained") + theme_pubr() #MDS chemi.dist = vegan::vegdist(t(log(chemi.profile+1, base = 10)), na.rm = TRUE, method= "bray") chemi.mds = cmdscale(chemi.dist,eig= TRUE) out=data.frame(k=1:8,eig=chemi.mds$eig[1:8]) ggplot(data=out, aes(x=k, y=eig/sum(chemi.mds$eig)*100)) + geom_bar(stat="identity",width=0.5,color="orange",fill="pink") + labs( x= "Eigenfactors", y = "Percentage of variations explained") + theme_pubr() MDS = data.frame(PCo1 = chemi.mds$points[,1], PCo2 = chemi.mds$points[,2]) #labs=rownames(chemi.mds$points)) ggplot(data = MDS, aes(x=PCo1, y=PCo2,color = chemi.environ$season, label = chemi.environ$comments ) ) + geom_point(size=3) + geom_text_repel() + theme_pubr() #plotting mz distributions of PM and NM features #1968 PM, 1331 NM chemi.meta = chemi.meta[2:nrow(chemi.meta),] chemi.meta = data.frame(chemi.meta, mode = c(rep("Positive", 1968), rep("Negative", 1331))) ggdensity(chemi.meta, x = "mz", add = "median", #add.params=list(color="red"), rug = TRUE, color = "mode", fill = "mode", palette = c("#00AFBB", "#E7B800", "#3C5488FF"), ) + labs(x="m/z ratio") + theme_pubr(base_size = 16) wilcox.test(chemi.meta$mz[1:1968], chemi.meta$mz[1969:length(chemi.meta$mz)]) ``` ##cheminfo-direct correlation test ```{r cheminfo-direct correlation test} chem.special = c("PM1888", "PM0594", "PM1527", "PM1013", "PM0006", "PM1180", "PM3175", "PM3177", "NM1772", "PM0664", "NM0522", "NM1897", "PM1081", "NM2866", "PM1038", "PM3170", "PM3173") #pyridine.like = row.names(chemi.res$r)[abs(chemi.res$r["PM0006", ]) > 0.7] #labels = chemiprofile$MetabID[match(row.names(chemi.res$r)[abs(chemi.res$r["PM0006", ]) > 0.7], chemiprofile$Primary.ID)] #write.table(data.frame(ID = pyridine.like, label = labels, cor = chemi.res$r["PM0006", abs(chemi.res$r["PM0006", ]) > 0.7]), file = "~/Documents/Bioinfo/localanalysis/pyridine-like.txt", col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "|") chemi.interest = chemi.profile.filter[chem.special, 6:20] chemi.all = chemi.profile.filter[, 6:20] #all chemicals chemi.combined.small = rbind(chemi.interest, chem.species.profile.filter[, 6:20]) chemi.combined.all = rbind.data.frame(chemi.all, chem.species.profile.filter[, 6:20]) #cbind.data.frame is life, there also exists t.data.frame chemi.combined.small.data = cbind.data.frame(t(chemi.combined.small), Location = as.character(chemi.environ$location[6:20]), Date=as.character(chemi.environ$date.start[6:20]), Season = chemi.environ$season[6:20]) chemi.combined.all.data = cbind.data.frame(t(chemi.combined.all), Location = as.character(chemi.environ$location[6:20]), Season = chemi.environ$season[6:20]) #chemi.combined.small.data$Locale[13] = "Davis-CA" chemi.res = rcorr(t(chemi.combined.all), type="pearson") #all samples chemi.res2 = rcorr(t(chemi.combined.small), type="pearson") #attempting to adjust p value, up to debate if this fits exploratory analysis if(FALSE){ #adjusting only features temp.p = chemi.res2$P p.adj.matrix = apply(temp.p, 1, function(x){ p.adjust(x, method="fdr")}) #adjusting all p values temp.p = c(chemi.res2$P) temp.p[is.na(temp.p)] = 0 temp.p.adj = p.adjust(temp.p, method = "fdr") p.adj.matrix = matrix(temp.p.adj, nrow = ncol(chemi.res2$P[1:13,]), ncol = ncol(chemi.res2$P[1:13,]), dimnames = list(rownames = colnames(chemi.res2$P[1:13,]), colnames = colnames(chemi.res2$P[1:13,]))) } #not adjusting #p.adj.matrix = chemi.res2$P #super slow warning if (FALSE){ m = corrplot::corrplot(overallres$r, p.mat = overallres$P, sig.level = 0.05, insig = "blank", tl.cex=0.6, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1)) } m = corrplot::corrplot(chemi.res2$r, p.mat = p.adj.matrix, sig.level = 0.05, order = "hclust", insig = "blank", tl.cex=0.3, type="lower", method="square") m = corrplot::corrplot(chemi.res2$r[1:15,], p.mat = p.adj.matrix[1:15,], sig.level = 0.05, insig = "blank", tl.cex=0.4, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1)) m = corrplot::corrplot(chemi.res2$r[1:17,], p.mat = p.adj.matrix[1:17,], sig.level = 0.05, insig = "blank", tl.cex=0.4, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1)) #this is a good selectoin procedure, anything correlating better than 0.6 with any of the chemical get selected interest.df = chemi.res2$r[1:15, colSums(abs(chemi.res2$r[1:15,]) >= 0.6, na.rm = TRUE)> 0] interest.df = chemi.res2$r[1:15, colSums(abs(chemi.res2$r[1:15,]) >= 0.7, na.rm = TRUE)> 0] corrplot::corrplot(chemi.res2$r[colnames(interest.df),colnames(interest.df)], p.mat = p.adj.matrix[colnames(interest.df), colnames(interest.df)], sig.level = 0.05, insig = "blank", tl.cex=0.5, method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1)) if (FALSE){ #for the additional seasonal chemicals interest.df = chemi.res2$r[1:21, colSums(abs(chemi.res2$r[1:21,]) >= 0.6, na.rm = TRUE)> 0] interest.df = chemi.res2$r[1:21, colSums(abs(chemi.res2$r[1:21,]) >= 0.7, na.rm = TRUE)> 0] corrplot::corrplot(chemi.res2$r[colnames(interest.df),colnames(interest.df)], p.mat = p.adj.matrix[colnames(interest.df), colnames(interest.df)], sig.level = 0.05, insig = "blank", tl.cex=0.5, method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,0.1,0.1,0.1)) } #checking if any chemicals are correlated with other chemicals/biologicals. Seems that there are TONS of chemical features that correlate with seasonal patterns in our dataset colnames(chemi.res$r)[abs(chemi.res$r["PM3175", ])>=0.8|abs(chemi.res$r["PM3177", ])>=0.8] #flower scent #PM1038 Lilac aldehyde, (2R,2'R,5'R)- #body scent chemi.res$r[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")] m = corrplot::corrplot(chemi.res$r[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")], p.mat = chemi.res$P[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")], sig.level = 0.05, insig = "blank", tl.cex=0.8, type="upper", method = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,10,0.1,0.1)) m = corrplot::corrplot(chemi.res$r[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")], p.mat = chemi.res$P[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")], sig.level = 0.05, insig = "blank", tl.cex=0.8, type="upper", method = "number", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(0.1,10,0.1,0.1)) #mixed plot for values AND correlations. m = corrplot::corrplot.mixed(chemi.res$r[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")], p.mat = chemi.res$P[c("PM0664","NM0522", "NM1897", "PM1081"), c("PM0664","NM0522", "NM1897", "PM1081")], sig.level = 0.05, insig = "blank", tl.pos = "d", tl.cex=0.8,lower = "number", upper = "color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black", mar=c(1,1,1,1)) # npg style #+ scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) #green, blue, red, dark blue a = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM3177), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("PM3177"), limits=c(-2.5,2.5)) + theme_pubr(x.text.angle= 45) + rremove("xlab") + rremove("x.text") + scale_fill_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) b = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM3175), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("PM3175"), limits=c(-2.5,2.5)) + scale_x_discrete(labels = chemi.combined.small.data$Location) + theme_pubr(x.text.angle= 45) + scale_fill_manual(values=c("spring"="#00A087FF", "winter"="#3C5488FF")) b.2= ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM3175), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("M3175"), limits=c(-2,2)) + theme_pubr(x.text.angle= 45) + rremove("xlab") + rremove("x.text") + scale_fill_npg() ggarrange(a, b, nrow =2, common.legend=TRUE, heights = c(1,1.7) ) c = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM1180), fill=Location )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("Geosmin"), limits=c(-2.5,2.5)) + theme_pubr(base_size = 14, x.text.angle= 45) + rremove("xlab") + rremove("x.text") + scale_fill_npg() d = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM1888), fill=Location )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("Omethoate"), limits=c(-2.5,2.5)) + theme_pubr(base_size = 14,x.text.angle= 45) + rremove("xlab") + rremove("x.text") + scale_fill_npg() e = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM1013), fill=Location )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("Phthalate"), limits=c(-2.5,2.5)) + scale_x_discrete(labels = chemi.combined.small.data$Location) + theme_pubr(base_size = 14,x.text.angle= 45) + scale_fill_npg() e.2 = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM1013), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("M1013"), limits=c(-3.5,3.5)) + theme_pubr(base_size = 14,x.text.angle= 45) + scale_fill_npg() can = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Canis), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("Canis"), limits=c(-3.5,3.5)) + scale_x_discrete(labels = chemi.combined.small.data$Locale) + theme_pubr(x.text.angle= 45) + scale_fill_npg() ggarrange(c,can, nrow =2, common.legend=TRUE, heights = c(1,1.7)) ggarrange(c, d, e, nrow =3, common.legend=TRUE, heights = c(1,1,1.7)) f = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM1527), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("DEET"), limits=c(-2.5,2.5)) + theme_pubr(base_size = 16, x.text.angle= 45) + rremove("xlab") + rremove("x.text") + scale_fill_npg() g = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM0594), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("DEG"), limits=c(-2.5,2.5)) + theme_pubr(base_size = 16, x.text.angle= 45) + rremove("xlab") + rremove("x.text") + scale_fill_npg() g.2 = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(PM0594), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("DEG"), limits=c(-2.5,2.5)) + scale_x_discrete(labels = chemi.combined.small.data$Locale) + theme_pubr(base_size = 16,x.text.angle= 45) + scale_fill_npg() ggarrange(f,g.2, nrow =2, common.legend=TRUE, heights = c(1,1.7)) ggarrange(f,g,e.2, nrow =3, common.legend=TRUE, heights = c(1,1,2)) i = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Rhizopogon), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("Rhizopogon"), limits=c(-2.5,2.5)) + theme_pubr(x.text.angle= 45) + scale_fill_npg() ggarrange(a, i, nrow =2, common.legend=TRUE, heights = c(1,1.7)) k = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Gelatoporia), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("Gelatoporia"), limits=c(-2.0,2.0)) + scale_x_discrete(labels = chemi.combined.small.data$Locale) + theme_pubr(x.text.angle= 45) + scale_fill_npg() ggarrange(a, b.2, k, nrow =3, common.legend=TRUE, heights = c(1,1,2)) k = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Spirosoma), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("Spirosoma"), limits=c(-2.0,2.0)) + scale_x_discrete(labels = chemi.combined.small.data$Locale) + theme_pubr(x.text.angle= 45) + scale_fill_npg() ggarrange(a, b.2, k, nrow =3, common.legend=TRUE, heights = c(1,1,2)) k.2 = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Castanea), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("Castanea"), limits=c(-2.0,2.0)) + scale_x_discrete(labels = chemi.combined.small.data$Locale) + theme_pubr(x.text.angle= 45) + rremove("xlab") + rremove("x.text") +scale_fill_npg() k.2 = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Pedobacter), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("Pedobacter"), limits=c(-2.0,2.0)) + scale_x_discrete(labels = chemi.combined.small.data$Locale) + theme_pubr(x.text.angle= 45) + rremove("xlab") + rremove("x.text") +scale_fill_npg() #seasonal chemicals and abiotics ggarrange(a, b.2, k.2, k, nrow =4, common.legend=TRUE, heights = c(1,1,1,1.7)) l = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(Dyadobacter), fill=Season )) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0("Dyadobacter"), limits=c(-2.0,2.0)) + scale_x_discrete(labels = chemi.combined.small.data$Locale) + theme_pubr(x.text.angle= 45) + scale_fill_npg() ggarrange(a, b.2, l, nrow =3, common.legend=TRUE, heights = c(1,1,2)) #stacking plots together, stored in vector, labels are chemical names, biological names just stay the same corplot.stacker = function(vector, labels = c(), chemical = "PM0006", cheminame = "Pyridine", limit=3, base.size=16){ plot.list=list() for (i in 1:length(vector)) local({ item=vector[i] if(length(labels)){ name = labels[i] } else { name = item } p1 = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(get(item)), fill=Location) ,environment = environment()) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0(name), limits=c(-limit,limit)) + theme_pubr(x.text.angle= 45) + theme_pubr(base_size = base.size) + rremove("xlab") + rremove("x.text") + scale_fill_npg() plot.list[[item]] <<- p1 #super assignment, override scope print(plot.list[[item]]) }) b.plot = ggplot(chemi.combined.small.data, aes(x=Date, y=scale(get(chemical)), fill=Location) ,environment = environment()) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0(cheminame), limits=c(-limit,limit)) + scale_x_discrete(labels = chemi.combined.small.data$Location) + theme_pubr(base_size = base.size, x.text.angle= 45) + rremove("xlab") + scale_fill_npg() plot.list[[cheminame]] = b.plot return(ggarrange(plotlist = plot.list, nrow = length(plot.list), common.legend = TRUE, heights = c(rep(1, length(plot.list)-1),1.7))) } plots.stack = corplot.stacker(colnames(chemi.res2$r)[abs(chemi.res2$r["PM0006", ])>0.7][-1], base.size = 16) plots.stack.chems = corplot.stacker(vector = colnames(chemi.res2$r)[abs(chemi.res2$r["PM1888", ])>0.7][-1], labels = c("Phthalate", "Geosmin", "Caprylic acid"), chemical = "PM1888", cheminame = "Omethoate") plots.stack.show = corplot.stacker(vector = c("PM0594", "NM0522", "PM1527"), labels = c("DEG", "Caproic acid", "DEET"), chemical = "NM1772", cheminame = "Triclosan") plots.stack.test = corplot.stacker(vector = c("PM3170", "PM3173", "PM3175"), labels = c("PM3170", "PM3173", "PM3175"), chemical = "PM3177", cheminame = "PM3177") plots.stack2 = corplot.stacker(c("Peniophora", "Sanghuangporus"), chemical = "PM0594", cheminame="DEG") plots.stack2 = corplot.stacker(c("Trametes", "Peniophora", "Sanghuangporus"), chemical = "PM0664", cheminame="Caproic acid") #1 vs 1 corplot = function(a, b, cheminame = a, cheminame2 = b, limit=3){ a.plot = ggplot(chemi.combined.all, aes(x=Date, y=scale(get(a)), fill=Location) ,environment = environment()) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0(cheminame), limits=c(-limit,limit)) + scale_x_discrete(labels = chemi.combined.small.data$Location) + theme_pubr(margin = FALSE, base_size = 14, x.text.angle= 45) + rremove("xlab") + rremove("x.text") + scale_fill_npg() b.plot = ggplot(chemi.combined.all, aes(x=Date, y=scale(get(b)), fill=Location) ,environment = environment()) + geom_bar(stat="identity") + geom_hline(yintercept = 0, linetype = 2) + scale_y_continuous(name = paste0(cheminame2), limits=c(-limit,limit)) + scale_x_discrete(labels = chemi.combined.small.data$Location) + theme_pubr(margin = FALSE, base_size = 14, x.text.angle= 45) + scale_fill_npg() + rremove("xlab") stack.plot = ggarrange(a.plot, b.plot, nrow =2, common.legend=TRUE, heights = c(1,1.7)) return(stack.plot) } corplot("PM0006", "Gymnopus") corplot("PM0594", cheminame = "DEG", "Peniophora", limit=2) corplot("PM1038", cheminame = "Lilaic_acid", "Sanghuangporus", limit=3) corplot("PM1180", cheminame = "Geosmin", b = "NM1897", cheminame2 = "Caprylic acid", limit=3) corplot("PM0594", cheminame = "DEG", b = "PM0664", cheminame2 = "Caproic acid", limit=3) ggplot(data.frame(chemi.combined.small.data), aes(x=as.numeric(PM0594), y=as.numeric(PM1527))) + geom_point() + geom_smooth(method = "lm") + stat_poly_eq(formula = y~x, #eq.with.lhs = "italic(y)~`=`~", aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), parse = TRUE) + theme_pubr(base_size = 14, margin = FALSE) ggplot(data.frame(t(chemi.combined.small.data)), aes(x=as.numeric(M1888), y=as.numeric(M1180))) + geom_point() + geom_smooth(method = "lm") + stat_poly_eq(formula = y~x, #eq.with.lhs = "italic(y)~`=`~", aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), parse = TRUE) ``` ##chemi features direct comparisons ```{r chemi significant features between "campus" and others} chemi.combined.all.data$group = ifelse(chemi.combined.all.data$Location %in% c("Campus","SF-CA"), "Campus", "Non-campus") chemi.melt = melt(chemi.combined.all.data, id.vars = c("Location", "Season", "group")) chemi.location.results = compare_means(formula = value ~ group, data = chemi.melt, group.by = "variable", p.adjust.method = "fdr") chemi.season.results = compare_means(formula = value ~ Season, data = chemi.melt, group.by = "variable", p.adjust.method = "fdr") ``` ##Correlation extraction from large matrix ```{r correlation extraction from large matrix} #a function to extract correlation pairs that have values higher than a certain threshold corextract = function(rcorr, threshold = 0.8, alpha = 0.05){ #rcorr$r #correlation matrix #rcorr$P #p value matrix nameskeep = c() #store all names that satisfy the criteria for (x in rownames(rcorr$r)){ for (y in rownames(rcorr$r)){ #print(rcorr$r[x,y]) #print(rcorr$P[x,y]) if (abs(rcorr$r[x, y]) >= threshold & rcorr$P[x, y] <= alpha & !is.na(rcorr$P[x, y])) { nameskeep = c(nameskeep, x, y) } } } nameskeep = unique(nameskeep) return(list(r = rcorr$r[nameskeep, nameskeep], P = rcorr$P[nameskeep, nameskeep])) } chemi.select = corextract(chemi.res) chemi.select.2 = corextract(chemi.res2) kk = corrplot::corrplot(chemi.select.2$r, p.mat = chemi.select.2$P, sig.level = 0.05, insig = "blank", tl.cex=0.3, type="lower", method="color") nn = corrplot::corrplot(chemi.select$r, p.mat = chemi.select$P, sig.level = 0.05, insig = "blank", tl.cex=0.3, type="lower", method="color") ``` #World map plot ```{r worldmap plot} library(ggmap) #setting up world map of different detail level - watercolor #mymap.big = get_stamenmap(bbox = c(-175, -50, 160, 71), zoom = 5, maptype= "watercolor") #qmap.big = ggmap(mymap.big) mymap.small = get_stamenmap(bbox = c(-175, -50, 160, 71), zoom = 4, maptype= "watercolor") qmap.watercolor = ggmap(mymap.small) #setting up world map of different detail level - mymap.small.lite = get_stamenmap(bbox = c(-175, -50, 160, 71), zoom = 3, maptype= "toner-lite") qmap.lite = ggmap(mymap.small.lite) mymap.small.toner = get_stamenmap(bbox = c(-175, -50, 160, 71), zoom = 3, maptype= "toner") qmap.toner = ggmap(mymap.small.toner) #setting up world map of terrain mymap.small.terrain = get_stamenmap(bbox = c(-175, -50, 160, 71), zoom = 3, maptype= "terrain") qmap.terrain = ggmap(mymap.small.terrain) #setting up NA map geocode("USA") na.map.terrain = get_stamenmap(bbox = c(-130, 23, -65, 53), maptype= "terrain", zoom = 4, force =TRUE) namap.terrain = ggmap(na.map.terrain) na.map.lite = get_stamenmap(bbox = c(-130, 23, -65, 53), maptype= "toner-lite", zoom = 4, force =TRUE) namap.lite = ggmap(na.map.lite) na.map.wc = get_stamenmap(bbox = c(-130, 23, -65, 53), maptype= "watercolor", zoom = 6, force =TRUE) namap.wc = ggmap(na.map.wc) #setting up bayarea map stan = geocode("Stanford University") stan = as.numeric(stan) bayarea.map.terrain = get_stamenmap(bbox = c(stan[1]-0.4, stan[2]-0.3, stan[1]+0.4, stan[2]+0.5), maptype= "terrain") bmap.terrain = ggmap(bayarea.map.terrain) bayarea.map.lite = get_stamenmap(bbox = c(stan[1]-0.4, stan[2]-0.3, stan[1]+0.4, stan[2]+0.5), maptype= "toner-lite", force =TRUE) bmap.lite = ggmap(bayarea.map.lite) bayarea.map.wc = get_stamenmap(bbox = c(stan[1]-0.4, stan[2]-0.3, stan[1]+0.4, stan[2]+0.5), maptype= "watercolor", force =TRUE, zoom=12) bmap.wc = ggmap(bayarea.map.wc) #ggsave("zoom4worldmap.pdf", qmap.small, width= 16, height = 12) p1.place = read.csv("~/Documents/Bioinfo/places/backup/patient1places.csv", header =TRUE, stringsAsFactors = FALSE) p1.place = as.tibble(p1.place[, 1:7]) p3.place = read.csv("~/Documents/Bioinfo/places/backup/P3places.csv", header = TRUE, stringsAsFactors = FALSE) p3.place = as.tibble(p3.place[, 1:7]) mike.place = read.csv("~/Documents/Bioinfo/places/backup/Mikeplaces.csv", header = TRUE, stringsAsFactors = FALSE) mike.place = as.tibble(mike.place[, 1:7]) #environ.mike #environ.no3 #environ.p1 #fitering place data according to filter data datecompare = function(environ, place){ select.list = c() #environ = environ.mike #place = mike.place for (i in 1:dim(environ)[1]){ select = (as.Date(place$Date, "%m/%d/%y") >= as.Date(environ[i,]$date.start)) & (as.Date(place$Date, "%m/%d/%y") <= as.Date(environ[i,]$date.end)) select.list = c(select.list, row.names(place)[select]) #print(row.names(place)[select]) } select.list = as.numeric(unique(select.list)) #print(select.list) return( place[select.list, ]) } mike.place.filtered = datecompare(environ.mike, mike.place) p1.place.filtered = datecompare(environ.p1, p1.place) p3.place.filtered = datecompare(environ.no3, p3.place) # "#8491B4FF" grey, "#00A087FF" greenish. # scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) #green, blue, red, dark blue #shape 19 are dots, shape 1 are circles #old testing chunk if(FALSE){ qmap.small + geom_point(data = mike.place.filtered, aes(x=Longitude, y=Latitude), alpha = 0.1, color = "#00A087FF", shape = 1, size=3) + geom_point(data = p1.place.filtered, aes(x=Longitude, y=Latitude), alpha = 0.1, color = "#E64B35FF", shape = 1, size=3) + geom_point(data = p3.place.filtered, aes(x=Longitude, y=Latitude), alpha = 0.1, color = "#3C5488FF", shape = 1, size=3) + labs(x="Longitude", y="Latitude") } #ggsave("worldmap_watercolor.pdf", width=12, height=8) #setting up regional maps for USA and Europe #mapobj is a time-consuming step, should be done beforehand plotmap = function(mapobj, point.shape = 19, point.size = 1.5, point.alpha=0.1, file.name = "map_output.pdf"){ mapobj + geom_point(data = mike.place.filtered, aes(x=Longitude, y=Latitude), alpha = point.alpha, color = "#00A087FF", shape = point.shape, size=point.size) + geom_point(data = p1.place.filtered, aes(x=Longitude, y=Latitude), alpha = point.alpha, color = "#E64B35FF", shape = point.shape, size=point.size) + geom_point(data = p3.place.filtered, aes(x=Longitude, y=Latitude), alpha = point.alpha, color = "#3C5488FF", shape = point.shape, size=point.size) + labs(x="Longitude", y="Latitude") ggsave(paste0("~/Documents/Bioinfo/DNAformal/", file.name), width=8, height=6) } plotmap(qmap.lite, point.alpha = 0.1, point.shape = 19, file.name = "world.lite.pdf") plotmap(qmap.terrain, point.shape = 1, point.alpha = 0.4, file.name = "world.terrain.pdf") plotmap(bmap.terrain, point.alpha = 0.4, file.name = "bayarea.terrain.pdf", point.shape = 1) plotmap(bmap.lite, point.alpha = 0.4, file.name = "bayarea.lite.pdf", point.shape = 19) plotmap(bmap.wc, point.alpha = 0.4, file.name = "bayarea.wc.pdf", point.shape=19) plotmap(namap.lite, point.alpha = 0.4, file.name = "na.lite.pdf", point.shape =19) plotmap(namap.terrain, point.alpha = 0.4, file.name = "na.terrain.pdf", point.shape =1) ``` ##MEM plotting and analyses ```{r MEM variables plotting and investigation} #please load basic map from last chunck of code "world map plot" # expo.xy was defined during sptail analysis #MEM.select was defined in spatial variable construction plot.MEM.select = MEM.select colnames(plot.MEM.select) = c("MEM1", "MEM69", "MEM83", "MEM89", "MEM91", "MEM94") plotmap.MEM = function(mapobj, point.shape = 21, point.alpha=0.4, file.name = "map_output.pdf"){ for (item in colnames(plot.MEM.select)){ item = item #geom_beeswarm could scatter multiples points on the same spot object704 = mapobj + geom_point(data = cbind.data.frame(expo.xy, plot.MEM.select), aes_string(x="longitude", y="latitude", fill=item), alpha = point.alpha, shape = point.shape, size=4) + scale_fill_gradient2(low = muted("red"), mid = "white", high = muted("blue"), midpoint = 0, space = "Lab", na.value = "grey50", guide = "colourbar")+ labs(x="Longitude", y="Latitude") print(object704) } #ggsave(paste0("~/Documents/Bioinfo/DNAformal/", file.name), width=8, height=6) return(object704) } object704 = plotmap.MEM(qmap.watercolor) MEM.res = rcorr(plot.MEM.select, t(master.asinh.cpm.single), type="pearson") #takes output from rcorr function, adjusting p value matrix accroding to fdr principles matrixPadjust = function(res, select = c(1:nrow(res$P))){ print(select) temp.p = c(res$P) temp.p[is.na(temp.p)] = 0 temp.p.adj = p.adjust(temp.p, method = "fdr") p.adj.matrix = matrix(temp.p.adj, nrow = ncol(res$P), ncol=ncol(res$P), dimnames = list(rownames = colnames(res$P), colnames = colnames(res$P))) return(p.adj.matrix) } adjusted.res2.p = matrixPadjust(MEM.res) #pls package also has a function called corrplot corrplot::corrplot(MEM.res$r, order = "hclust", p.mat = adjusted.res2.p, sig.level = 0.01, insig = "blank", diag = FALSE, tl.cex=0.5, addrect = 12, method="color", outline=TRUE, col=brewer.pal(n=10, name="PuOr"), tl.srt=45, tl.col="black") for (i in 1:6){ print(MEM.res$r[i,][abs(MEM.res$r[i,]) > 0.4]) print(adjusted.res2.p[i,][abs(MEM.res$r[i,]) > 0.4]) } #coding difference values into geo regions plot(plot.MEM.select[, "MEM94"], master.cpm.single["Ophiognomonia", ]) MEM94.cate <- cut(plot.MEM.select[, "MEM94"], breaks = c(-Inf, 0, 0.04, 0.06, Inf), labels = c("Campus", "Non-US", "US" , "P2-location"), right = FALSE) plot(MEM94.cate, master.asinh.cpm.single["Ophiognomonia", ]) table(MEM94.cate) #check categories assignment ggplot(cbind.data.frame(expo.xy, cate=MEM94.cate), aes(x = longitude, y=latitude, color = cate)) + geom_beeswarm(alpha=0.4) + theme_pubr() + scale_color_nejm() ggplot(cbind.data.frame(category = MEM94.cate, Ophiognomonia=master.asinh.cpm.single["Ophiognomonia", ]), aes(x=category, y=Ophiognomonia, fill= category)) + geom_boxplot() + theme_pubr(base_size = 16) + scale_color_nejm() plotmap.MEM94 = function(mapobj, point.shape = 21, point.alpha=0.4, file.name = "map_output.pdf"){ #geom_beeswarm could scatter multiples points on the same spot object704 = mapobj + geom_beeswarm(data = cbind.data.frame(expo.xy, Ophiognomonia = master.asinh.cpm.single["Ophiognomonia", ], MEM94 = plot.MEM.select[, "MEM94"]), aes_string(x="longitude", y="latitude", fill="Ophiognomonia"), alpha = point.alpha, shape = point.shape, size=4) + scale_fill_gradient2(low = muted("red"), mid = "white", high = muted("blue"), midpoint = 0, space = "Lab", na.value = "grey50", guide = "colourbar")+ labs(x="Longitude", y="Latitude") print(object704) #ggsave(paste0("~/Documents/Bioinfo/DNAformal/", file.name), width=8, height=6) return(object704) } #object269 = plotmap.MEM94(bmap.wc) #MEM83 #positive results from pearson correlations MEM83.relevant = c("Rubritalea", "Anaerobacillus", "Flagellimonas", "Psychrilyobacter", "Ophiognomonia") apply(master.asinh.cpm.single[MEM83.relevant,], 1, function(x){sum(x>0)}) for (item in MEM83.relevant[-5]){ plot(plot.MEM.select[, "MEM83"], master.asinh.cpm.single[item, ], ylab=item) } ``` #Occasional exposure ```{r Occasional exposure} #occasional exposure requires case by case validation, as such, one should be very careful when using species level results species.list = c("Streptococcus pneumoniae", "Klebsiella pneumoniae", "Bacillus anthracis", "Dolosigranulum pigrum", "Acinetobacter baumannii", "Staphylococcus aureus", "Haemophilus influenzae", "Haemophilus parainfluenzae", "Peptoclostridium difficile", "Enterobacter cloacae", "Clostridium perfringens","Gemella haemolysans") #https://www.moldbacteria.com/mold-types.html molds.list = c("Penicillium capsulatum", "Cladosporium sphaerospermum","Aureobasidium pullulans","Aspergillus fumigatus", "Aspergillus niger", "Stachybotrys chartarum", "Alternaria alternata", "Fusarium fujikuroi", "Trichoderma asperellum") claim.list = c("Phytophthora lateralis", "Metaseiulus occidentalis", "Pediculus humanus", "Dermatophagoides farinae","Tetranychus urticae", "Rhagoletis zephyria", "Sphyracephala brevicornis", "Condylostylus patibulatus","Aedes albopictus", "Apis mellifera", "Blattella germanica", "Adineta vaga") #"Acyrthosiphon pisum" #"Streptococcus suis", #name checking for plot #row.names(master.cpm.single)[grepl("Pseudomonas", row.names(master.cpm.single), ignore.case=TRUE)] row.names(master.cpm.single)[grepl("aspergillus", row.names(master.cpm.single), ignore.case=TRUE)] if(FALSE){ check_outlier <- function(v, coef=1.5){ quantiles <- quantile(v,probs=c(0.25,0.75)) IQR <- quantiles[2]-quantiles[1] res <- v < (quantiles[1]-coef*IQR)|v > (quantiles[2]+coef*IQR) return(res) } } #for occaions exposures, use this function check_outlier = function(v, coef = 0.1){ threshold = max(v) * coef res = (v >= threshold) return(res)} generalexpo = function(v = c(), profile, base.size=14){ #profile = scores_single_complete #v = species.list .e = environment() expo.master.cpm = master.cpm.single[, profile$samplenames] expo.combined = cbind.data.frame(profile, t(expo.master.cpm)) plot.list = list() for (item in v) local({ item = item label = rep("", dim(profile)[1]) label[check_outlier(expo.combined[,item])] = as.character(profile$location[check_outlier(expo.combined[,item])]) p = ggplot(expo.combined, aes(x = date.mid, y = get(item), label=label, fill=aownership, size = get(item))) plot = p + geom_point(aes(color = season, fill=aownership), alpha=0.6, shape = 21) + geom_text_repel(show.legend = F)+ theme_pubr(base_size = base.size, margin = FALSE) + scale_color_manual(values=c("#00A087FF","#4DBBD5FF","#E64B35FF","#3C5488FF")) + scale_fill_d3(palette = "category20") + scale_size(range=c(4,7)) + #adjusting the default range to make dots and labels more visible theme(axis.text.x = element_text(size = 12)) + #scale_shape_manual(values= c(21,22,23,24)) + labs(x = "", y = item, fill = "Individual") + guides(size = FALSE) #print(plot) plot.list[[item]] <<- plot #super assignment, override scope print(plot.list[[item]]) }) return(plot.list) } generalexpo.list = generalexpo(species.list, scores_single_complete, base.size = 16) stack = ggarrange(plotlist = generalexpo.list, ncol = 3, nrow=4, legend="none", align = "v") #for molds molds.figures = generalexpo(molds.list, scores_single_complete, base.size =16) stack.molds = ggarrange(plotlist = molds.figures, ncol = 3, nrow=3, legend="none", align = "v") #for claims on animals arthro.figures = generalexpo(claim.list, scores_single_complete, base.size=16) stack.arthro = ggarrange(plotlist = arthro.figures, ncol = 3, nrow=4, legend="none", align = "v") honey_bee_samples = c(colnames(master.asinh.cpm.single)[master.asinh.cpm.single["Apis", ] > 0]) scores_single_complete$season[which(scores_single_complete$samplenames %in% honey_bee_samples)] ``` #popgene ```{r popgene} virus.stats = read.table(file="~/Documents/Bioinfo/popgene/topviruses/topviruses.analysis", sep = "\t", stringsAsFactors = FALSE, fill=TRUE, header=TRUE) virus.stats = cbind.data.frame(virus.stats, name=virus.stats.temp$V2) virus.stats = virus.stats[order(virus.stats$name),] virus.stats.filter = virus.stats[virus.stats$Effective_length/virus.stats$Total_length >= 0.4,] virus.stats.filter = virus.stats.filter[order(virus.stats.filter$SNP),] plot(x=virus.stats.filter$SNP,y=virus.stats.filter$Raw_coverage) plot(x=virus.stats.filter$SNP, y=virus.stats.filter$PI) #overview plots ggplot(virus.stats.filter, aes(x = SNP, y = PI, color = name)) + geom_point(size = 4, alpha=0.6) + geom_smooth(aes(group=1),method="lm", se=FALSE) + stat_poly_eq(formula = y~x, #eq.with.lhs = "italic(y)~`=`~", aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~"), group=1), parse = TRUE) + scale_color_d3("category20") + theme_pubr(base_size = 14, legend = "right", border=TRUE, margin =FALSE) ggplot(virus.stats.filter, aes(x = SNP, y = Raw_coverage, color = name)) + geom_point(size = 4, alpha=0.6) + scale_color_d3("category20") + ylab("Coverage_depth")+ theme_pubr(base_size = 14, legend = "right", border=TRUE, margin = FALSE) ggplot(virus.stats.filter, aes(x = SNP, y = Effective_length/Total_length, color = name)) + geom_point(size = 4, alpha=0.6) + scale_color_d3("category20") + ylab("Coverage_width") + theme_pubr(base_size = 14, legend = "right", border=TRUE, margin = FALSE) snp = ggplot(virus.stats.filter, aes(x = name, y = SNP, color = name)) + geom_point(size = 4, alpha=0.6) + scale_color_d3("category20") + labs(y="SNP/Kbp", x="Species") + theme_pubr(base_size = 10, legend = "none", border=TRUE, margin = FALSE, x.text.angle = 45) pi = ggplot(virus.stats.filter, aes(x = name, y = PI, color = name)) + geom_point(size = 4, alpha=0.6) + scale_color_d3("category20") + labs(y="PI", x="Species") + theme_pubr(base_size = 10, legend = "none", border=TRUE, margin = FALSE, x.text.angle = 60) + rremove("xlab") + rremove("x.text") #stacked ggarrange(plotlist = list(pi, snp), nrow=2, heights= c(1,1.8)) #getting prevalence information for each species, now virus prevalence = c() for (item in virus.stats.filter$name){ prevalence = c(prevalence, assign(item,sum(master.cpm.single[item, ]>0))) } ``` ##popgeneoverall ```{r overall population genetics} virus.stats = read.table(file="~/Documents/Popgenetics/topviruses.final.analysis", sep = "|", stringsAsFactors = FALSE, header=TRUE) fungi.stats =read.table(file="~/Documents/Popgenetics/topfungi.final.analysis", sep = "|", stringsAsFactors = FALSE, header=TRUE) bacteria.stats = read.table(file="~/Documents/Popgenetics/topbacteria.final.analysis", sep = "|", stringsAsFactors = FALSE, header=TRUE) misc.stats = read.table(file="~/Documents/Popgenetics/topmisc.final.analysis", sep = "|", stringsAsFactors = FALSE, header=TRUE) virus.stats = cbind(virus.stats, type = "Viruses") fungi.stats = cbind(fungi.stats, type = "Fungi") bacteria.stats = cbind(bacteria.stats, type = "Bacteria") misc.stats = cbind(misc.stats, type = "Others") virus.stats = virus.stats[order(virus.stats$COV),] fungi.stats = fungi.stats[order(fungi.stats$COV),] #bacteria.stats = bacteria.stats[order(bacteria.stats$Species, bacteria.stats$COV),] bacteria.stats = bacteria.stats[order(bacteria.stats$COV),] misc.stats = misc.stats[order(misc.stats$COV),] all.stats = rbind.data.frame(misc.stats, fungi.stats, bacteria.stats, virus.stats) all.stats$Species = factor(all.stats$Species, levels = c(all.stats$Species)) all.stats$type = factor(all.stats$type, levels = c("Bacteria", "Fungi", "Viruses", "Others")) #> colnames(all.stats) #[1] "Species" "SNV" "PI" "COV" "ESIZE" "SIZE" "type" # c("Bacteria", "Fungi", "Viridiplantae", "Metazoa","Alveolata", "Archaea") #taxoncolor = c("#F8766D", "#B79F00", "#00BA38", "#00BFC4", "#619CFF", "#F564E3") point.size = 7 basesize = 34 #6 was fine COV = ggplot(all.stats, aes(x = Species, y= log(COV, base = 10), fill= type)) + geom_point(shape = 21, alpha=0.8, size =point.size) + scale_x_discrete(all.stats$Species) + theme_pubr(base_size = basesize, x.text.angle = 60) + labs(fill = "Kingdom/Subkingdom", y = "Coverage") + theme(axis.text.x = element_text(size = 10)) + scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + geom_hline(yintercept = 2, linetype = 2, color = "#737373") + geom_hline(yintercept = 2.7, linetype = 2, color = "#737373") +rremove("xlab") SNV = ggplot(all.stats, aes(x = Species, y= log(SNV, base = 10), fill= type, group=type)) + geom_point(shape = 21, alpha=0.8, size = point.size) + scale_x_discrete(all.stats$Species) + theme_pubr(base_size = basesize) + labs(fill = "Kingdom/Subkingdom", y = "SNPs/Kbp") + rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + scale_color_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + geom_hline(yintercept = 2, linetype = 2, color = "#737373") + geom_hline(yintercept = 1, linetype = 2, color = "#737373") + geom_smooth(data = all.stats[-c(1,2),], aes(color=type), method = "lm" ,show.legend = FALSE, se = FALSE) + geom_smooth(aes(group=1), color = "#737373", alpha = 0.6, method = "lm", show.legend=FALSE, se=FALSE) PI = ggplot(all.stats, aes(x = Species, y= log(PI, base = 10), fill= type)) + geom_point(shape = 21, alpha=0.8, size =point.size) + scale_x_discrete(all.stats$Species) + theme_pubr(base_size = basesize) + labs(fill = "Kingdom/Subkingdom", y = expression(pi)) + rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) +geom_smooth(aes(group=1), color = "#737373", alpha = 0.6,method = "lm", show.legend=FALSE, se=FALSE) ESIZE = ggplot(all.stats, aes(x = Species, y= log(ESIZE, base = 10), fill= type)) + geom_point(shape = 21, alpha=0.8, size =point.size ) + scale_x_discrete(all.stats$Species) + theme_pubr(base_size = basesize) + labs(fill = "Kingdom/Subkingdom", y = "Effective Size") + rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + geom_hline(yintercept = 6, linetype = 2, color = "#737373") + geom_hline(yintercept = 3, linetype = 2, color = "#737373") SIZE = ggplot(all.stats, aes(x = Species, y= log(SIZE, base = 10), fill= type)) + geom_point(shape = 21, alpha=0.8, size =point.size ) + scale_x_discrete(all.stats$Species) + theme_pubr(base_size = basesize) + labs(fill = "Kingdom/Subkingdom", y = "Reference Size") + rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + geom_hline(yintercept = 6, linetype = 2, color = "#737373") + geom_hline(yintercept = 3, linetype = 2, color = "#737373") ggarrange(SNV, PI, SIZE, COV, nrow = 4, heights = c(1,1,1,2), common.legend = TRUE, align = "v") #save as 16*40 #supplmentary plots #SNV vs PI ggplot(all.stats, aes(x=SNV, y=PI, color=type)) + geom_point() + geom_smooth(method = "lm") + stat_poly_eq(formula = y~x, #eq.with.lhs = "italic(y)~`=`~", aes(label = paste(..rr.label.., sep = "~~~")), parse = TRUE) + theme_pubr(base_size = 14, margin = FALSE) + stat_fit_glance(method = 'lm', method.args = list(formula = y~x), geom = 'text', aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")), label.x.npc = 'right', label.y.npc = 0.2, size = 3) + labs(x = "SNPs/kbp", y = expression(pi), color ="Domain") #COV vs SNV all ggplot(all.stats, aes(x=log(COV, base=10), y=log(SNV, base=10), color=type, group = 1)) + geom_point() + geom_smooth(aes(group=1), method = "lm") + stat_poly_eq(formula = y~x, #eq.with.lhs = "italic(y)~`=`~", aes(label = paste(..rr.label.., sep = "~~~")), parse = TRUE) + theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Coverage)", y = "log(SNPs/kbp)", color ="Domain") + stat_fit_glance(method = 'lm', method.args = list(formula = y~x), geom = 'text', aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")), label.x.npc = 'right', label.y.npc = 0.2, size = 3) #COV vs SNV faceted ggplot(all.stats, aes(x=log(COV, base=10), y=log(SNV, base=10), color=type)) + geom_point() + geom_smooth(method = "lm") + stat_poly_eq(formula = y~x, #eq.with.lhs = "italic(y)~`=`~", aes(label = paste(..rr.label.., sep = "~~~")), parse = TRUE) + theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Coverage)", y = "log(SNPs/kbp)", color ="Domain") + stat_fit_glance(method = 'lm', method.args = list(formula = y~x), geom = 'text', aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")), label.x.npc = 'right', label.y.npc = 0.2, size = 3) + facet_wrap(~type, scales="free", ncol=2) #ESIZE vs SNV ggplot(all.stats, aes(x=log(ESIZE, base=10), y=log(SNV, base=10))) + geom_point(aes(color=type)) + geom_smooth(method = "lm") + stat_poly_eq(formula = y~x, #eq.with.lhs = "italic(y)~`=`~", aes(label = paste(..rr.label.., sep = "~~~")), parse = TRUE) + theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Effective genome size)", y = "log(SNPs/kbp)", color ="Domain")+ stat_fit_glance(method = 'lm', method.args = list(formula = y~x), geom = 'text', aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")), label.x.npc = 'left', label.y.npc = 0.9, size = 4) #+ #facet_wrap(~type, scales="free", ncol=2) #SIZE vs SNV ggplot(all.stats, aes(x=log(SIZE, base=10), y=log(SNV, base=10))) + geom_point(aes(color=type)) + geom_smooth(method = "lm") + stat_poly_eq(formula = y~x, #eq.with.lhs = "italic(y)~`=`~", aes(label = paste(..rr.label.., sep = "~~~")), parse = TRUE) + theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Reference genome size)", y = "log(SNPs/kbp)", color ="Domain")+ stat_fit_glance(method = 'lm', method.args = list(formula = y~x), geom = 'text', aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")), label.x.npc = 'left', label.y.npc = 0.9, size = 4) #+ #ESIZE vs PI ggplot(all.stats, aes(x=log(ESIZE, base=10), y=log(PI))) + geom_point(aes(color=type)) + geom_smooth(method = "lm") + stat_poly_eq(formula = y~x, #eq.with.lhs = "italic(y)~`=`~", aes(label = paste(..rr.label.., sep = "~~~")), parse = TRUE) + theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Effective genome size)", y = expression(log(pi)), color ="Domain")+ stat_fit_glance(method = 'lm', method.args = list(formula = y~x), geom = 'text', aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")), label.x.npc = 'left', label.y.npc = 0.9, size = 4) #+ #ESIZE vs SIZE ggplot(all.stats, aes(x=log(ESIZE, base=10), y=log(SIZE, base=10))) + geom_point(aes(color=type)) + geom_smooth(method = "lm") + stat_poly_eq(formula = y~x, #eq.with.lhs = "italic(y)~`=`~", aes(label = paste(..rr.label.., sep = "~~~")), parse = TRUE) + theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Effective genome size)", y = "log(Reference genome size)", color ="Domain")+ stat_fit_glance(method = 'lm', method.args = list(formula = y~x), geom = 'text', aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")), label.x.npc = 'left', label.y.npc = 0.85, size = 4) #+ #coverage vs effective size ggplot(all.stats, aes(x=log(ESIZE, base=10), y=log(COV, base=10))) + geom_point(aes(color=type)) + geom_smooth(method = "lm") + stat_poly_eq(formula = y~x, #eq.with.lhs = "italic(y)~`=`~", aes(label = paste(..rr.label.., sep = "~~~")), parse = TRUE) + theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Effective genome size)", y = "log(Coverage)", color ="Domain")+ stat_fit_glance(method = 'lm', method.args = list(formula = y~x), geom = 'text', aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")), label.x.npc = 'left', label.y.npc = 0.85, size = 4) #+ #coverage vs reference size looks like reference size correlates better than effective genome size ggplot(all.stats, aes(x=log(SIZE, base=10), y=log(COV, base=10))) + geom_point(aes(color=type)) + geom_smooth(method = "lm") + stat_poly_eq(formula = y~x, #eq.with.lhs = "italic(y)~`=`~", aes(label = paste(..rr.label.., sep = "~~~")), parse = TRUE) + theme_pubr(base_size = 14, margin = FALSE) + labs(x = "log(Reference genome size)", y = "log(Coverage)", color ="Domain")+ stat_fit_glance(method = 'lm', method.args = list(formula = y~x), geom = 'text', aes(label = paste("P-value = ", signif(..p.value.., digits = 4), sep = "")), label.x.npc = 'left', label.y.npc = 0.85, size = 4) #+ #REGRESSION TEST model1 = lm(log(SNV) ~ log(SIZE) + log(COV), data = all.stats) model2 = lm(log(SNV) ~ log(COV) + log(SIZE), data = all.stats) #bacteria-plasmids bac.stats = all.stats[all.stats$type == "Bacteria", ] bac.stats.special = bac.stats[grep("Klebsiella pneumoniae|Nostoc punctiforme|Salmonella enterica|Staphylococcus haemolyticus", bac.stats$Species),] spec.order = c(7,1,8,10,12,15,11, 16,14,13, 3,4, 9, 2, 6,5) bac.stats.special = bac.stats.special[spec.order,] bac.stats.special$genus = c(rep("Klebsiella", 6), rep("Staphylococcus", 4), rep("Salmonella", 2), rep("Nostoc", 4)) bac.stats.special$form = c("genome", rep("plasmid", 5), "genome", rep("plasmid", 3), "genome", "plasmid", "genome", rep("plasmid", 3)) bac.stats.special$Species = factor(bac.stats.special$Species, levels=c(as.character(bac.stats.special$Species))) point.size = 9 #6 was fine COV = ggplot(bac.stats.special, aes(x = Species, y= log(COV, base = 10), fill= genus, shape = form)) + geom_point( alpha=0.8, size =point.size) + scale_x_discrete(all.stats$Species) + theme_pubr(base_size = 28, x.text.angle = 60) + labs(fill = "Strain", y = "Coverage") + theme(axis.text.x = element_text(size = 10)) + scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + scale_shape_manual(values=c(21,23)) SNV = ggplot(bac.stats.special, aes(x = Species, y= log(SNV, base = 10), fill= genus, shape = form)) + geom_point(alpha=0.8, size =point.size) + theme_pubr(base_size = 28) + labs(fill = "Strain", y = "SNP/Kbp") + rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + scale_shape_manual(values=c(21,23))#+ geom_hline(yintercept = 2, linetype = 2, color = "#737373") + geom_hline(yintercept = 1, linetype = 2, color = "#737373") PI = ggplot(bac.stats.special, aes(x = Species, y= log(PI, base = 10), fill= genus, shape = form)) + geom_point( alpha=0.8, size =point.size) + theme_pubr(base_size = 28) + labs(fill = "Strain", y = expression(pi)) + rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + scale_shape_manual(values=c(21,23)) ESIZE = ggplot(bac.stats.special, aes(x = Species, y= log(ESIZE, base = 10), fill= genus, shape = form)) + geom_point(alpha=0.8, size =point.size ) + theme_pubr(base_size = 28) + labs(fill = "Kingdom/Subkingdom", y = "Effective Size") + rremove("x.axis") + rremove("xlab") + rremove("x.text")+ scale_fill_manual(values = c("#F8766D", "#B79F00", "#C77CFF", "#F564E3")) + scale_shape_manual(values=c(21,23)) ggarrange(SNV, PI, ESIZE, COV, nrow = 4, heights = c(1,1,1,2), common.legend = TRUE, align = "v") bac.fit = lm(SNV ~ COV + ESIZE, data = bac.stats.special) ``` #generating color strips annotation for iTOL tree ```{r color strips annotation for iTOL tree} virus.host = read.csv("~/Documents/Bioinfo/analysis/For_Chao_Virus-Host copy.csv", header = TRUE, stringsAsFactors = FALSE) viruses.id = read.table("~/Documents/Bioinfo/analysis/viruses.id.txt", col.names = "NAME", stringsAsFactors = FALSE) manual.check = viruses.id$NAME[!viruses.id$NAME %in% virus.host$Name] # c("Bacteria", "Fungi", "Viridiplantae", "Metazoa","Alveolata", "Archaea") taxoncolor = c("#F8766D", "#B79F00", "#00BA38", "#00BFC4", "#619CFF", "#F564E3") #Arthropods 0065C4 #Helminth B5F000 #other eukaryotes 8E00EB #Protoza EB00E4 #Homo sapiens EB272A virus.host$color = virus.host$Kingdom #getting custom colors temp.colors = brewer.pal(n = 9, name = "Set1") temp.colors[c(2,4,9,7,6,3)] = taxoncolor virus.host$color = mapvalues(virus.host$color, from = sort(unique(virus.host$Kingdom)), to = temp.colors) output.virus.df = data.frame(name=virus.host$Name, color=virus.host$color, color2=paste0("COL",virus.host$color)) #write.table(output.virus.df, "~/Documents/Bioinfo/analysis/virus.colors.txt", row.names = FALSE, quote = FALSE, sep = " ") #checking colors and labels ggplot(data.frame(a=1:9, b=1:9, color=temp.colors, label=sort(unique(virus.host$Kingdom))), aes(x=a, y=b, color=label, label=label)) + geom_point(size =8) + scale_color_manual(values=temp.colors) ``` ##generating iTOL tree with relative abundance at species level ```{r all species tree} #assuming data at species level is loaded write.table(x=row.names(master.asinh.cpm.single)[rowSums(master.asinh.cpm.single)>0], file="~/Documents/Bioinfo/analysis/allspecies.txt", row.names=FALSE, quote = FALSE) allspecies.tax = read.table("~/Documents/Bioinfo/analysis/allspecies.tax2.txt", sep="|", header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE) for (i in 1:nrow(allspecies.tax)){ #replace all species names with preferred names so phytol can be used if (allspecies.tax$preferred.name[i] != ""){ print(allspecies.tax$preferred.name[i]) allspecies.tax$name[i] = allspecies.tax$preferred.name[i] }} #write.table(allspecies.tax$name, "~/Documents/Bioinfo/analysis/allspecies.tax.fix.txt", quote = FALSE, row.names = FALSE, col.names = FALSE) #setting up asinh.cpm counts select = rownames(master.asinh.cpm.single)[rowSums(master.asinh.cpm.single)>0] relative.counts = rowSums(master.asinh.cpm.single)[select[-1]] #ureplace space with underscore to be consistent with phytol output. newnames<-gsub(" ", "_", allspecies.tax$name, fixed=TRUE) relative.counts.df= data.frame(name = newnames, abundance = relative.counts) relative.counts.df$name[2492] = "Ostreococcus_'lucimarinus'" #write.table(relative.counts.df, "~/Documents/Bioinfo/analysis/allspecies.relativecounts.txt", row.names = FALSE, quote = FALSE, sep = " ") #total abundance plots for fun ggplot(relative.counts.df, aes(x=abundance)) + geom_density() + labs(x="Normalized CPM") + theme_pubr() ``` # Getting new interactions from database global interactions ```{r interactions extraction} library(rglobi) #assuming species name here, genus would work too output.df = data.frame(source = character(), target = character(), interactiontype = character(), sourcetaxon= character(), targettaxon=character()) for (name in c("Homo sapiens", row.names(master.cpm.single))){ #print(name) temp.result = get_interactions_by_taxa(sourcetaxon = name) output.df = rbind.data.frame(output.df, data.frame(source = temp.result$source_taxon_name, target = temp.result$target_taxon_name, interactiontype = temp.result$interaction_type, sourcetaxon=temp.result$source_taxon_path, targettaxon = temp.result$target_taxon_path)) output.df = output.df[!duplicated(output.df),] } write.table(output.df, "~/Documents/bioinfo/localanalysis/interactions.all.csv", row.names = FALSE, col.names = TRUE, quote=FALSE, sep =",") output.match.df = output.df[output.df$source %in% c("Homo sapiens", row.names(master.cpm.single)) & output.df$target %in% c("Homo sapiens", row.names(master.cpm.single)), ] write.table(output.match.df, "~/Documents/bioinfo/localanalysis/interactions.match.csv", row.names = FALSE, col.names = TRUE, quote=FALSE, sep =",") length(unique(output.df$source)) #some interactions are just called interactswith, which is not good enough #limit to cases where source is like "cargo", while target is like "carrier" for integration output.match.defined.df = output.match.df[output.match.df$interactiontype != "interactsWith" & output.match.df$interactiontype %in% c("hasHost", "parasiteOf", "pathogenOf", "symbiontOf", "pollinates"),] write.table(output.match.defined.df, "~/Documents/bioinfo/localanalysis/interactions.match.defined.csv", row.names = FALSE, quote=FALSE, sep=",") output.match.defined.df$sourceclass = output.match.defined.df$sourcetaxon output.match.defined.df$targetclass = output.match.defined.df$targettaxon taxonfinder = function(v){ name.list = c() for (name in v){ if (grepl("Fungi|Ascomycota|Basidiomycota|Stereum|Laetiporus|Phoma herbarum", name)){ tempclass = "Fungi" } else if (grepl("Plantae", ignore.case = TRUE,name)){ tempclass = "Higher Plants" } else if (grepl("Bacteria|Bacillus|Vibrio",ignore.case = TRUE, name)){ tempclass = "Bacteria" } else if (grepl("Arthropoda",ignore.case = TRUE, name)){ tempclass = "Arthropods" } else if (grepl("Nematoda",ignore.case = TRUE, name)){ tempclass = "Helminth" } else if (grepl("Homo sapiens",ignore.case = TRUE,name)){ tempclass = "Homo sapiens" } else if (grepl("Chordata|Mollusca", ignore.case = TRUE,name)){ #print("TRUE") tempclass = "Animals" } else if (grepl("Protozoa",ignore.case = TRUE, name)){ tempclass = "Protozoa" } else if (grepl("Oomycota|Alveolata|Leishmania|Schistosoma",ignore.case = TRUE, name)){ tempclass = "Other eukaryotes" } else { tempclass = name} name.list = c(name.list, tempclass) } return(name.list) } sourceclass = taxonfinder(output.match.defined.df$sourceclass) table(sourceclass) targetclass = taxonfinder(output.match.defined.df$targetclass) table(targetclass) output.match.defined.df$sourceclass = sourceclass output.match.defined.df$targetclass =targetclass #reoder output.final.df = output.match.defined.df[,c(1,6,2,7,3)] #filter things that have the same target and source output.final.df = output.final.df[!output.final.df$source == output.final.df$target,] #fix werid interations output.final.df[output.final.df$source == "Pinus taeda", c(1,2,3,4)] = output.final.df[output.final.df$source == "Pinus taeda", c(3,4,1,2)] output.final.df[output.final.df$source == "Pinus lambertiana", c(1,2,3,4)] = output.final.df[output.final.df$source == "Pinus lambertiana", c(3,4,1,2)] output.final.df[output.final.df$source == "Pseudotsuga menziesii", c(1,2,3,4)] = output.final.df[output.final.df$source == "Pseudotsuga menziesii", c(3,4,1,2)] output.match.df$interactiontype %in% c("hasHost", "parasiteOf", "pathogenOf", "symbiontOf", "pollinates"),] output.final.df = output.final.df[!duplicated(output.final.df),] #this is the maxmial number of edges(nodes) one could have from total datasets, so for other situations, one only need to filter through this one write.csv(output.final.df, "~/Documents/bioinfo/localanalysis/interactions.edges.csv", row.names = FALSE, quote=FALSE) #getting nodes nodes.names = rbind.data.frame(data.frame(id = output.final.df$source, class = output.final.df$sourceclass), data.frame(id = output.final.df$target, class = output.final.df$targetclass)) nodes.final.names = nodes.names[!duplicated(nodes.names),] write.csv(nodes.final.names, "~/Documents/bioinfo/localanalysis/interactions.nodes.csv", row.names = FALSE, quote=FALSE) write.table(output.match.defined.df, "~/Documents/bioinfo/localanalysis/interactions.defined.csv", row.names = FALSE, col.names = TRUE, quote=FALSE, sep =",") #integrate the cargo-carrier network cc.df = read.table("~/Documents/Bioinfo/species-location-interaction_data/SpeciesInteractions_EID2.txt", header = TRUE, stringsAsFactors = FALSE, sep="\t") #this filters interactions that both counterparts can be FOUND in the list of names, which is all species cc.match.df = cc.df[cc.df$Cargo %in% tolower(c("Homo sapiens", row.names(master.cpm.single))) & cc.df$Carrier %in% tolower(c("Homo sapiens", row.names(master.cpm.single))),] #this filter remove edges that are already in the global interaction databases, turns out only a few interactions are left, which means the global interactions are a superior database cc.final.df = cc.match.df[!((cc.match.df$Cargo %in% tolower(output.final.df$source) & cc.match.df$Carrier %in% tolower(output.final.df$target)) | (cc.match.df$Carrier %in% tolower(output.final.df$source) & cc.match.df$Cargo %in% tolower(output.final.df$target))),] write.csv(cc.final.df[,c(1,2,3,4)], "~/Documents/Bioinfo/localanalysis/supplementary.edges.csv", quote=FALSE, row.names=FALSE) #two results from two databases were integrated to generate the newinteractins.finaldb.csv file!! ``` ## Getting nodes and edges file from any profile using new database. ```{r new exposome cloud} #generated from previous chunck new.db = read.csv("~/Documents/Bioinfo/localanalysis/newinteractions.finaldb.csv", header = TRUE, stringsAsFactors = FALSE) #This funciton ASSUMES master.cpm.single is generated at SPECIES level!!!!, the database is only compatible with species level names getcloud = function(profile, prefix="interactions"){ #profile = scores_single_complete temp.df = master.cpm.single[, profile$samplenames] names = c("Homo sapiens", row.names(temp.df)[rowSums(temp.df)>10]) filtered.df = new.db[new.db$source %in% names & new.db$target %in% names,] #write.csv(filtered.df[, c(1,2,3,4)],file=paste0("~/Documents/Bioinfo/localanalysis/", prefix, "-edges.csv"), quote=FALSE, row.names = FALSE) print(paste0(nrow(filtered.df), " edges")) #build nodes table names.df = rbind.data.frame(data.frame(id = filtered.df$source, label = filtered.df$source, class=filtered.df$sourceclass), data.frame(id = filtered.df$target, label = filtered.df$target, class = filtered.df$targetclass)) names.df = names.df[!duplicated(names.df),] #write.csv(names.df, file=paste0("~/Documents/Bioinfo/localanalysis/", prefix, "-nodes.csv"), quote=FALSE, row.names=FALSE) print(paste0(nrow(names.df), " nodes")) #getting human and environment related names human_related.df = subset(filtered.df, sourceclass == "Homo sapiens" | targetclass == "Homo sapiens") human_animal_related.df = subset(filtered.df, sourceclass == "Homo sapiens" | targetclass == "Homo sapiens"| sourceclass == "Animals" | targetclass == "Animals") #lumping human and animal together human_related.names = unique(human_related.df$source) human_animal_related.names = unique(ifelse(human_animal_related.df$targetclass == "Homo sapiens" | human_animal_related.df$targetclass == "Animals", human_animal_related.df$source, human_animal_related$target)) plant_related.df = subset(filtered.df, sourceclass == "Higher Plants" & targetclass != "Higher Plants" | targetclass == "Higher Plants" & sourceclass != "Higher Plants") #lumpin plants and arthropods plant_arthro_related.df = subset(filtered.df, sourceclass == "Higher Plants" & targetclass != "Higher Plants" | targetclass == "Higher Plants" & sourceclass != "Higher Plants" | targetclass == "Arthropods") plant_related.names = unique(ifelse(plant_related.df$targetclass == "Higher Plants", plant_related.df$source, plant_related.df$target)) plant_arthro_related.names = unique(ifelse(plant_arthro_related.df$targetclass == "Higher Plants" | plant_arthro_related.df$targetclass == "Arthropods", plant_arthro_related.df$source, plant_related.df$target)) #core versions of dedicated speciesd names human_only = human_related.names[!(human_related.names %in% plant_related.names)] plant_only = plant_related.names[!(plant_related.names %in% human_related.names)] human_intersect_plant = plant_related.names[plant_related.names %in% human_related.names] #expanded versions of dedicated species names human_animal_only = human_animal_related.names[!(human_animal_related.names %in% plant_arthro_related.names)] plant_arthro_only = plant_arthro_related.names[!(plant_arthro_related.names %in% human_animal_related.names)] human_animal_intersect_plant_arthro = human_animal_related.names[human_animal_related.names %in% plant_arthro_related.names] return(list(plant_related = plant_related.names, plant_arthro_related =plant_arthro_related.names, human_related = human_related.names, human_animal_related = human_animal_related.names, plant_only = plant_only, human_only = human_only, plant_arthro_only = plant_arthro_only, human_animal_only = human_animal_only, human_intersect_plant = human_intersect_plant, human_animal_intersect_plant_arthro = human_animal_intersect_plant_arthro)) } #all samples allsamples.names = getcloud(scores_single_complete, "allsamples") #mike OR P1 P1.names = getcloud(Mike_single, "Mikeall") #P2 P2.names = getcloud(Patient1, "P2") #P3 P3.names = getcloud(Patient3, "P3") #fourpeople getcloud(fourpeople.select[fourpeople.select$ownership == "Mike",], "mike_fourpeople") getcloud(fourpeople.select[fourpeople.select$ownership == "Gw",], "gw") getcloud(fourpeople.select[fourpeople.select$ownership == "Guan_SF",], "sf") getcloud(fourpeople.select[fourpeople.select$ownership == "No3_San Mateo",], "p3_fourpeople") getname = function(profile){ temp.df = master.cpm.single[, profile$samplenames] names = row.names(temp.df)[rowSums(temp.df)>10] print(head(rowSums(temp.df[names,]), 20)) return(names) } mike.names = getname(fourpeople.select[fourpeople.select$ownership == "Mike",]) gw.names = getname(fourpeople.select[fourpeople.select$ownership == "Gw",]) guan.names = getname(fourpeople.select[fourpeople.select$ownership == "Guan_SF",]) no3.names = getname(fourpeople.select[fourpeople.select$ownership == "No3_San Mateo",]) #===================examing variances #filtering master.asinh.cpm.single for features master.asinh.cpm.single.forvar = master.asinh.cpm.single[apply(master.asinh.cpm.single, 1, function(x){sum(x>0) >= 50}), ] master.asinh.cpm.season.forvar = cbind.data.frame(t(master.asinh.cpm.single.forvar[-1,]), season=scores_single_complete$season) df.season.forvar.fin = t(master.asinh.cpm.single[apply(master.asinh.cpm.single, 1, function(x){sum(x>0) >= 50}), ]) df.season.forvar.fin = df.season.forvar.fin[, colSums(df.season.forvar.fin)>0] human_var = apply(df.season.forvar.fin[, allsamples.names$human_only[allsamples.names$human_only %in% colnames(df.season.forvar.fin)]], 2, function(x){var(x[x>=0])}) #these fungi are not really human associated human_var = human_var[!grepl("Aspergillus|Cladosporium|Penicillium", names(human_var))] names(human_var) l1 = length(human_var) plant_var = apply(df.season.forvar.fin[, allsamples.names$plant_only[allsamples.names$plant_only %in% colnames(df.season.forvar.fin)]], 2, function(x){var(x[x>=0])}) l2 = length(plant_var) human_intersect_plant_var = apply(df.season.forvar.fin[, allsamples.names$human_intersect_plant[allsamples.names$human_intersect_plant %in% colnames(df.season.forvar.fin)]], 2, function(x){var(x[x>=0])}) l3 = length(human_intersect_plant_var) var.df = data.frame(variance = c(human_var,plant_var, human_intersect_plant_var ), group = c(rep("Human", l1), rep("Plant", l2), rep("Plant/Human", l3))) comparisons = list(c("Human", "Plant"), c("Plant", "Plant/Human"), c("Human", "Plant/Human")) var.df$grouop = factor(var.df$group, levels = c("Plant", "Plant/Human", "Human")) ggplot(var.df, aes(x=group, y=log(variance), fill=group)) + geom_boxplot() + stat_compare_means(method = "wilcox.test", comparisons = comparisons) + theme_pubr(base_size = 16) + scale_fill_manual(values=c("Plant" = "#20854EFF", "Plant/Human" = "#E18727FF", "Human" = "#BC3C29FF")) + labs(x="") #, label = "p.signif" #0.0380 0.0285 0.2600 ggplot(var.df, aes(x=group, y=log(variance), fill=group)) + geom_boxplot() + stat_compare_means(method = "wilcox.test", comparisons = comparisons, label = "p.signif") + theme_pubr(base_size = 16) + scale_fill_manual(values=c("Plant" = "#20854EFF", "Plant/Human" = "#E18727FF", "Human" = "#BC3C29FF")) + labs(x="") #===================examing variances for broad groups human_animals_var = apply(df.season.forvar.fin[, allsamples.names$human_animal_only[allsamples.names$human_animal_only %in% colnames(df.season.forvar.fin)]], 2, function(x){var(x[x>=0], na.rm=TRUE)}) human_animals_var = human_animals_var[!grepl("Aspergillus|Cladosporium|Penicillium", names(human_animals_var))] names(human_animals_var) ll1 = length(human_animals_var) plant_arthro_var = apply(df.season.forvar.fin[,allsamples.names$plant_arthro_only[allsamples.names$plant_arthro_only %in% colnames(df.season.forvar.fin)]], 2, function(x){var(x[x>=0], na.rm=TRUE)}) names(plant_arthro_var) ll2 = length(plant_arthro_var) human_animals_inter_plant_arthro_var = apply(df.season.forvar.fin[, allsamples.names$human_animal_intersect_plant_arthro[allsamples.names$human_animal_intersect_plant_arthro %in% colnames(df.season.forvar.fin)]], 2, function(x){var(x[x>=0], na.rm=TRUE)}) names(human_animals_inter_plant_arthro_var) ll3 = length(human_animals_inter_plant_arthro_var) var2.df = data.frame(variance = c(human_animals_var, plant_arthro_var, human_animals_inter_plant_arthro_var), group = c(rep("Human/Animals", length(human_animals_var)), rep("Plant/Arthropods", length(plant_arthro_var)), rep("Intersection", length(human_animals_inter_plant_arthro_var)))) var2.df$group = factor(var2.df$group, levels = c("Plant/Arthropods", "Intersection", "Human/Animals")) #order of box-plots comparisons = list(c("Human/Animals", "Plant/Arthropods"), c("Plant/Arthropods", "Intersection"), c("Human/Animals", "Intersection")) ggplot(var2.df, aes(x=group, y=log(variance), fill=group)) + geom_boxplot() + stat_compare_means(method = "wilcox.test", comparisons = comparisons) + theme_pubr(base_size = 16, x.text.angle = 30) + scale_fill_manual(values=c("#20854EFF", "#E18727FF", "#BC3C29FF")) + labs(x="") + guides(group=FALSE) #0.0260 0.0219 0.3500 #, label = "p.signif" ggplot(var2.df, aes(x=group, y=log(variance), fill=group)) + geom_boxplot() + stat_compare_means(label.y = 0) + theme_pubr(base_size = 20, x.text.angle = 30) + scale_fill_manual(values=c("#20854EFF", "#E18727FF", "#BC3C29FF")) + labs(x="") + guides(group=FALSE) + rremove(object = "x.text") p.adjust(c(0.0073, 0.013, 0.35), method = "fdr") #ggplot(var2.df, aes(x=group, y=log(variance), fill=group)) + geom_violin() + geom_boxplot(width = 0.3, fill = "white") + theme_pubr(base_size = 20, x.text.angle = 30) + scale_fill_manual(values=c("#20854EFF", "#E18727FF", "#BC3C29FF")) + labs(x="") + guides(group=FALSE) ``` #contig identit alignemnt etc. plots ```{r contigs} bacteria.contigs = read.csv("~/Documents/Expo_paper/bacteria.identity.result.csv", header = TRUE) animal.contigs = read.csv("~/Documents/Expo_paper/animals.result.csv", header = TRUE) molds.contigs = read.csv("~/Documents/Expo_paper/molds.result.csv", header = TRUE) #bacteria #identity p = ggplot(bacteria.contigs, aes(x = strain, y=identity, color=strain, alpha = alignment)) + geom_boxplot() + theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 90, linetype = 2, color = "#737373") + geom_hline(yintercept = 95, linetype = 2, color = "#737373") + labs(y = "Percentage Identity") #bit score q = ggplot(bacteria.contigs, aes(x = strain, y=log(bitscore, base =10), color=strain, alpha = alignment)) + geom_boxplot() + theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 2, linetype = 2, color = "#737373") + labs(y = "log(bitscore, base=10)") #alignment r = ggplot(animal.contigs, aes(x = strain, y=alignment, color=strain, alpha = alignment)) + geom_boxplot() + theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 0.5, linetype = 2, color = "#737373") + labs(y = "Alignment percentage") #animals #identity #align x axis to exposure plot animal.contigs$strain = factor(animal.contigs$strain, levels=c(gsub(" ", "_", claim.list, fixed=TRUE))) p = ggplot(animal.contigs, aes(x = strain, y=identity, color=strain, alpha = alignment)) + geom_boxplot() + theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 90, linetype = 2, color = "#737373")+ labs(y = "Percentage Identity") + rremove("xlab") #bit score q = ggplot(animal.contigs, aes(x = strain, y=log(bitscore, base =10), color=strain, alpha = alignment)) + geom_boxplot() + theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 2, linetype = 2, color = "#737373") + labs(y = "log(bitscore, base=10)")+ rremove("xlab") #alignment r = ggplot(animal.contigs, aes(x = strain, y=alignment, color=strain, alpha = alignment)) + geom_boxplot() + theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 0.5, linetype = 2, color = "#737373") + labs(y = "Alignment percentage")+ rremove("xlab") #molds p = ggplot(molds.contigs, aes(x = strain, y=identity, color=strain, alpha = alignment)) + geom_boxplot() + theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 90, linetype = 2, color = "#737373")+ labs(y = "Percentage Identity") + rremove("xlab") #bit score q = ggplot(molds.contigs, aes(x = strain, y=log(bitscore, base =10), color=strain, alpha = alignment)) + geom_boxplot() + theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 2, linetype = 2, color = "#737373") + labs(y = "log(bitscore, base=10)")+ rremove("xlab") #alignment r = ggplot(molds.contigs, aes(x = strain, y=alignment, color=strain, alpha = alignment)) + geom_boxplot() + theme_pubr(base_size = 14, x.text.angle = 60) + scale_color_d3(palette = "category20") + geom_hline(yintercept = 0.5, linetype = 2, color = "#737373") + labs(y = "Alignment percentage")+ rremove("xlab") ``` # word cloud output ```{r wordcloud} library(scales) master.cpm.single =master.cpm.single[order(rowSums(master.cpm.single), decreasing=TRUE),] values = log(head(rowSums(master.cpm.single),100)) values = round(scales::rescale(values, to = c(2,12))) write.table(data.frame(value = values, name = head(row.names(master.cpm.single), 100)), "~/Documents/Bioinfo/names.table.txt", quote=FALSE, row.names = FALSE) ``` #area plot testing ```{r areaplot} #create data set.seed(3) #time steps t.step<-seq(0,20) #group names grps<-letters[1:10] #random data for group values across time grp.dat<-runif(length(t.step)*length(grps),5,15) #create data frame for use with plot grp.dat<-matrix(grp.dat,nrow=length(t.step),ncol=length(grps)) grp.dat<-data.frame(grp.dat,row.names=t.step) names(grp.dat)<-grps p.dat<-data.frame(step=row.names(grp.dat),grp.dat,stringsAsFactors=F) p.dat<-melt(p.dat,id='step') p.dat$step<-as.numeric(p.dat$step) p<-ggplot(p.dat,aes(x=step,y=value)) p1<-p + geom_area(aes(fill=variable)) + theme(legend.position="bottom") p2<-p + geom_area(aes(fill=variable),position='fill') ``` # graph-based approach test ```{r graph-based method} library("igraph") library("phyloseq") library("phyloseqGraphTest") library("ggnetwork") library("intergraph") ps = readRDS("~/Documents/statscourse/data/ps1.rds") net = make_network(ps, max.dist=0.35) sampledata = data.frame(sample_data(ps)) V(net)$id = sampledata[names(V(net)), "host_subject_id"] V(net)$litter = sampledata[names(V(net)), "family_relationship"] ggplot(net, aes(x=x, y=y, xend=xend, yend=yend), layout = "fruchtermanreingold") + geom_edges(color = "darkgrey") + geom_nodes(aes(color=id, shape=litter)) + theme_pubr(legend = "right") + theme(axis.text = element_blank(), axis.title = element_blank(), legend.key.height =unit(0.6, "line")) + guides(col = guide_legend(override.aes = list(size =0.25))) #MST and Jaccard gt = graph_perm_test(ps, sampletype = "family_relationship", grouping = "host_subject_id", distance = "jaccard", type = "mst", nperm = 200) gt$pval plot_test_network(gt) #+ theme(legend.text = element_text(size=8), #legend.title = element_text(size=9)) plot_permutations(gt) #MST and bray gt = graph_perm_test(ps, sampletype = "family_relationship", grouping = "host_subject_id", distance = "bray", type = "mst", nperm = 200) gt$pval plot_test_network(gt) #+ theme(legend.text = element_text(size=8), #legend.title = element_text(size=9)) plot_permutations(gt) #NNJ and bray gt = graph_perm_test(ps, sampletype = "family_relationship", grouping = "host_subject_id", distance = "bray", type = "knn", knn = 2, nperm = 200) gt$pval plot_test_network(gt) #+ theme(legend.text = element_text(size=8), #legend.title = element_text(size=9)) plot_permutations(gt) ``` ##graph-based application on exposome data ```{r graph-based application on exposome data} library("igraph") library("phyloseq") library("phyloseqGraphTest") library("ggnetwork") library("intergraph") #set up otu matrix otumat = master.asinh.cpm.single #random taxon matrix for now. will update to real one later taxmat = matrix(sample(letters, 7*nrow(otumat), replace = TRUE), nrow = nrow(otumat), ncol = 7) rownames(taxmat) <- rownames(otumat) colnames(taxmat) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species") taxmat #set up sampledata scores sampledata = scores_single_complete sampledata$country[is.na(sampledata$country)] = "USA" row.names(sampledata) = scores_single_complete$samplenames sampledata = sample_data(sampledata) ps.expo = phyloseq(otu_table(otumat, taxa_are_rows =TRUE), tax_table(taxmat), sampledata) if(FALSE){ net = make_network(ps.expo, max.dist=0.7) sampledata = data.frame(sample_data(ps.expo)) V(net)$id = sampledata[names(V(net)), "aownership"] V(net)$geo = sampledata[names(V(net)), "geo2"] ggplot(net, aes(x=x, y=y, xend=xend, yend=yend), layout = "fruchtermanreingold") + geom_edges(color = "darkgrey") + geom_nodes(aes(color=id, shape=geo)) + theme_pubr(legend = "right") + theme(axis.text = element_blank(), axis.title = element_blank(), legend.key.height =unit(0.6, "line")) + guides(col = guide_legend(override.aes = list(size =0.25))) } #owner based gt1 = graph_perm_test(ps.expo, "aownership", distance = "jaccard", type = "mst", nperm = 500) gt1$pval plot_test_network(gt1) plot_permutations(gt1) gt2 = graph_perm_test(ps.expo, "aownership", distance = "bray", type = "mst" , nperm = 500) gt2$pval plot_test_network(gt2) plot_permutations(gt2) gt3 = graph_perm_test(ps.expo, "aownership", distance = "bray", type = "knn", nperm = 500) gt3$pval plot_test_network(gt3) plot_permutations(gt3) gt3 = graph_perm_test(ps.expo, "aownership", distance = "bray", type = "knn", nperm = 500, knn = 2) gt3$pval plot_test_network(gt3) plot_permutations(gt3) #season based gt1 = graph_perm_test(ps.expo, "season", distance = "jaccard", type = "mst", nperm = 500) gt1$pval plot_test_network(gt1) plot_permutations(gt1) gt2 = graph_perm_test(ps.expo, "season", distance = "bray", type = "mst" , nperm = 9999) gt2$pval plot_test_network(gt2) + theme(legend.text=element_text(size=14)) + scale_color_manual(values=c("#E64B35FF","#00A087FF","#4DBBD5FF","#3C5488FF")) plot_permutations(gt2) + theme_pubr(base_size = 16) gt3 = graph_perm_test(ps.expo, "season", distance = "bray", type = "knn", nperm = 500) gt3$pval plot_test_network(gt3) plot_permutations(gt3) gt3 = graph_perm_test(ps.expo, "season", distance = "bray", type = "knn", nperm = 500, knn = 2) gt3$pval plot_test_network(gt3) plot_permutations(gt3) #location based gt4 = graph_perm_test(ps.expo, "geo2", distance = "jaccard", type = "mst", nperm = 500) gt4$pval plot_test_network(gt4) plot_permutations(gt4) gt4 = graph_perm_test(ps.expo, "geo2", distance = "bray", type = "mst" , nperm = 500) gt4$pval plot_test_network(gt4) plot_permutations(gt4) gt5 = graph_perm_test(ps.expo, "geo2", distance = "bray", type = "knn", nperm = 500) gt5$pval plot_test_network(gt5) plot_permutations(gt5) gt5 = graph_perm_test(ps.expo, "geo2", distance = "bray", type = "knn", nperm = 500, knn = 2) gt5$pval plot_test_network(gt5) plot_permutations(gt5) #location based geo3 gt5 = graph_perm_test(ps.expo, "geo3", distance = "bray", type = "mst", nperm = 500) gt5$pval plot_test_network(gt5) plot_permutations(gt5) gt5 = graph_perm_test(ps.expo, "geo3", distance = "bray", type = "knn", nperm = 500, knn = 2) gt5$pval plot_test_network(gt5) plot_permutations(gt5) #locaiton based country gt5 = graph_perm_test(ps.expo, "country", distance = "bray", type = "mst", nperm = 500) gt5$pval plot_test_network(gt5) plot_permutations(gt5) gt5 = graph_perm_test(ps.expo, "country", distance = "bray", type = "knn", nperm = 500, knn = 2) gt5$pval plot_test_network(gt5) plot_permutations(gt5) gt5 = graph_perm_test(ps.expo, "country", distance = "bray", type = "knn", nperm = 500, knn = 1) gt5$pval plot_test_network(gt5) plot_permutations(gt5) graph_comp = function(profile, nperm=500, basesize = 20){ profile = scores_single_complete otumat = master.asinh.cpm.single[, c(profile$samplenames)] #random taxon matrix for now. will update to real one later taxmat = matrix(sample(letters, 7*nrow(otumat), replace = TRUE), nrow = nrow(otumat), ncol = 7) rownames(taxmat) <- rownames(otumat) colnames(taxmat) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species") taxmat #set up sampledata sampledata = profile sampledata$country[is.na(sampledata$country)] = "USA" row.names(sampledata) = profile$samplenames sampledata = sample_data(sampledata) ps.expo = phyloseq(otu_table(otumat, taxa_are_rows =TRUE), tax_table(taxmat), sampledata) #owner based gt = graph_perm_test(ps.expo, "aownership", distance = "jaccard", type = "mst", nperm = nperm) print(gt$pval) owner_ja_mst = plot_test_network(gt) + theme(legend.text=element_text(size=basesize)) print(owner_ja_mst) owner_ja_mst_permu = plot_permutations(gt) + ggtitle(paste0("p value = ", signif( gt$pval, digits = 4))) + theme_pubr(base_size = basesize) print(owner_ja_mst_permu) gt2 = graph_perm_test(ps.expo, "aownership", distance = "bray", type = "mst" , nperm = nperm) print(gt2$pval) owner_bray_mst = plot_test_network(gt2) + theme(legend.text=element_text(size=basesize)) print(owner_bray_mst) owner_bray_mst_permu = plot_permutations(gt2) + ggtitle(paste0("p value = ", signif( gt2$pval, digits = 4)))+ theme_pubr(base_size = basesize) print(owner_bray_mst_permu) gt3 = graph_perm_test(ps.expo, "aownership", distance = "bray", type = "knn", nperm = nperm) print(gt3$pval) owner_bray_knn = plot_test_network(gt3) + theme(legend.text=element_text(size=basesize)) print(owner_bray_knn) owner_bray_knn_permu = plot_permutations(gt3) + ggtitle(paste0("p value = ", signif( gt3$pval, digits = 4)))+ theme_pubr(base_size = basesize) print(owner_bray_knn_permu) #season based gt1 = graph_perm_test(ps.expo, "season", distance = "jaccard", type = "mst", nperm = nperm) print(gt1$pval) season_jac_mst = plot_test_network(gt1) + theme(legend.text=element_text(size=basesize)) + scale_color_manual(values=c("#E64B35FF","#00A087FF","#4DBBD5FF","#3C5488FF")) print(season_jac_mst) season_jac_mst_permu = plot_permutations(gt1) + ggtitle(paste0("p value = ", signif( gt1$pval, digits = 4)))+ theme_pubr(base_size = basesize) print(season_jac_mst_permu) gt2 = graph_perm_test(ps.expo, "season", distance = "bray", type = "mst" , nperm = nperm) print(gt2$pval) season_bray_mst = plot_test_network(gt2) + theme(legend.text=element_text(size=basesize)) + scale_color_manual(values=c("#E64B35FF","#00A087FF","#4DBBD5FF","#3C5488FF")) print(season_bray_mst) season_bray_mst_permu = plot_permutations(gt2)+ ggtitle(paste0("p value = ", signif( gt2$pval, digits = 4)))+ theme_pubr(base_size = basesize) print(season_bray_mst_permu) gt3 = graph_perm_test(ps.expo, "season", distance = "bray", type = "knn", nperm = nperm) print(gt3$pval) season_bray_knn = plot_test_network(gt3) + theme(legend.text=element_text(size=basesize)) + scale_color_manual(values=c("#E64B35FF","#00A087FF","#4DBBD5FF","#3C5488FF")) print(season_bray_knn) season_bray_knn_permu = plot_permutations(gt3)+ ggtitle(paste0("p value = ", signif( gt3$pval, digits = 4)))+ theme_pubr(base_size = basesize) print(season_bray_knn_permu) #location based gt1 = graph_perm_test(ps.expo, "geo2", distance = "jaccard", type = "mst", nperm = nperm) print(gt1$pval) location_jac_mst = plot_test_network(gt1) + theme(legend.text=element_text(size=basesize)) print(location_jac_mst) location_jac_mst_permu = plot_permutations(gt1)+ ggtitle(paste0("p value = ", signif( gt1$pval, digits = 4)))+ theme_pubr(base_size = basesize) print(location_jac_mst_permu) gt2 = graph_perm_test(ps.expo, "geo2", distance = "bray", type = "mst" , nperm = nperm) print(gt2$pval) location_bray_mst = plot_test_network(gt2) + theme(legend.text=element_text(size=basesize)) print(location_bray_mst) location_bray_mst_permu = plot_permutations(gt2)+ ggtitle(paste0("p value = ", signif( gt2$pval, digits = 4)))+ theme_pubr(base_size = basesize) print(location_bray_mst_permu) gt3 = graph_perm_test(ps.expo, "geo2", distance = "bray", type = "knn", nperm = nperm) print(gt3$pval) location_bray_knn = plot_test_network(gt3) + theme(legend.text=element_text(size=basesize)) print(location_bray_knn) location_bray_knn_permu = plot_permutations(gt3) + ggtitle(paste0("p value = ", signif( gt3$pval, digits = 4))) + theme_pubr(base_size = basesize) print(location_bray_knn_permu) #location based geo3 inland and coastal gt1 = graph_perm_test(ps.expo, "geo3", distance = "bray", type = "mst", nperm = nperm) print(gt1$pval) coastal_bray_mst = plot_test_network(gt1) + theme(legend.text=element_text(size=basesize)) print(coastal_bray_mst) coastal_bray_mst_permu = plot_permutations(gt1) + ggtitle(paste0("p value = ", signif( gt1$pval, digits = 4)))+ theme_pubr(base_size = basesize) print(coastal_bray_mst_permu) gt2 = graph_perm_test(ps.expo, "geo3", distance = "bray", type = "knn", nperm = nperm, knn = 1) print(gt2$pval) coastal_bray_knn = plot_test_network(gt2) + theme(legend.text=element_text(size=basesize)) print(coastal_bray_knn) coastal_bray_knn_permu = plot_permutations(gt2) + ggtitle(paste0("p value = ", signif( gt2$pval, digits = 4)))+ theme_pubr(base_size = basesize) print(coastal_bray_knn_permu) #locaiton based country gt1 = graph_perm_test(ps.expo, "country", distance = "bray", type = "mst", nperm = nperm) print(gt1$pval) country_bray_mst = plot_test_network(gt1) + theme(legend.text=element_text(size=basesize)) print(country_bray_mst) country_bray_mst_permu = plot_permutations(gt1) + ggtitle(paste0("p value = ", signif( gt1$pval, digits = 4)))+ theme(legend.text=element_text(size=basesize)) print(country_bray_mst_permu) gt2 = graph_perm_test(ps.expo, "country", distance = "bray", type = "knn", nperm = nperm, knn = 1) print(gt2$pval) country_bray_knn = plot_test_network(gt2) + theme(legend.text=element_text(size=basesize)) print(country_bray_knn) country_bray_knn_permu = plot_permutations(gt2) + ggtitle(paste0("p value = ", signif( gt2$pval, digits = 4))) + theme(legend.text=element_text(size=basesize)) print(country_bray_knn_permu) #batch gt1 = graph_perm_test(ps.expo, "batch", distance = "bray", type = "mst", nperm = nperm) print(gt1$pval) batch_bray_mst = plot_test_network(gt1) + theme(legend.text=element_text(size=basesize)) print(batch_bray_mst) batch_bray_mst_permu = plot_permutations(gt1) + ggtitle(paste0("p value = ", signif( gt1$pval, digits = 4)))+ theme_pubr(base_size = basesize) print(batch_bray_mst_permu) gt2 = graph_perm_test(ps.expo, "batch", distance = "bray", type = "knn", nperm = nperm, knn = 1) print(gt2$pval) batch_bray_knn = plot_test_network(gt2) + theme(legend.text=element_text(size=basesize)) print(batch_bray_knn) batch_bray_knn_permu = plot_permutations(gt2) + ggtitle(paste0("p value = ", signif( gt2$pval, digits = 4)))+ theme_pubr(base_size = basesize) print(batch_bray_knn_permu) return(list = list(owner_ja_mst, owner_ja_mst_permu, owner_bray_mst, owner_bray_mst_permu, owner_bray_knn, owner_bray_knn_permu, season_jac_mst, season_jac_mst_permu, season_bray_mst, season_bray_mst_permu, season_bray_knn, season_bray_knn_permu, location_jac_mst, location_jac_mst_permu, location_bray_mst, location_bray_mst_permu, location_bray_knn, location_bray_knn_permu, coastal_bray_mst, coastal_bray_mst_permu, coastal_bray_knn, coastal_bray_knn_permu, country_bray_mst, country_bray_mst_permu, country_bray_knn, country_bray_knn_permu, batch_bray_mst, batch_bray_mst_permu, batch_bray_knn, batch_bray_knn_permu)) } #all samples graph.output2 = graph_comp(scores_single_complete, nperm = 9999) #using only P1 graph.output3 = graph_comp(scores_single_complete[scores_single_complete$aownership %in% c("P1"),], nperm=9999) #using only P1 P2 and P3 now profile = scores_single_complete[scores_single_complete$aownership %in% c("P1", "P2", "P3"),] graph.output = graph_comp(profile, nperm = 9999) graph.output.four = graph_comp(fourpeople.select, nperm = 9999) #using only P1 at westcoast profile.p1.westcoast = subset(scores_single_complete, aownership == "P1" & geo2 == "Westcoast") graph.output.p1.westcoast = graph_comp(profile.p1.westcoast, nperm = 9999) #using only P1 at campus profile.p1.campus = subset(scores_single_complete, aownership == "P1" & location %in% c("Campus-home", "Campus", "Hiking_weekend","Hiking _weekend","Mike_office-background", "Mike_home-background", "Mike_background" )) graph.output.p1.campus = graph_comp(profile.p1.campus, nperm = 9999) ``` #data submission form ```{r data submission dataframe} # mandatory attributes sample_name, bioproject_accession, organism, altitude, collection_date, env_biome (air filter), env_feature (air filter), # env_material (air), geo_loc_name, lat_lon #For DNA samples, including all controls formalname.df = data.frame(formalname, oldname, Filter_No=filter_list) formalname.df.single = formalname.df[!grepl("_2|_3", formalname.df$formalname),] formalname.df.single = formalname.df.single[!formalname.df.single$formalname=="",] environ.data = join(environ, formalname.df.single, by = "Filter_No") submission.data = environ.data[,c("formalname", "oldname", "elevation.m.", "date.end", "city", "state", "country", "longitude", "latitude")] submission.data$bioproject_accession = rep("PRJNA421162", 312) submission.data$organism = c(rep("multiple", 283), rep("NA", 29)) submission.data$env_biome = c(rep("air filter", 283), rep("NA", 29)) submission.data$env_feature = c(rep("air filter", 283), rep("NA", 29)) submission.data$env_material = c(rep("air", 283), rep("NA", 29)) #for location submission.data$geo_loc_name = paste0(submission.data$country,"_", submission.data$state, "_", submission.data$city) #for lat_lon submission.data$lat_lon = paste0(submission.data$latitude, " ", submission.data$longitude) colnames(submission.data) # write_csv(submission.data[, c(1, 10, 11, 3, 4, 12, 13, 14, 15, 16, 2)], "~/Documents/Expo_paper/data_submission/DNAsamples.csv") write_csv(submission.data[, c(1, 10, 11, 3, 4, 12, 13, 14, 15, 16, 2)], "~/Documents/Expo_paper/data_submission/RNAsamples.csv") ``` #Spatial analysis basic ```{r spatial analysis} library(ape) tr100 = seq(1:100) tr100.do = dist(tr100) tr100.doI = 1/tr100.do tr100.d = dist(tr100) thresh = 1 #truncation to threshold 1 tr100.d[tr100.d > thresh] = 4 * thresh #PCoA on truncated matrix tr100.PCoA = cmdscale(tr100.d, eig=TRUE, k = length(tr100)-1) nb.ev = length(which(tr100.PCoA$eig > 0.0000001)) tr100.PCNM = tr100.PCoA$points[, 1:nb.ev] # tr100.PCoA$points, each column is a PCoA component; each row is a sample #plot some PCNM variable smodeeling positive spatial correlation par(mfrow = c(4,2)) somePCNM = c(1:50) for (i in somePCNM){ plot(tr100.PCNM[,i], main = paste0(i, " PCNM")) } #testing Moran's I somePCNM = 1:67 observed = c() expected = c() pvalue = c() for (i in 1:length(somePCNM)){ plot(tr100.PCNM[, somePCNM[i]], type="l", ylab = c("PCNM", somePCNM[i])) print(paste0("factor ", i)) I = Moran.I(tr100.PCNM[, somePCNM[i]], as.matrix(tr100.doI)) observed = c(observed, I$observed) expected = c(expected, I$expected) pvalue = c(pvalue, I$p.value) } plot(observed) plot(expected) plot(p.adjust(pvalue, method = "fdr")) plot(p.adjust(pvalue, method = "fdr")-0.05) #based the weight matrix I used, there are only 34 PCNM vairables with a Moran's I larger than E(I) sum(observed - expected > 0 & p.adjust(pvalue, method = "fdr")<0.05) # only 27 of them have an adjusted p value less than 0.05. So these are the selected PCNM variables. They model positive spatial correlation # two dimensional sampling: equispaced grid, this is typical for actual distances on map # The truncation distance is et to 1. could also be chosen to be the diagonal distance within a small square of 4 points which is sqrt(2) xygrid2 = expand.grid(1:20, 1:20) #creates a data matrix with 400 points to describe a "plane", all equally spaced to each other xygrid2.d = dist(xygrid2) thresh = 1 #trucation to threshold xygrid2.d[xygrid2.d > thresh] = 4 * thresh #PCoA on truncated matrix xygrid2.pcoa = cmdscale(xygrid2.d, eig=TRUE, k=nrow(xygrid2)-1) #k is the number of componet retained default to 2 #count how many PCNM variables are positive nb.ev2 = sum(xygrid2.pcoa$eig > 0.0000001) #the number of PCNM variables that are bigger than 0 xygrid2.PCNM = xygrid2.pcoa$points[, 1:nb.ev2]#matrix of PCNM variables (principal component equivalent), columns are PCNM variables, rows are samples (Sites) #calculating Moran.I for each PCNM varaibles against distance weight matrix (using inverse of distance matrix). There are other ways of getting distance weight #testing Moran's I somePCNM = 1:nb.ev2 observed = c() expected = c() pvalue = c() for (i in 1:length(somePCNM)){ s.value(xygrid2, xygrid2.PCNM[, somePCNM[i]], method = "greylevel", csize=0.35, sub=somePCNM[i], csub=2) print(paste0("factor ", i)) I = Moran.I(xygrid2.PCNM[, somePCNM[i]], as.matrix(1/xygrid2.d)) observed = c(observed, I$observed) expected = c(expected, I$expected) pvalue = c(pvalue, I$p.value) } sum(observed - expected > 0 & p.adjust(pvalue, method = "fdr")<0.05) #Moran's I larger than E(I) and also adjusted.p <0.05, 171 PCNM variables #using vegan's function and more sophisticated approach to find the distance threhsold xygrid2.pcnm.vegan = pcnm(dist(xygrid2)) #$weights is the distance weights matrix $threshold is the distance threshold # $values are the eigen values $vectors are the PCNM variables. #see above for manual caclutating of PCNM variables. xygrid2.pcnm.pos = xygrid2.pcnm.vegan$vectors somePCNM = 1:ncol(xygrid2.pcnm.pos) observed = c() expected = c() pvalue = c() for (i in 1:length(somePCNM)){ s.value(xygrid2, xygrid2.pcnm.pos[, somePCNM[i]], method = "greylevel", csize=0.35, sub=somePCNM[i], csub=2) print(paste0("factor ", i)) I = Moran.I(xygrid2.pcnm.pos[, somePCNM[i]], as.matrix(1/xygrid2.d)) observed = c(observed, I$observed) expected = c(expected, I$expected) pvalue = c(pvalue, I$p.value) } sum(observed - expected > 0 & p.adjust(pvalue, method = "fdr")<0.05) #Moran's I larger than E(I) and also adjusted.p <0.05, #171 positively correlated PCNM variables, and significant #how to use moran.I function ozone <- read.table("https://stats.idre.ucla.edu/stat/r/faq/ozone.csv", sep=",", header=T) head(ozone, n=10) ozone.dists <- as.matrix(dist(cbind(ozone$Lon, ozone$Lat))) ozone.dists.inv <- 1/ozone.dists diag(ozone.dists.inv) <- 0 ozone.dists.inv[1:5, 1:5] tempI = Moran.I(ozone$Av8top, ozone.dists.inv) ``` ##Spatial analysis tutorial PCNM approach ```{r spatial tutorial} library(adespatial) data(mite) data(mite.env) data(mite.pcnm) data(mite.xy) mite.h = decostand(mite, "hellinger") mite.xy.c = scale(mite.xy, center=TRUE, scale=FALSE) mite.h.det = resid(lm(as.matrix(mite.h) ~ ., data=mite.xy)) #mite.h.D1 = dist(mite.h.det) mite.h.D1 = vegdist(mite.h.det,method = "euclidean") mite.correlog = mantel.correlog(mite.h.D1, XY = mite.xy, nperm=99) summary(mite.correlog) plot(mite.correlog) #number of classes mite.correlog$n.class #break points, for distance classes mite.correlog$break.pts #number of tested needed for fdr adjustements mite.correlog$n.tests mite.pcnm = vegan::pcnm(dist(mite.xy)) mite.pcnm.pos = mite.pcnm$vectors #vectors are automatically with plus eigenvalues #get the pcnm variables with significant positive spatial correlation somePCNM = 1:ncol(mite.pcnm.pos) observed = c() expected = c() pvalue = c() for (i in 1:length(somePCNM)){ #s.value(dist(mite.xy), mite.pcnm.pos[, somePCNM[i]], method = "greylevel", csize=0.35, sub=somePCNM[i], csub=2) print(paste0("factor ", i)) I = Moran.I(mite.pcnm.pos[, somePCNM[i]], as.matrix(1/dist(mite.xy))) observed = c(observed, I$observed) expected = c(expected, I$expected) pvalue = c(pvalue, I$p.value) } sum(observed - expected > 0) sum(observed - expected > 0 & p.adjust(pvalue, method = "fdr")<0.05) mite.pcnm.pos.select = mite.pcnm.pos[, ((observed - expected) > 0 & p.adjust(pvalue, method = "fdr")< 0.05)] #variation partitioning using XY cooordinates, PCNM variables, and environmental variables #1 forward selection of coordinates mite.XY.rda = rda(mite.h, mite.xy) anova.cca(mite.XY.rda) #model is significant, so forward selection is carried out mite.XY.R2a = RsquareAdj(mite.XY.rda)$adj.r.squared #any new model can't be better than full model mite.XY.fwd = forward.sel(mite.h, as.matrix(mite.xy), adjR2thresh = mite.XY.R2a) #there are only two variables x,y, and the function selects y! XY.sign = sort(mite.XY.fwd$order) #there is only one vairable here XY.red = mite.xy[, c(XY.sign)] #retained XY data #2 forward selection of environmental variables substrate = model.matrix(~mite.env[,3])[,-1] #substrate recoding shrubs = model.matrix(~mite.env[,4])[,-1] #shrubs recoding topc = model.matrix(~mite.env[,5])[,-1] #topc mite.env2 = cbind.data.frame(mite.env[, 1:2], substrate, shrubs, topc) #forward selection of the enviromental variables mite.env.rda = rda(mite.h, mite.env2) #using the full model, to get the maximum Rsqare mite.env.R2a = RsquareAdj(mite.env.rda)$adj.r.squared #then get the adjusted Rsquare mite.env.fwd = forward.sel(mite.h, mite.env2, adjR2thresh = mite.env.R2a, nperm = 9999) #forward selection on enviromental parameters, using adjusted Rsquare as the max rsqare possible (can't be better than original model) env.sign = sort(mite.env.fwd$order) env.red = mite.env2[, c(env.sign)] #retained env data colnames(env.red) #retained variable after forward seleciton #3 pncm variables mite.undet.pcnm.rda = rda(mite.h, mite.pcnm.pos.select) anova.cca(mite.undet.pcnm.rda) mite.undet.pcnm.R2a = RsquareAdj(mite.undet.pcnm.rda)$adj.r.squared mite.undet.pcnm.fwd = forward.sel(mite.h, mite.pcnm.pos.select, adjR2thresh = mite.undet.pcnm.R2a) nrow(mite.undet.pcnm.fwd) #number of selected pcnm variables pcnm.sign = sort(mite.undet.pcnm.fwd$order) pcnm.red = mite.pcnm.pos[, c(pcnm.sign)] #retained pcnm data #visualizing retained pcnm variables to see which are broad, which are fine for (i in 1:ncol(pcnm.red)){ s.value(mite.xy, pcnm.red[,i], sub=pcnm.sign[i], csub=2) } #4 assigning pcnm scale based on visualization pcnm.broad = pcnm.red[, 1:5] pcnm.fine = pcnm.red[, 6:10] #5 mite-enviroment - XY coordinates - PCNM vairation partitioning mite.varpart = varpart(mite.h, env.red, XY.red, pcnm.broad, pcnm.fine) # four is the maximum group par(mfrow=c(1,2)) showvarparts(4) plot(mite.varpart, digits=2) #tests of the unique fractions [a], [b], [c], [d] #********************** #the cbind() function is to create the data.frame for conditioned variables (partialled out) #fraction [a] pure environmental anova.cca(rda(mite.h, env.red, cbind(XY.red, pcnm.broad, pcnm.fine))) #fraction [b] pure tred anova.cca(rda(mite.h, XY.red, cbind(env.red, pcnm.broad, pcnm.fine))) #fraction [c] pure broad scale spatial anova.cca(rda(mite.h, pcnm.broad, cbind(env.red, XY.red, pcnm.fine))) #fraction [d] pure fine scale spatial anova.cca(rda(mite.h, pcnm.fine, cbind(env.red, XY.red, pcnm.broad))) #although tutorial mentions only a and c are significant, my results show they are all significant, the difference may be how I selected the positived correlated and significant pcnm variables (didn't use PCNM package) #whether or not each group is significant can also be seen investigating the varpart result directly. (mite.varpart) ``` ##Spatial analysis MEM ```{r spatial analysis MEM} library(spacemakeR) library(spdep) library(adespatial) library(ape) data(mite) data(mite.env) data(mite.pcnm) data(mite.xy) mite.del = tri2nb(mite.xy) #select the best model based on AICc value mite.del.res = test.W(mite.h.det, mite.del) # AICc NbVar #1 -94.18844 7 #summary of the best model summary(mite.del.res$best) R2.del = mite.del.res$best$R2[which.min(mite.del.res$best$AICc)] #best model is with the lowest AICc #adjusted R^2 for the best model (n=70, and m =7) RsquareAdj(R2.del, 70, 7) #0.1989829 #second class of models #weighting function f2 = function(D, dmax, y)(1 - (D/dmax)^y) #more complicated distance function max.d = max(unlist(nbdists(mite.del, as.matrix(mite.xy)))) #power is set from 2 to 10 mite.del.f2 = test.W(mite.h.det, mite.del, f=f2, y=2:10, dmax=max.d, xy=as.matrix(mite.xy)) #AICc for the null model: -87.47112034786 #Best spatial model: # y dmax AICc NbVar #1 2 3 -95.44519 6 #unadjusted R^2 for the best model in the second class R2.delW = mite.del.f2$best$R2[which.min(mite.del.f2$best$AICc)] RsquareAdj(R2.delW, 70, 6) #0.1969499 #Third class of models. connectivity matrix based on a distance (raidus around points) mite.vario = variogmultiv(mite.h.det, mite.xy, nclass=20) plot(mite.vario$d, mite.vario$var, ty='b', pch=20, xlab="Distance", ylab="C(distance)") #construction of 10 neighborhood matrices, using first peak of mite.vario plot thresh10 = seq(give.thresh(dist(mite.xy)), 4, le = 10) #give.thresh(dist(mite.xy)) gives the minimum threshold distance needed for all sites to be connected, anything aboves this ensures it happens. le=10 means seq() outputs 10 values from the range #create 10 model matrices list10nb = lapply(thresh10, dnearneigh, x=as.matrix(mite.xy), d1=0) #display an excerpt of the first neighbourhood matrix print(listw2mat(nb2listw(list10nb[[1]]))[1:10, 1:10], digits =1) #testing on all 10 model matrices mite.thresh.res = lapply(list10nb, test.W, Y=mite.h.det) mite.thresh.minAIC = sapply(mite.thresh.res, function(x){ min(x$best$AICc, na.rm=TRUE)}) min(mite.thresh.minAIC) #lowest AICc thresh10[which.min(mite.thresh.minAIC)] #new distance threshold to use #model selected is with 5 MEM variables, with a AICc of -100.6 #Fourth class, same as third clas, but change distance function to f2 (see above for f2 definition), this produced the best model mite.thresh.f2 = lapply(list10nb, function(x) {test.W(x, Y=mite.h.det, f=f2, y=2:10, dmax=max(unlist(nbdists(x, as.matrix(mite.xy)))), xy=as.matrix(mite.xy))}) mite.f2.minAIC = sapply(mite.thresh.f2, function(x) min(x$best$AICc, na.rm=TRUE)) min(mite.f2.minAIC) # best model nb.bestmod = which.min(mite.f2.minAIC) #6 MEM variables dmax.best = mite.thresh.f2[nb.bestmod][[1]]$all[1,2] #2.668333 is the distance threshold chosen #extract the best model mite.MEM.champ = unlist(mite.thresh.f2[which.min(mite.f2.minAIC)], recursive = FALSE) summary(mite.MEM.champ) mite.MEM.champ$best$values #eigenvalues mite.MEM.champ$best$ord #order of MEM variables by added R2 MEMid = mite.MEM.champ$best$ord[1:which.min(mite.MEM.champ$best$AICc)] sort(MEMid) MEM.all = mite.MEM.champ$best$vectors MEM.select = mite.MEM.champ$best$vectors[, sort(c(MEMid))] #unadjusted R2 of the best model R2.MEMbest = mite.MEM.champ$best$R2[which.min(mite.MEM.champ$best$AICc)] #adkisted R2 pf best ,pde; RsquareAdj(R2.MEMbest, nrow(mite.h.det), length(MEMid)) #0.2906696, much higher than before #Maps of the 7 significant MEM variables for(i in 1:ncol(MEM.select)){ s.value(mite.xy, MEM.select[,i], sub=sort(MEMid)[i], csub=2)} #RDA of the mite data constratined by the 7 MEM vraibles mite.MEM.rda = rda(mite.h.det~., as.data.frame(MEM.select)) mite.MEM.R2a = RsquareAdj(mite.MEM.rda)$adj.r.squared anova.cca(mite.MEM.rda) axes.MEM.test = anova.cca(mite.MEM.rda, by="axis") #axis is MEM variable here #number of significant axes nb.ax = length(which(maxes.MEM.test[,5] <= 0.05)) #plot maps of the two significant canonical axes mite.MEM.axes = scores(mite.MEM.rda, choices = c(1,2), display="lc", scaling=1) par(mfrow=c(1,2)) s.value(mite.xy, mite.MEM.axes[,1]) s.value(mite.xy, mite.MEM.axes[,2]) ``` #multicomparison using KW or Friedman tests and the following pair-wise ad-hoc tests ```{r multicomparison} library(PMCMRplus) data(InsectSprays) kruskal.test(count ~ spray, data=InsectSprays) #posthoc.kruskal.nemenyi.test(count ~ spray, dist="Tukey", data = InsectSprays, p.adjust.method = "fdr") posthoc.kruskal.dunn.test(count ~ spray, dist="Tukey", data = InsectSprays, p.adjust.method = "fdr") #this one is standard adhoc after KW test posthoc.kruskal.conover.test(count ~ spray, dist="Tukey", data = InsectSprays, p.adjust.method = "fdr") #this one is the high power method adhoc test after KW test ``` # MISC stuff ```{r misc} temp = read.table("~/Documents/Bioinfo/places/median_of_all_Mike_data_sickdays_weather_events.txt", stringsAsFactors = FALSE) write_csv(temp, "~/Documents/Bioinfo/places/Mikemetadata.csv") #color checking functions #default colors, for barplots colors k = ggplot(data.frame(group=c(1:6), value=c(1:6)), aes(x=group, y=value, color=as.factor(group))) + geom_point(size=3) k ggplot_build(k)$data ggplot_build(k)$data[[1]]$colour #npg colors k = ggplot(data.frame(group=c(1:10), value=c(1:10)), aes(x=group, y=value, color=as.factor(group))) + geom_point(size=3) + scale_color_npg() k ggplot_build(k)$data ggplot_build(k)$data[[1]]$colour #nejm colors k = ggplot(data.frame(group=1:8, value=1:8), aes(x=group, y=value, color=as.factor(group))) + geom_point(size=3) + scale_color_nejm() print(k) ggplot_build(k)$data ggplot_build(k)$data[[1]]$colour #lancet colors k = ggplot(data.frame(group=1:8, value=1:8), aes(x=group, y=value, color=as.factor(group))) + geom_point(size=3) + scale_color_lancet() print(k) ggplot_build(k)$data ggplot_build(k)$data[[1]]$colour master_master.numeric = Filter(is.numeric, master_master) master_master.numeric = as.matrix(master_master.numeric) #total correlation res2 = rcorr(master_master.numeric) res2$r[is.na(res2$r)] = 0 #this get rids of the no correlation problems !!! corrplot::corrplot(res2$r, order = "hclust", p.mat = res2$P, sig.level = 0.05, insig = "blank", tl.cex=0.6, type="lower") corrplot::corrplot(res2$r, p.mat = res2$P, sig.level = 0.05, insig = "blank", tl.cex=0.6, type="lower") #mean -variance relationship for all taxons mv.df = data.frame(mean=apply(master.asinh.cpm.single, 1, mean), variance=apply(master.asinh.cpm.single, 1, var)) ggscatter(mv.df, x = "mean", y = "variance", color = "black", shape = 21, size = 4, fill="light grey", # Points color, shape and size add = "reg.line", # Add regressin line add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line conf.int = TRUE, # Add confidence interval cor.coef = TRUE, # Add correlation coefficient title = paste0(mole.type, "-", taxonlevel, " mean-variance relationship") ) output.df = data.frame(ownership = environ.no3$ownership, date.start = environ.no3$date.start, date.end = environ.no3$date.end, location = environ.no3$location) write.csv(output.df, "~/Documents/Bioinfo/p3.csv", quote = FALSE, row.names =FALSE) co.DNA.species = read.csv("~/Documents/Bioinfo/localanalysis/DNA_mega.SPECIES.csv", header = TRUE, stringsAsFactors = FALSE) #assuming DNA species data is loaded, this checks how many DNA species are also identified in co-assemble approach. 100% !! ALL OF THEM CAN BE FOUND sum(row.names(master.asinh.cpm) %in% co.DNA.species$taxon) #find more than distinct 20 colors n <- 25 qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',] col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))) pie(rep(1,n), col=sample(col_vector, n)) #find reads number for DNA and RNA dnareads = read.table("~/Documents/Expo_paper/DNAreads.txt", header=FALSE) rnareads = read.table("~/Documents/Expo_paper/RNAreads.txt", header=FALSE) median(dnareads$V1[dnareads$V1 >= 20000000])#some testing samples included median(rnareads$V1[rnareads$V1 >= 20000000]) sum(as.numeric(dnareads[dnareads>=20000000,])) sum(as.numeric(rnareads[rnareads>=20000000,])) sum(as.numeric(dnareads[dnareads>=20000000,])) * 150 sum(as.numeric(rnareads[rnareads>=20000000,])) * 150 reads.df = data.frame(count = as.numeric(c(dnareads$V1[dnareads$V1>=20000000], rnareads$V1[rnareads$V1>=20000000])), type = c(rep("DNA", length(dnareads$V1[dnareads$V1>=20000000])), rep("RNA", length(rnareads$V1[rnareads$V1>=20000000])))) ggdensity(reads.df, x = "count", add = "median", #add.params=list(color="red"), rug = TRUE, color = "type", fill = "type", palette = c("#00AFBB", "#E7B800", "#3C5488FF"), ) + labs(x="Reads counts per sample") + theme_pubr(base_size = 16) #RNA amplification result RNAamp = read.csv("~/Documents/Expo_paper/RNAamp_result.csv", header = TRUE, stringsAsFactors = FALSE) colnames(RNAamp) = c("Filter", "Concentration", "Type") RNAamp$Type = factor(RNAamp$Type, levels = c("Control", "Sample")) plott = ggplot(RNAamp, aes(x=Type, y=log(Concentration), fill = Type, group = Type)) + geom_boxplot() + geom_beeswarm(color = "black", size = 0.5, alpha = 0.7) + theme_pubr(base_size = 14) + stat_compare_means() environ$batchinfo = paste0("B",environ$batch) for (item in unique(environ$batchinfo)){ print(item) temp.mean = environ$cDNA_conc[environ$batchinfo == item & environ$Filter_No != item] print(mean(temp.mean, na.rm=TRUE)) temp.blank = environ$cDNA_conc[environ$Filter_No == item] print(temp.blank) print(mean(temp.mean, na.rm=TRUE)/temp.blank) } #quickling getting specific samples which(scores_single_complete$aownership == "P2") #and how many length(which(scores_single_complete$aownership == "P2")) ```