library(dplyr)
library(phyloseq)
# devtools::install_github("jsilve24/stray")
library(stray)
# devtools::install_github("jsilve24/driver")
library(driver)
source("taxaLevelCollapser_v5.R")


# ---- Raw inputs -------------------------------------------------------------
# In all three files the first column supplies the row names and surrounding
# white space is stripped from the fields.

# OTU count table (semicolon-separated).
otu <- read.csv(
  "pdfu_16s_OTU_counts_METABOLOMICS.2.csv",
  header = TRUE, sep = ";", row.names = 1, strip.white = TRUE
)

# Taxonomy table (comma-separated).
tax <- read.csv(
  "pdfu_16s_taxonomy.csv",
  header = TRUE, sep = ",", row.names = 1, strip.white = TRUE
)

# Sample metadata (comma-separated, "." as decimal mark).
sampledata <- read.csv(
  "Metabolomics_metadata_ready.4.csv",
  header = TRUE, sep = ",", dec = ".", row.names = 1, strip.white = TRUE
)


# ---- Assemble the phyloseq object --------------------------------------------
# Work on a copy of the metadata so the raw import stays untouched.
sampledata.2 <- sampledata

# The imported count table is transposed before use because the OTU table is
# declared below with taxa as rows.
otu.2 <- t(otu)
otu.matrix <- as.matrix(otu.2)
tax.matrix <- as.matrix(tax)

# Wrap each piece in its phyloseq component class.
otu.table <- otu_table(otu.matrix, taxa_are_rows = TRUE)
tax.table <- tax_table(tax.matrix)
sampledata.3 <- sample_data(sampledata.2)

# Quick look at the dimensions of the three components.
dim(otu.2)
dim(tax)
dim(sampledata.3)

# 124 samples total:
physeqfinal <- phyloseq(otu.table, tax.table, sampledata.3)
physeqfinal

# Inspect the metadata as stored in the phyloseq object and as imported.
sample_data(physeqfinal)
str(sample_data(physeqfinal))
str(sampledata.2)


### Get only the CONTROL samples (Group == "C").
# Group sizes are tabulated before and after the subset as a sanity check.
table(sample_data(physeqfinal)$Group)
physeqfinal <- subset_samples(physeqfinal, Group == "C")
table(sample_data(physeqfinal)$Group)
sample_data(physeqfinal)


######### METABOLITES:

# We are only using those metabolites that were differentially abundant between C and PD (i.e. "selected").
# We randomize the order of the metabolites (i.e. columns) to make sure that no biases are introduced by
# using the metabolites in the original order of the table (they have different mass spectrometry sources).
# We will save that object (to be on the safe side):

#set.seed(666)
#head(GC)
#GC.2 <- GC[ ,sample(ncol(GC))]
#head(GC.2)
#write.table(GC.2, file = "GC.rand.selected.tsv", col.names = NA)
# Then separate the object into PD or Controls, as desired. I already had those files prepared,
# so I don't do that here; I just load up a ready-made file. Files are available upon request, including RAW.


# Ready-made metabolite table for the control samples (white-space separated;
# the first column supplies the row names).
GC.2 <- read.csv(
  "GC.rand.selected.CONTROLS_ONLY.csv",
  header = TRUE, sep = "", dec = ".", row.names = 1, strip.white = TRUE
)


#### Log 10 Transform:
# Report whether the table contains zeros; the +1 offset keeps log10 finite
# for them.
any(GC.2 == 0)
GC.log <- log10(GC.2 + 1)
GC.log


#### Z-transform to put all combined MS datasets in the same scale:
# Each column is centred and scaled on its own.
GC.log.Z <- as.data.frame(apply(GC.log, MARGIN = 2, FUN = scale))
GC.log.Z
# Sanity check: column means and standard deviations after scaling.
colMeans(GC.log.Z)
sapply(GC.log.Z, sd)

# apply() discards the row names, so they are copied back from the input table.
rownames(GC.log.Z)
rownames(GC.log.Z) <- rownames(GC.2)

# Transpose so that the original columns become rows, and store as a matrix.
GC.3 <- t(GC.log.Z)
GC.4 <- as.matrix(GC.3)
rownames(GC.4)
GC.4


######### Back to TAXA.
# The script for collapsing taxonomic levels was written by Dr Velma Aho.
# After collapsing the taxonomic levels, amalgamate those TAXA
# that don’t pass filtering to a category called “other”
# to maintain the proper variance in the multinomial model.
# Choose the right option for the desired taxonomic level:

# Collapse counts to the chosen taxonomic level (helper sourced from
# taxaLevelCollapser_v5.R).
physeqfinal.2 <- collapseTaxLevel(physeqfinal, level = "Genus")
physeqfinal.2
physeqfinal
physeqfinal.3 <- physeqfinal.2

# A taxon is kept when its total count exceeds 4 AND it has a count above 2 in
# more than 3 samples.
# NOTE(review): rowSums() assumes the collapsed OTU table still has taxa as
# rows -- confirm against collapseTaxLevel().
keep_ix <- taxa_sums(physeqfinal.3) > 4
keep_ix <- keep_ix & (rowSums(otu_table(physeqfinal.3) > 2) > 3)

# Names of the taxa that fail the filter, captured BEFORE merging: after
# merge_taxa() the object has fewer taxa, so `keep_ix` no longer lines up with
# taxa_names(physeqfinal.3).
rare_taxa <- taxa_names(physeqfinal.3)[!keep_ix]

if (length(rare_taxa) > 0) {
  # merge_taxa() sums the listed taxa into one entry that keeps the name of
  # the first one (default archetype); that entry is renamed to "other".
  physeqfinal.3 <- merge_taxa(physeqfinal.3, rare_taxa)
  nms <- taxa_names(physeqfinal.3)
  nms[nms == rare_taxa[1]] <- "other"
  taxa_names(physeqfinal.3) <- nms
  rm(nms)
}
rm(rare_taxa)

# Compare the object before collapsing, after collapsing and after filtering.
physeqfinal
physeqfinal.2
physeqfinal.3


# Count data (taxa x samples) and transformed metabolite data
# (metabolites x samples) for the joint model.
Y <- otu_table(physeqfinal.3, taxa_are_rows = TRUE)

Z <- GC.4

# orthus() pairs column i of Y with column i of Z, so both must describe the
# same samples in the same order.
if (ncol(Y) != ncol(Z)) {
  stop("Y has ", ncol(Y), " samples but Z has ", ncol(Z), ".", call. = FALSE)
}
if (!identical(colnames(Y), colnames(Z))) {
  warning("Sample names of Y and Z differ or are ordered differently; ",
          "check that their columns are aligned.", call. = FALSE)
}

sample_dat <- as.data.frame(as(sample_data(physeqfinal.3), "matrix"))


# We use a model with intercept only:
X.check.2 <- model.matrix(~ 1, data = sample_dat)
X.check.2

# Covariates are passed as a Q x N matrix, hence the transpose. The design is
# built from `sample_dat` (the metadata of the samples actually in Y); the
# previous `sampledata.ref` is not defined anywhere in this script.
X <- t(model.matrix(~ 1, data = sample_dat))

# save dimensions for easy reference:
N <- ncol(Y)  # samples
D <- nrow(Y)  # taxa (including "other")
P <- nrow(Z)  # metabolites
Q <- nrow(X)  # covariates (intercept only)

dim(Y)
dim(Z)
dim(X)


# Priors (see our paper for an explanation of what we are doing here and why):

# Degrees of freedom: dimension of the joint covariance (D - 1 + P) plus 10.
upsilon <- (D - 1 + P) + 10

# Contrast matrix [I, -1] of size (D - 1) x D.
GG <- cbind(diag(D - 1), -1)

# Scale matrix: identity for the metabolite part, GG %*% I %*% t(GG) for the
# taxa part.
Xi <- diag(D - 1 + P)
Xi[seq_len(D - 1), seq_len(D - 1)] <- GG %*% diag(D) %*% t(GG)
# upsilon - D - P equals upsilon - (D - 1 + P) - 1.
# NOTE(review): presumably chosen so the inverse-Wishart prior mean equals the
# unscaled Xi -- confirm against the paper.
Xi <- (upsilon - D - P) * Xi

# Identity covariance over the Q covariates.
Gamma <- diag(Q)

# Zero prior mean for every regression coefficient ((D - 1 + P) x 1).
Theta <- matrix(0, nrow = D - 1 + P, ncol = 1)

dim(Gamma)
dim(Theta)


### Fitting the model:

# Fixed seed so the posterior draws are reproducible.
set.seed(666)


fit <- orthus(
  Y, Z, X,
  Theta = Theta,
  Gamma = Gamma,
  Xi = Xi,
  upsilon = upsilon,
  calcGradHess = FALSE,
  multDirichletBoot = 0.5,
  n_samples = 2000,
  verbose = TRUE
)


# Re-express the fitted model in CLR coordinates.
fit.2 <- to_clr(fit)
#print(fit.2)


# Now get those correlations from the covariances:

# Posterior covariance draws; the third array dimension indexes the draws.
xcor.2 <- fit.2$Sigma
xcor.2
is.array(xcor.2)
dim(xcor.2)


# Convert every draw from a covariance to a correlation matrix.
# seq_len() keeps the loop from running on indices 1 and 0 if there are no
# draws.
for (i in seq_len(dim(xcor.2)[3])) {
  xcor.2[, , i] <- cov2cor(xcor.2[, , i])
}


# Keep only the taxa-by-metabolite block (rows 1..D, columns D+1..D+P).
# drop = FALSE keeps the array three-dimensional even when there is a single
# taxon, metabolite or draw, which the later gather_array() call relies on.
xcor.3 <- xcor.2[1:D, (D + 1):(D + P), , drop = FALSE]
xcor.3
dim(xcor.3)


# Long table of posterior correlations (one row per taxon x metabolite x
# draw), summarised per taxon-metabolite pair and sorted by posterior mean.
# The amalgamated "other" category is excluded from the results.
xcor.summary.COR <- xcor.3 %>%
  gather_array(rho, taxa, metabolite, iter) %>%
  mutate(
    tm = paste0(taxa, "_", metabolite),
    taxa = rownames(Y)[taxa],
    metabolite = rownames(Z)[metabolite]
  ) %>%
  group_by(taxa, metabolite) %>%
  summarise_posterior(rho) %>%
  arrange(mean) %>%
  filter(taxa != "other")

# Keep pairs whose 2.5% and 97.5% posterior quantiles share the same sign and
# whose mean correlation is strong enough.
res.cor <- xcor.summary.COR %>%
  filter(
    sign(p2.5) == sign(p97.5),
    abs(mean) > 0.3 # or whatever correlation we want as a minimum
  )

res.cor

write.table(res.cor, file = "res.coR.selected.CONTROLS_ONLY.GENUS.tsv", col.names = NA)


# Record the software environment used for this run.
sessionInfo()

# R version 3.6.0 (2019-04-26)
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
# Running under: macOS  10.16
# 
# Matrix products: default
# LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
# 
# locale:
#   [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
# 
# attached base packages:
#   [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
#   [1] driver_0.1.1    stray_0.1.13    phyloseq_1.27.6 dplyr_0.8.4    
# 
# loaded via a namespace (and not attached):
#   [1] tidyselect_1.0.0    purrr_0.3.3         reshape2_1.4.3     
# [4] splines_3.6.0       lattice_0.20-38     rhdf5_2.27.19      
# [7] colorspace_1.4-1    vctrs_0.3.1         stats4_3.6.0       
# [10] mgcv_1.8-28         survival_2.44-1.1   rlang_0.4.10       
# [13] pillar_1.4.3        glue_1.3.1          BiocGenerics_0.29.2
# [16] foreach_1.4.4       lifecycle_0.2.0     plyr_1.8.5         
# [19] stringr_1.4.0       zlibbioc_1.29.0     Biostrings_2.51.5  
# [22] munsell_0.5.0       gtable_0.3.0        coda_0.19-3        
# [25] codetools_0.2-16    forcats_0.4.0       Biobase_2.43.1     
# [28] permute_0.9-5       IRanges_2.17.5      biomformat_1.11.1  
# [31] parallel_3.6.0      Rcpp_1.0.3          arrayhelpers_1.1-0 
# [34] scales_1.1.0        vegan_2.5-4         S4Vectors_0.21.24  
# [37] jsonlite_1.6        XVector_0.23.2      tidybayes_2.0.1    
# [40] svUnit_0.7-12       ggplot2_3.2.1       stringi_1.4.5      
# [43] grid_3.6.0          ade4_1.7-13         tools_3.6.0        
# [46] magrittr_1.5        lazyeval_0.2.2      tibble_3.0.1       
# [49] cluster_2.0.8       tidyr_1.0.2         crayon_1.3.4       
# [52] ape_5.3             pkgconfig_2.0.3     MASS_7.3-51.4      
# [55] ellipsis_0.3.0      Matrix_1.2-17       data.table_1.12.6  
# [58] assertthat_0.2.1    rstudioapi_0.10     iterators_1.0.10   
# [61] Rhdf5lib_1.5.4      R6_2.4.1            multtest_2.39.0    
# [64] igraph_1.2.4.1      nlme_3.1-139        compiler_3.6.0