# The following files need to be accessable in the same directory in order
# to allow correct execution of the script:
# 1) OTU_Table_wTax_DilutionExperiment.txt
# 2) Mapping_DilutionExperiment.txt
# 3) designConc_DilutionExperiment.csv
# 4) Total16S_DilutionExperiment.csv
# 5) OTU_Table_wTax_human.txt
# 6) Mapping_human.txt
#
## ----libraries,message=FALSE,warning=FALSE,results="hide"----------------
libs<-c("ggplot2",
        "grid",
        "gridExtra",
        "reshape",
        "scales",
        "hexbin",
        "data.table",
        "extrafont")
lapply(libs,require,character.only=TRUE)

## ----ImportOTUTable,tidy=F-----------------------------------------------
#Import text formatted OTU Table as derived by biom convert:
scml.OTUs_tax<-read.table("./OTU_Table_wTax_DilutionExperiment.txt",sep="\t",
                          stringsAsFactors=F,header=TRUE,row.names=1)
#Delete taxonomy column
scml.OTUs<-scml.OTUs_tax[,-37] 

## ----CreateTaxaMap-------------------------------------------------------
TaxaMap<-data.frame(OTU=rownames(scml.OTUs),taxonomy=scml.OTUs_tax$taxonomy)

## ----ImportMetaData------------------------------------------------------
#Import MetaData Mapping File
scml.meta<-read.table("./Mapping_DilutionExperiment.txt",sep="\t",header=TRUE)
#Bring OTU table in same order like meta data in respect to sample ID
ord<-match(scml.meta$SampleID,colnames(scml.OTUs))
scml.OTUs<-scml.OTUs[,ord]

## ----DesignConcentration-------------------------------------------------
designConc<-read.table("./designConc_DilutionExperiment.csv",sep="\t",header=TRUE,
                       row.names=1)
designConc.names<-colnames(designConc)
designConc<-designConc[,-c(which(colnames(designConc)=="Sample.87"),
                           which(colnames(designConc)=="Sample.88"))]

## ----Import16SConc-------------------------------------------------------
total16S<-read.table("./Total16S_DilutionExperiment.csv",sep="\t",stringsAsFactors=F,
                     header=TRUE)
ord2<-match(scml.meta$SampleID,total16S$SampleID)
total16S<-total16S[ord2,]

## ----ExcludeMIDs---------------------------------------------------------
midToExclude<-c("MID39","MID40")
midIDX<-which(colnames(scml.OTUs)%in%midToExclude)

#Exclude MIDs from OTU table and convert to matrix:
scml.OTUs.trim<-as.matrix(scml.OTUs[,-midIDX])

#Exclude MIDs from the mapping table
scml.meta.trim<-scml.meta[-midIDX,]

#Exclude MIDs from the total 16S table
midIDX<-which(total16S$SampleID%in%midToExclude)
total16S.trim<-total16S[-midIDX,]

## ----SpikePositions------------------------------------------------------
#Search unique ID for S. Ruber:
SalPos<-grep("AF323500XXXX",rownames(scml.OTUs.trim))
#Search unique ID for R. radiobacter:
RhizPos<-grep("AB247615XXXX",rownames(scml.OTUs.trim))
#Search unique ID for A. acidiphilus:
AliPos<-grep("AB076660XXXX",rownames(scml.OTUs.trim))

SpikeCols<-c("#e41a1c","#4daf4a","#377eb8")

## ----Pseudocount---------------------------------------------------------
scml.OTUs.trim<-scml.OTUs.trim+1

## ----RelativeAbundances--------------------------------------------------
#Divide each sample vector by its sum of read counts:
relAbundance<-sweep(scml.OTUs.trim,2,colSums(scml.OTUs.trim),'/') 

## ----SCML_Salini---------------------------------------------------------
##SCML (S. Ruber)
#Extract S. Ruber reads
Salini<-scml.OTUs.trim[SalPos,]

#Calculate size factor:
sizeFactor_Salini<-Salini/mean(Salini)

#Apply size factor to each sample
scml.counts<-scml.OTUs.trim
for(i in seq(ncol(scml.counts))){
  scml.counts[,i]<-scml.counts[,i]/sizeFactor_Salini[i]
}

## ----AdjustingAAandRR----------------------------------------------------
#read out only spike OTU reads:
spikeReads.Raw<-scml.OTUs.trim[c(AliPos,SalPos,RhizPos),] 
rownames(spikeReads.Raw)<-c("A. acidiphilus","S. ruber","R. radiobacter")
#Create copy of spikeReads.Raw for inserting adjusted spike reads:
spikeReads.Adjusted<-spikeReads.Raw
#Calculate size factors depending on the ratio towards 
#S. ruber (on row 2 of designConc):
sfA<-designConc[1,]/designConc[2,]
sfR<-designConc[3,]/designConc[2,]
#For each spike-in apply the corresponding size factor
spikeReads.Adjusted[1,]<-as.numeric(spikeReads.Adjusted[1,])/as.numeric(sfA)
spikeReads.Adjusted[3,]<-as.numeric(spikeReads.Adjusted[3,])/as.numeric(sfR)

## ----SCMLCombined--------------------------------------------------------
#Sum the raw reads counts of S. ruber and the adjusted read counts of 
#A. acidiphilus and R. radiobacter for each sample:
summedSpikes<-colSums(spikeReads.Adjusted)

#Calculate size factor:
sizeFactor_SCML.combined<-summedSpikes/mean(summedSpikes)

#Apply size factor to each sample:
scml.counts.combined<-scml.OTUs.trim
for(i in seq(ncol(scml.counts.combined))){
  scml.counts.combined[,i]<-scml.counts.combined[,i]/sizeFactor_SCML.combined[i]
}

## ----qRT-PCR_normalization-----------------------------------------------
#Extract SYBR Green pGEM total 16S measurement from the total16S object:
total16S_SYBR<-t(total16S.trim[,1])
#Calculate size factor:
sizeFactor_qRT.PCR<-total16S_SYBR/mean(total16S_SYBR)

#Apply size factor to each sample:
qRTpcr.counts<-scml.OTUs.trim
for(i in seq(ncol(qRTpcr.counts))){
  qRTpcr.counts[,i]<-qRTpcr.counts[,i]*sizeFactor_qRT.PCR[i]
}

## ----ConstructLongTableForSpikeReads-------------------------------------
#Extract scml read counts for the spike bacteria:
spikeReads.SCML<-scml.counts[c(AliPos,SalPos,RhizPos),]
rownames(spikeReads.SCML)<-c("A. acidiphilus","S. ruber","R. radiobacter")
#Melt design concentration to later on add to the long tables:
mConc<-melt(as.matrix(designConc))
#Melt each read count table to long format and add a status and 
#design concentration column, describing the origin of the reads:
mSpikes.raw<-cbind(melt(as.matrix(spikeReads.Raw)),
                   Status="raw",
                   DesignConc=mConc$value)
mSpikes.adjusted<-cbind(melt(as.matrix(spikeReads.Adjusted)),
                        Status="adjusted",
                        DesignConc=mConc$value)
mSpikes.scml<-cbind(melt(as.matrix(spikeReads.SCML)),
                    Status="normalized",
                    DesignConc=mConc$value)
#Unify column names for merging:
colnames(mSpikes.raw)<-
  colnames(mSpikes.adjusted)<-
  colnames(mSpikes.scml)<-
  c("SpikeOTU","SampleID","Reads","Status","DesignConc")
#Combine all three long tables:
mSpikes<-rbind(mSpikes.raw,mSpikes.adjusted,mSpikes.scml)
#Add information on the dilution of each sample:
x<-match(mSpikes$SampleID,scml.meta.trim$SampleID)
#Round Dilution to 2 decimals
mSpikes$Dilution<-round(scml.meta.trim$Dilution,2)[x]
#Reverse the levels to create a ordering for the display of the dilution:
mSpikes$Dilution<-factor(mSpikes$Dilution,levels=rev(levels(factor(mSpikes$Dilution))))

## ----Figure1a_Part1------------------------------------------------------
#Scientific format function for x-axis:
scientific_10 <- function(x) {
    y<-2^x
    parse(text=gsub("e", " %*% 10^", scientific_format()(y)))
}
#Subset for raw reads:
rawData<-droplevels(subset(mSpikes,Status=="raw"))
#Subset for A. acidiphilus:
acidiSpike<-droplevels(subset(rawData,SpikeOTU=="A. acidiphilus"))


#Plot Figure 1a part 1:
Fig1a_p1<-ggplot(acidiSpike,aes(x=log2(DesignConc),y=log2(Reads),
                             colour=Dilution))+
      geom_line()+geom_point()+facet_wrap(~SpikeOTU)+
      theme_bw()+theme(text=element_text(size=13),
                  panel.grid.minor = element_blank(),
                  legend.position = "bottom", 
                  legend.box = "horizontal",
                  axis.text.x = element_text(angle = 90,
                                             hjust = 1,
                                             vjust=0.5),
                  plot.title = element_text(hjust = 0,size=18),
                  axis.title=element_text(size=14))+
      xlab(expression(paste("spiked-in 16S copies by design",sep="")))+
      ylab(expression(paste(log[2]," read counts",sep="")))+
      scale_x_continuous(breaks = pretty_breaks(n=5),labels = scientific_10)+
      guides(colour=guide_legend(title="stool dilution"))

plot(Fig1a_p1+ggtitle("a"))

## ----Figure1a_Part2------------------------------------------------------
#Subset for R. radiobacter:
radioSpike<-droplevels(subset(rawData,SpikeOTU=="R. radiobacter"))
#Plot Figure 1a part 2:
Fig1a_p2<-ggplot(radioSpike,aes(x=log2(DesignConc),y=log2(Reads),
                             colour=Dilution))+
      geom_line()+geom_point()+facet_wrap(~SpikeOTU)+
      theme_bw()+theme(text=element_text(size=13),
                  panel.grid.minor = element_blank(),
                  legend.position = "bottom",
                  legend.box = "horizontal",
                  axis.text.x = element_text(angle = 90,
                                             hjust = 1,
                                             vjust=0.5),#,size=12
                  plot.title = element_text(hjust = 0,size=18),
                  axis.title=element_text(size=14))+
      xlab(expression(paste("spiked-in 16S copies by design",sep="")))+
      ylab(expression(paste(log[2]," read counts",sep="")))+
      scale_x_continuous(breaks = pretty_breaks(n=5),labels = scientific_10)+
      guides(colour=guide_legend(title="stool dilution"))

plot(Fig1a_p2+ggtitle("a"))

## ----Figure1c------------------------------------------------------------
#Plot the adjusted read counts for all spike ins as function of the dilution:
Fig1b<-ggplot(droplevels(subset(mSpikes,Status=="adjusted")),
      aes(x=factor(Dilution,levels=rev(levels(Dilution))),y=log2(Reads)))+
      geom_boxplot(aes(fill=SpikeOTU))+
      theme_bw()+
      theme(text=element_text(size=13),
            panel.grid.major.x = element_blank(),
            legend.title = element_blank(),
            axis.ticks.x=element_blank(),
            plot.title = element_text(hjust = 0),
            legend.position = "bottom",
            legend.box = "horizontal")+
      xlab("stool dilution")+
      ylab(expression(paste(log[2]," read counts",sep="")))+
      scale_fill_manual(values=SpikeCols)+
      geom_vline(xintercept=c(1.5,2.5,3.5,4.5,5.5))

print(Fig1b+ggtitle("b"))

## ----CorrelationAnalysis-------------------------------------------------
adjusted.Spikes<-droplevels(subset(mSpikes,Status=="adjusted"))

AACor<-droplevels(subset(adjusted.Spikes,SpikeOTU=="A. acidiphilus"))
SRCor<-droplevels(subset(adjusted.Spikes,SpikeOTU=="S. ruber"))
RRCor<-droplevels(subset(adjusted.Spikes,SpikeOTU=="R. radiobacter"))

AAcorrTest<-cor.test(1/as.numeric(as.character(AACor$Dilution)),log2(AACor$Reads))
SRcorrTest<-cor.test(1/as.numeric(as.character(SRCor$Dilution)),log2(SRCor$Reads))
RRcorrTest<-cor.test(1/as.numeric(as.character(RRCor$Dilution)),log2(RRCor$Reads))

## ----CreateReadCountLongTableAllAproaches,eval=FALSE,include=FALSE-------
## #SCML:
## scml.m<-melt(as.matrix(scml.counts))
## scml.m$Approach<-"SCML"
## #Relative Abundance:
## relAb.m<-melt(as.matrix(relAbundance))
## relAb.m$Approach<-"relative abundance"
## #SCML combined:
## scml.combined.m<-melt(as.matrix(scml.counts))
## scml.combined.m$Approach<-"SCML combined"
## #QPCR:
## qRTpcr.m<-melt(as.matrix(qRTpcr.counts))
## qRTpcr.m$Approach<-"total16S"
## 
## mReads<-rbind(scml.m,relAb.m,scml.combined.m,qRTpcr.m)
## colnames(mReads)<-c("OTU_ID","SampleID","Counts","Approach")

## ----CreateAllPossibleCombinations---------------------------------------
#Retrieve all Combinations of Samples:
combinations<-combn(1:ncol(scml.OTUs.trim),2)
#Name Combinations:
comb.Names<-character()
for(i in seq(ncol(combinations))){
  comb.Names[i]<-paste(colnames(scml.OTUs.trim)[combinations[1,i]],
                                   "_vs_",colnames(scml.OTUs.trim)[combinations[2,i]],
                                   sep="")
}
colnames(combinations)<-comb.Names

## ----CalculateExpectedLog2Ratios-----------------------------------------
#Create dummies
expRatio_ali<-expRatio_rhiz<-numeric()
expRatio_Background<-numeric()
#Extract amount of background sample:
background<-scml.meta.trim$Background
#Extract design concentration of A. acidiphilus and R. radiobacter:
spikesExp<-designConc[c(1,3),]
rownames(spikesExp)<-c("A. acidiphilus","R. radiobacter")
#Calculate expected log 2 ratios for A. acidiphilus and R. radiobacter, 
#aswell as the expected ratio for the sample:
for(i in seq(ncol(combinations))){
  expRatio_ali[i]<-spikesExp[1,combinations[1,i]]/spikesExp[1,combinations[2,i]]
  expRatio_rhiz[i]<-spikesExp[2,combinations[1,i]]/spikesExp[2,combinations[2,i]]
  expRatio_Background[i]<-background[combinations[1,i]]/background[combinations[2,i]]
}

## ----CalculateObservedLog2Ratios-----------------------------------------
#Dummies for A. acidiphilus
obsRatio_Ali_relAb<-obsRatio_Ali_SCML<-numeric()
#Dummies for R. radiobacter
obsRatio_Rhiz_relAb<-obsRatio_Rhiz_SCML<-numeric()

#Relative abundance subsets for A. acidiphilus and R. radiobacter:
spikes_relAb<-relAbundance[c(AliPos,RhizPos),]
#Read subsets for A. acidiphilus and R. radiobacter:
spikes_SCML<-scml.counts[c(AliPos,RhizPos),]
rownames(spikes_relAb)<-rownames(spikes_SCML)<-c("Alicyclobacillus","Rhizobium")

#Calculate ratios for A. acidiphilus and R. radiobacter over all possible
#sample comparisons:
for(i in seq(ncol(combinations))){
  #Ratios based on relative abundances:
  obsRatio_Ali_relAb[i]<-
    spikes_relAb[1,combinations[1,i]]/spikes_relAb[1,combinations[2,i]]
  obsRatio_Rhiz_relAb[i]<-
    spikes_relAb[2,combinations[1,i]]/spikes_relAb[2,combinations[2,i]]
  #Ratios based on SCML:
  obsRatio_Ali_SCML[i]<-
    spikes_SCML[1,combinations[1,i]]/spikes_SCML[1,combinations[2,i]]
  obsRatio_Rhiz_SCML[i]<-
    spikes_SCML[2,combinations[1,i]]/spikes_SCML[2,combinations[2,i]]
}

#Build data.frame with all calculated ratios for A. acidiphilus:
ratiosAli<-data.frame(expRatio=expRatio_ali,
                      obsRatio_relAb=obsRatio_Ali_relAb,
                      obsRatio_SCML=obsRatio_Ali_SCML,
                      comparison=colnames(combinations))
ratiosAli$Spike<-"A. acidiphilus"

#Build data.frame with all calculated ratios for R. radiobacter:
ratiosRhiz<-data.frame(expRatio=expRatio_rhiz,
                       obsRatio_relAb=obsRatio_Rhiz_relAb,
                       obsRatio_SCML=obsRatio_Rhiz_SCML,
                       comparison=colnames(combinations))
ratiosRhiz$Spike<-"R. radiobacter"

#Combine the tables of A. acidiphilus and R. radiobacter:
spikeRatios<-rbind(ratiosAli,ratiosRhiz)

## ----Figure2a------------------------------------------------------------
Fig2a<-ggplot(spikeRatios,aes(x=log2(expRatio),y=log2(obsRatio_relAb),
                        colour=Spike,fill=Spike))+
          geom_point(position=position_jitterdodge(dodge.width=0.4,
                                                   jitter.width=0))+
          geom_abline(intercept = 0, slope = 1,col="purple")+
          theme_bw()+annotation_logticks(base=2)+
          ylab(expression(paste("observed ",log[2]," ratio",sep="")))+
          xlab(expression(paste("expected ",log[2]," ratio",sep="")))+
          xlim(c(-11,11))+ylim(c(-11,11))+
          theme(text=element_text(size=13),
                legend.position="bottom",
                plot.title = element_text(hjust = 0))+
          ggtitle("a")+scale_colour_manual(values=SpikeCols[1:2])+
          guides(colour=FALSE,fill=FALSE)+geom_smooth()
plot(Fig2a)

## ----Figure2b------------------------------------------------------------
Fig2b<-ggplot(spikeRatios,aes(x=log2(expRatio),y=log2(obsRatio_SCML),
                        colour=Spike,fill=Spike))+
          geom_point(position=position_jitterdodge(dodge.width=0.4,
                                                   jitter.width=0))+
          geom_abline(intercept = 0, slope = 1,col="purple")+
          theme_bw()+annotation_logticks(base=2)+
          ylab(expression(paste("observed ",log[2]," ratio",sep="")))+
          xlab(expression(paste("expected ",log[2]," ratio",sep="")))+
          xlim(c(-11,11))+ylim(c(-11,11))+
          theme(legend.position="bottom",
                text=element_text(size=13),
                plot.title = element_text(hjust = 0))+
          ggtitle("b")+scale_colour_manual(values=SpikeCols[1:2])+
          guides(colour=FALSE,fill=FALSE)+geom_smooth()
plot(Fig2b)

## ----ErrorSpikes---------------------------------------------------------
#calculate the error (exp vs obs):
errorSCML<-log2(spikeRatios$expRatio)-log2(spikeRatios$obsRatio_SCML)
errorRelAb<-log2(spikeRatios$expRatio)-log2(spikeRatios$obsRatio_relAb)
#Combine Data:
error.Data<-data.frame(SCML=errorSCML,relAb=errorRelAb)

#Convert to long format for ggplot usage:
meD<-melt(error.Data)
meD$Spike<-rep(as.character(spikeRatios$Spike),2)
colnames(meD)<-c("Method","Ratio","Spike")

## ----Figure2c_Part1------------------------------------------------------
#Subset for part 1 of Figure 2c:
meD.relAb<-droplevels(subset(meD,Method=="relAb"))
meD.relAb$Method<-factor(gsub("relAb","relative abundances",meD.relAb$Method))

Fig2c_1<-ggplot(meD.relAb,aes(x=factor(Method),y=Ratio))+
  geom_boxplot(aes(fill=Spike))+xlab("approach")+
  ylab(expression(paste(atop(paste("error",sep=""), "expected vs observed"))))+
  scale_fill_manual(labels=c("A.acidiphilus","R.radiobacter"),
                    values=SpikeCols[1:2])+
  theme_bw()+theme(text=element_text(size=13),
                   legend.position="bottom",
                   axis.text.x  = element_blank(),
                   axis.title.x = element_blank(),
                   axis.ticks.x = element_blank(),
                   legend.title = element_blank(),
                   legend.text=element_text(size=8),
                   plot.title = element_text(hjust = 0))+
  ggtitle("c")+ylim(c(-11,11))+facet_grid(Method~.,scales="free_x")
print(Fig2c_1)

## ----Figure2c_Part2------------------------------------------------------
#Subset for part 2 of Figure 2c:
meD.SCML<-droplevels(subset(meD,Method=="SCML"))

Fig2c_2<-ggplot(meD.SCML,aes(x=factor(Method),y=Ratio))+
  geom_boxplot(aes(fill=Spike))+xlab("approach")+
  ylab(expression(paste(atop(paste("error",sep=""), "expected vs observed"))))+
  scale_fill_manual(labels=c("A.acidiphilus","R.radiobacter"),
                    values=SpikeCols[1:2])+
  theme_bw()+theme(text=element_text(size=13),
                   legend.position="bottom",
                   axis.text.x  = element_blank(),
                   axis.title.x = element_blank(),
                   axis.ticks.x = element_blank(),
                   legend.title = element_blank(),
                   legend.text=element_text(size=8),
                   plot.title = element_text(hjust = 0))+
  ggtitle(" ")+ylim(c(-11,11))+facet_grid(Method~.,scales="free_x")+
  guides(fill=guide_legend(ncol=2,byrow=TRUE))
print(Fig2c_2)

## ----ObsRatios_Background------------------------------------------------
#SCML scaled reads (by Salini):
reads_SCML.salini<-scml.counts[-c(SalPos,RhizPos,AliPos),]
#Relative abundances:
reads_relAb<-relAbundance[-c(SalPos,RhizPos,AliPos),]
#SCML scaled reads (by all spike-ins):
reads_SCML.combined<-scml.counts.combined[-c(SalPos,RhizPos,AliPos),]
#Total 16S-rDNA normalized counts:
reads_Qpcr<-qRTpcr.counts[-c(SalPos,RhizPos,AliPos),]
#Number of OTUs
xx<-nrow(reads_SCML.salini)
#Dummies
back_ratio<-numeric()
comp<-character()
obs_SCML<-obs_RelAb<-obs_SCML.combined<-obs_QPCR<-list()
#Calculate ratios for each approach:
for(i in seq(ncol(combinations))){
  comp<-c(comp,rep(colnames(combinations)[i],xx))
  obs_SCML[[i]]<-reads_SCML.salini[,combinations[1,i]]/reads_SCML.salini[,combinations[2,i]]
  obs_RelAb[[i]]<-reads_relAb[,combinations[1,i]]/reads_relAb[,combinations[2,i]]
  obs_SCML.combined[[i]]<-reads_SCML.combined[,combinations[1,i]]/reads_SCML.combined[,combinations[2,i]]
  obs_QPCR[[i]]<-reads_Qpcr[,combinations[1,i]]/reads_Qpcr[,combinations[2,i]]
  back_ratio<-c(back_ratio,rep(background[combinations[1,i]]/background[combinations[2,i]],xx))
}
#Create name vector for the OTUs:
OTU_Names<-rep(rownames(reads_SCML.salini),ncol(combinations))
#Unlist observed ratios:
obs_SCML<-as.character(unlist(obs_SCML))
obs_RelAb<-as.character(unlist(obs_RelAb))
obs_SCML.combined<-as.character(unlist(obs_SCML.combined))
obs_QPCR<-as.character(unlist(obs_QPCR))

#Create data.frame containing all data:
df.BG<-data.frame(OTU=OTU_Names,
                              expRatio=back_ratio,
                              comparison=comp,
                              SCML=as.numeric(obs_SCML),
                              RelAb=as.numeric(obs_RelAb),
                              SCML.combined=as.numeric(obs_SCML.combined),
                              QPCR=as.numeric(obs_QPCR))

#Check correlation between log2 expected ratio and the observed ratios calculated by the 4 different methods
RelAb.corr<-cor.test(log2(df.BG$expRatio),log2(df.BG$RelAb)) #cor=0.3590464
QPCR.corr<-cor.test(log2(df.BG$expRatio),log2(df.BG$QPCR)) #cor=0.7166334 
SCML.corr<-cor.test(log2(df.BG$expRatio),log2(df.BG$SCML)) #cor=0.8334987
SCML.combined.corr<-cor.test(log2(df.BG$expRatio),log2(df.BG$SCML.combined)) #cor=0.8452189

## ----Modified_Binning_Function-------------------------------------------
hexbinIt<-function(x,y){
  ###################################
  #' @title bin the data points to hexagons 
  #'        and recalculate percentages instead of counts for each bin
  #' @param x The value to plot on the X-axis
  #' @param y The value to plot on the Y-axis
  #' @return data.frame containing the percentages, coordinates and cellIDs 
  #'         for each hexagonal bin (ggplot compatible input)
  require(hexbin)
  require(data.table)

  #Count levels of X:
  lx<-levels(factor(x))
  totalComps<-table(x)
  #Bin data into hexgons:
  hexdata<-hexbin(x,y,IDs=TRUE)
  #Create a data.table containing the mapping between the count and the cells:
  idMap<-data.table(count=hexdata@count,cellID=hexdata@cell)
  #Create a data.table containing the mapping between ratios and the cells:
  dt<-data.table(expRatio=x,obsRatio=y,cID=hexdata@cID)
  
  for (i in seq(length(idMap$cellID))){
    #actual cellID:
    cID<-idMap$cellID[i]
    
    #look for position of cellID in dt:
    idx<-which(dt$cID==cID)
    idMap$expR[i]<-expR<-dt$expRatio[idx][1]
    
    #Get total number of values for the level of expected ratio:
    cache<-totalComps[as.character(expR)]
    names(cache)<-NULL
    idMap$total[i]<-cache
    
    #Convert counts to percentage:
    hexdata@count[i]<-round(idMap$count[i]/cache*100,digits=2)
  }
  #Create final data.frame as a mapping between cellID, 
  #percentage for factor of x and the coordinates to plot the hexagons:
  hexdf <- data.frame (cID = hexdata@cell,percentage=hexdata@count,hcell2xy (hexdata)  )
  #Insert NA's for fields with percentage 0 to omit those:
  hexdf$percentage[hexdf$percentage == 0] <- NA
  #Return final ggplot compatible data.frame
  return(hexdf)
}

## ----Figure3a------------------------------------------------------------
#Bin the log2 ratios based on relative abundances:
hexFig3a<-hexbinIt(log2(df.BG$expRatio),log2(df.BG$RelAb))
#Create Figure 3a and omit bins with percentages below 0.05:
Fig3a<-ggplot(subset(hexFig3a,percentage>0.05),aes(x=x,y=y,fill=percentage))+
  geom_hex(stat="identity",colour="grey50")+
  theme_bw()+geom_abline(intercept = 0, slope = 1,col="purple")+
  annotation_logticks(base=2)+
  ylab(expression(paste("observed ",log[2]," ratio",sep="")))+
  xlab(expression(paste("expected ",log[2]," ratio",sep="")))+
  xlim(c(-5,5))+ylim(c(-10,10))+
  scale_fill_gradientn(colours=c("white","darkred"),
                       name="percentage of values\nper exp. ratio",
                       guide="colourbar",na.value="white",
                       limits=c(.05,55))+
  ggtitle("a")+theme(text=element_text(size=13),
                     legend.position="bottom",
                     axis.text.x  = element_blank(),
                     axis.title.x = element_blank(),
                     axis.ticks.x = element_blank(),
                     legend.text=element_text(size=8),
                     legend.key.width = unit(3, "line"),
                     legend.key.height = unit(1, "line"),
                     legend.title=element_text(size=8),
                     plot.title = element_text(hjust = 0))+geom_smooth()

print(Fig3a)

## ----Figure3b------------------------------------------------------------
#Bin the log2 ratios based on SCML:
hexFig3b<-hexbinIt(log2(df.BG$expRatio),log2(df.BG$SCML))
#Create Figure 3b and omit bins with percentages below 0.05:
Fig3b<-ggplot(subset(hexFig3b,percentage>0.05),aes(x=x,y=y,fill=percentage))+
  geom_hex(stat="identity",colour="grey50")+
  theme_bw()+geom_abline(intercept = 0, slope = 1,col="purple")+
  annotation_logticks(base=2)+
  ylab(expression(paste("observed ",log[2]," ratio",sep="")))+
  xlab(expression(paste("expected ",log[2]," ratio",sep="")))+
  xlim(c(-5,5))+ylim(c(-10,10))+
  scale_fill_gradientn(colours=c("white","darkred"),
                       name="percentage of values\nper exp. ratio",
                       guide="colourbar",
                       space = "Lab",
                       na.value="white",
                       limits=c(.05,55))+
  ggtitle("b")+theme(legend.position="bottom")+
                theme(text=element_text(size=13),
                      legend.text=element_text(size=8),
                      legend.key.width = unit(3, "line"),
                      legend.key.height = unit(1, "line"),
                      legend.title=element_text(size=8),
                      plot.title = element_text(hjust = 0))+geom_smooth()

print(Fig3b)

## ----ErrorBackground-----------------------------------------------------
#Calculate error between expected and observed log2 ratios:
error_SCML<-log2(df.BG$expRatio)-log2(df.BG$SCML)
error_RelAb<-log2(df.BG$expRatio)-log2(df.BG$RelAb)
error_SCML.combined<-log2(df.BG$expRatio)-log2(df.BG$SCML.combined)
error_QPCR<-log2(df.BG$expRatio)-log2(df.BG$QPCR)
#Create data.frame containing errors of all approaches:
errorData.BG<-data.frame(SCML.solo=error_SCML,
                         RelAb=error_RelAb,
                         SCML.combined=error_SCML.combined,
                         QPCR=error_QPCR)

#Melt data.frame to long format:
m.BG<-melt(errorData.BG)
#Rename for better plotting:
namesCache<-gsub("SCML.combined","SCML\n(combined)",m.BG$variable)
namesCache<-gsub("SCML.solo","SCML",namesCache)
namesCache<-gsub("QPCR","total 16S copies",namesCache)
namesCache<-gsub("RelAb","relative abundances",namesCache)
#use new nomenclature and sort levels:
m.BG$variable<-factor(namesCache,
                        levels=c("relative abundances",
                                 "total 16S copies",
                                 "SCML",
                                 "SCML\n(combined)"))

## ----Figure3c_Part1------------------------------------------------------
#Subset for relative abundances:
m.BG.c1<-droplevels(subset(m.BG,variable=="relative abundances"))

#Create Figure 3c part1:
Fig3c_1<-ggplot(m.BG.c1,aes(x=factor(variable),y=value))+
  geom_boxplot()+xlab("approach")+
  ylab(expression(paste(atop(paste("error",sep=""), "expected vs observed"))))+
  theme_bw()+ggtitle("c")+
  theme(text=element_text(size=13),
        legend.position="bottom",
        axis.text.x  = element_blank(),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.title = element_blank(),
        plot.title = element_text(hjust = 0))+
  facet_grid(variable~.,scales="free_x")+ylim(c(-10,10))

print(Fig3c_1)

## ----Figure3c_Part2------------------------------------------------------
#Subset for SCML:
m.BG.c2<-droplevels(subset(m.BG,variable=="SCML"))

#Create Figure 3c part2:
Fig3c_2<-ggplot(m.BG.c2,aes(x=factor(variable),y=value))+
  geom_boxplot()+xlab("approach")+
  ylab(expression(paste(atop(paste("error",sep=""), "expected vs observed"))))+
  theme_bw()+theme(text=element_text(size=13),
                   legend.position="bottom",
                   axis.text.x  = element_blank(),
                   axis.title.x = element_blank(),
                   axis.ticks.x = element_blank(),
                   legend.title = element_blank(),
                   plot.title = element_text(hjust = 0))+
  facet_grid(variable~.,scales="free_x")+ggtitle(" ")+ylim(c(-10,10))

print(Fig3c_2)

## ----Figure4a------------------------------------------------------------
Fig4a<-Fig3b

print(Fig4a)

## ----Figure4b------------------------------------------------------------
#Bin the log2 ratios based on counts normalized by total 16S copies:
hexFig4b<-hexbinIt(log2(df.BG$expRatio),log2(df.BG$QPCR))
#Create Figure 3b and omit bins with percentages below 0.05:
Fig4b<-ggplot(subset(hexFig4b,percentage>0.05),aes(x=x,y=y,fill=percentage))+
  geom_hex(stat="identity",colour="grey50")+
  theme_bw()+geom_abline(intercept = 0, slope = 1,col="purple")+
  annotation_logticks(base=2)+
  ylab(expression(paste("observed ",log[2]," ratio",sep="")))+
  xlab(expression(paste("expected ",log[2]," ratio",sep="")))+
  xlim(c(-5,5))+ylim(c(-10,10))+
  scale_fill_gradientn(colours=c("white","darkred"),
                       name="percentage of values per exp. ratio",
                       guide="colourbar",
                       space = "Lab",
                       na.value="white",
                       limits=c(.05,55))+
  ggtitle("b")+theme(text=element_text(size=13),
                     legend.position="bottom",
                     legend.text=element_text(size=8),
                     legend.key.width = unit(3, "line"),
                     legend.key.height = unit(1, "line"),
                     legend.title=element_text(size=8),
                     plot.title = element_text(hjust = 0))+geom_smooth()

print(Fig4b)

## ----Figure4c-------------------------------------------------------------
Fig4c<-ggplot(m.BG,aes(x=factor(variable),y=value))+
  geom_boxplot()+xlab("")+
  ylab(expression(paste(atop(paste("error",sep=""), "expected vs observed"))))+
  theme_bw()+theme(text=element_text(size=13),
                   legend.position="bottom",
                   axis.ticks.x = element_blank(),
                   legend.title = element_blank(),
                   plot.title = element_text(hjust = 0))

print(Fig4c)

## Calculate some statistics and summaries on the log2 differences between expected and observed for the different approaches
bgSummary<-ddply(m.BG,.(variable),function(x) summary(m.BG$value))
bgStats<-ddply(m.BG,.(variable),summarize,var=var(value),sd=sd(value),mean=mean(value),max=max(value))

## ----LoadASCTData--------------------------------------------------------
#read count table from QIIME output (converted to classic table from biom)
asct.wTax<-read.table("./OTU_Table_wTax_human.txt",sep="\t",
                      stringsAsFactors=F,header=TRUE,row.names=1)

#remove taxonomy
asct.OTUs<-asct.wTax[,-ncol(asct.wTax)]


#Import MetaData
asct.meta<-read.table("./Mapping_human.txt",sep="\t",header=TRUE)
#Bring OTU table in same order like meta data in respect to sample ID
ord<-match(asct.meta$SampleID,colnames(asct.OTUs))
asct.OTUs<-as.matrix(asct.OTUs[,ord])

## ----SpikePositions_asct-------------------------------------------------
#Search unique ID for S. Ruber:
SalPos_h<-grep("AF323500XXXX",rownames(asct.OTUs))
#Search unique ID for R. radiobacter:
RhizPos_h<-grep("AB247615XXXX",rownames(asct.OTUs))
#Search unique ID for A. acidiphilus:
AliPos_h<-grep("AB076660XXXX",rownames(asct.OTUs))

## ----SCML_Salini_asct----------------------------------------------------
##SCML (S. Ruber)
#Extract S. Ruber reads
Salini_asct<-asct.OTUs[SalPos_h,]

#Calculate size factor for later use:
sizeFactor_Salini_asct<-Salini_asct/mean(Salini_asct)

## ----CreateTaxonomyMapping_asct------------------------------------------
#Use taxonomy column from the OTU table in combination with the OTU ids:
y<-match(rownames(asct.OTUs),rownames(asct.wTax))
TaxaMap_asct<-data.frame(OTU=rownames(asct.OTUs),taxonomy=asct.wTax$taxonomy[y])

## ----BreakTaxonomyStringFunction-----------------------------------------
require(plyr)
#create column names for final table:
taxons<-c(paste("L",seq(7),sep=""))
#split the string by taxonomic levels:
tmp<-strsplit(as.character(TaxaMap_asct$taxonomy),split="; __")
#check the maximum column length and create a data frame:
tmpDF<-t(sapply(tmp, '[', seq(max(sapply(tmp, length)))))
#Subset only for the first 7 taxonomic levels:
tmpDF<-tmpDF[,seq(7)]
#Set row and column names
rownames(tmpDF)<-TaxaMap_asct$OTU
colnames(tmpDF)<-taxons
taxonomy_asct<-as.data.frame(tmpDF)

## ----EnterococcusIDX_asct------------------------------------------------
#Build a subset containing no spike reads:
OTUs_noSpikes<-asct.OTUs[-c(SalPos_h,RhizPos_h,AliPos_h),]
#Identify position of Enterococcus OTU:
ecIDX<-grep("JF896440",rownames(OTUs_noSpikes))
#Save the reads of this OTU for later use:
ecReads<-OTUs_noSpikes[ecIDX,]

## ----ConstructionPhylumReadsMatrix---------------------------------------
#Read Counts contributing to the chosen Enterococcus OTU 
#are excluded from the merge:
cache<-OTUs_noSpikes[-ecIDX,]
#Exclude all 4 excluded OTUs (3 spikes, 1 Enterococcus) 
#from the taxonomy:
taxonomy_Sub<-taxonomy_asct[rownames(cache),]
#Count and save different phyla:
Phylum<-taxonomy_Sub$L2
lvlPhyl<-levels(Phylum)
#Create dummy matrix:
PhylReads<-matrix(ncol=ncol(cache),nrow=length(lvlPhyl),0)
#Fill dummy matrix with read counts collapsed
#to phylum level:
for(i in seq(length(lvlPhyl))){
  idx<-which(Phylum==lvlPhyl[i])
  actOTUS<-rownames(taxonomy_Sub)[idx]
  readCache<-cache[which(rownames(cache)%in%actOTUS),]
  #Handle case were only one OTU contributes to current phylum:
  if(is.null(ncol(readCache))){
    PhylReads[i,]<-colSums(t(as.matrix(readCache)))
  }else{
    PhylReads[i,]<-colSums(readCache)
  }
}
#Set row and column names:
colnames(PhylReads)<-colnames(cache)
rownames(PhylReads)<-lvlPhyl
#Add an extra row for reads of Enterococcus:
PhylReads<-rbind(PhylReads,Enterococcus=ecReads)
#Only include Phyla which have more than 50 reads over 
#all samples:
PhylReads<-PhylReads[rowSums(PhylReads)>50,]

#Rename Firmicutes and Enterococcus to show that Enterococcus is shown separately:
rownames(PhylReads)<-gsub("Enterococcus","Genus: Enterococcus",rownames(PhylReads))
rownames(PhylReads)<-gsub("Firmicutes","Firmicutes w/o Enterococcus",rownames(PhylReads))

## ----ApplyBothApproaches-------------------------------------------------
#Calculate relative abundances:
standard_data<-sweep(PhylReads,2,colSums(PhylReads),'/') 
standard_data<-standard_data*100

#Apply SCML Salini only
SCML_data<-PhylReads
for(i in seq(ncol(PhylReads))){
  SCML_data[,i]<-SCML_data[,i]/sizeFactor_Salini_asct[i]
}

## ----LongTableFormat-----------------------------------------------------
require(reshape)

##Relative Abundance
m.asct_rel<-melt(standard_data)
colnames(m.asct_rel)<-c("Phylum","SampleID","RelativeAbundance")
x<-match(m.asct_rel$SampleID,asct.meta$SampleID)
#Add patient ID:
m.asct_rel$Patient<-asct.meta$Treatment[x]
#Add time point:
m.asct_rel$Time<-factor(asct.meta$Time[x],levels=c("preASCT","d0","d7","d14"))

##SCML
m.asct_scml<-melt(SCML_data)
colnames(m.asct_scml)<-c("Phylum","SampleID","CalibratedCounts")
x<-match(m.asct_scml$SampleID,asct.meta$SampleID)
#Add patient ID:
m.asct_scml$Patient<-asct.meta$Treatment[x]
#Add time point:
m.asct_scml$Time<-factor(asct.meta$Time[x],levels=c("preASCT","d0","d7","d14"))

## ----Figure_5a-----------------------------------------------------------
#Set colors for the bar plots:
asct_colors<-c("#e41a1c","#377eb8","#4daf4a","#984ea3","#ff7f00","#ffff33","#a65628")
#Create Figure 5a:
Figure_5a<-ggplot(data=m.asct_rel,aes(x=Time,y=RelativeAbundance,fill=Phylum))+
  geom_bar(stat="identity")+theme_bw()+
  theme(text=element_text(size=11),
        axis.text.x = element_text(angle = 90, hjust = 1,vjust=.5),
        plot.title = element_text(hjust = 0),
        legend.position = "bottom",
        legend.text=element_text(size=10))+
  facet_wrap(~Patient,ncol = 6,scales = "free_x")+
  ylab("relative abundance")+
  scale_fill_manual(values=asct_colors)+
  ggtitle("a")+
  guides(fill=guide_legend(nrow=2,byrow=TRUE,title=""))

print(Figure_5a)

## ----Figure_5b-----------------------------------------------------------
#Create Figure 5b:
Figure_5b<-ggplot(data=m.asct_scml,aes(x=Time,y=CalibratedCounts,fill=Phylum))+
  geom_bar(stat="identity")+theme_bw()+
  theme(text=element_text(size=11),
        axis.text.x = element_text(angle = 90, hjust = 1,vjust=.5),
        plot.title = element_text(hjust = 0),
        legend.position = "bottom",
        legend.text=element_text(size=10),
        axis.text.y=element_blank())+
  facet_wrap(~Patient,ncol = 6,scales = "free_x")+
  ylab("microbial abundance (scaled)")+
  scale_fill_manual(values=asct_colors)+
  ggtitle("b")+guides(fill=guide_legend(nrow=2,byrow=TRUE,title=""))+
  scale_y_continuous(breaks=NULL)

print(Figure_5b)

## ----SubsettingEnterococcusData------------------------------------------
patientsECpositive<-c("Patient2","Patient4","Patient5")
#Subset the standard data for relative abundances of Enterococcus:
mEC<-subset(m.asct_rel,Phylum=="Genus: Enterococcus")
#Add the calibrated read counts for Enterococcus from the SCML data:
mEC$CalibratedCounts<-subset(m.asct_scml,
                             Phylum=="Genus: Enterococcus")$CalibratedCounts
#Subset only for Enterococcus positive Patients:
mEC<-subset(mEC,Patient%in%patientsECpositive)

## ----CalculateLog2Ratios_Enterococcus------------------------------------
#Calculate log2ratios between the last and first captured time point
#for Patient 2, 4 and 5 based on standard data:
log2ratio_relative<-c(log2(mEC$RelativeAbundance[4]/mEC$RelativeAbundance[1]),
                      log2(mEC$RelativeAbundance[7]/mEC$RelativeAbundance[5]),
                      log2(mEC$RelativeAbundance[10]/mEC$RelativeAbundance[8]))
#Calculate log2ratios between the last and first captured time point
#for Patient 2, 4 and 5 based on SCML data:
log2ratio_scml<-c(log2(mEC$CalibratedCounts[4]/mEC$CalibratedCounts[1]),
                      log2(mEC$CalibratedCounts[7]/mEC$CalibratedCounts[5]),
                      log2(mEC$CalibratedCounts[10]/mEC$CalibratedCounts[8]))

#Create patient vector:
patientVec<-mEC$Patient[c(1,7,10)]
#Create comparison vector:
comparison<-c(paste0(mEC$Time[4],"_vs_",mEC$Time[1]),
              paste0(mEC$Time[7],"_vs_",mEC$Time[5]),
              paste0(mEC$Time[10],"_vs_",mEC$Time[8]))

#Combine to one Table:
log2.ratio<-data.frame(Patient=patientVec,
                       Comparison=comparison,
                       RelativeAbundance=log2ratio_relative,
                       SCML=log2ratio_scml)

m.log2.ratio<-melt(log2.ratio)
colnames(m.log2.ratio)[3:4]<-c("Method","log2ratio")
methodVec<-gsub("RelativeAbundance","relative abundance",m.log2.ratio$Method)
m.log2.ratio$Method<-factor(methodVec,levels=c("relative abundance","SCML"))

## ----Figure_5c-----------------------------------------------------------
Figure_5c<-ggplot(m.log2.ratio,aes(x=Patient,y=log2ratio,fill=Method))+
  geom_bar(stat="identity",position=position_dodge())+
  theme_bw()+xlab("")+
  ylab(expression(paste("Enterococcus ", 
            log[2]," ratio between last and first captured timepoint",
            sep="")))+
  theme(text=element_text(size=13),
        axis.text.x=element_text(angle=90, vjust=0.5, size=10),
        plot.title = element_text(hjust = 0),
        legend.position="bottom",
        legend.text=element_text(size=8))+
  ggtitle("c")+scale_fill_manual(values=c("#bdbdbd", "#636363"),
                                 name="",breaks=c("relative abundance","SCML"))+
  guides(fill=guide_legend(nrow=2,byrow=TRUE))

print(Figure_5c)

## ----SessionInfo---------------------------------------------------------
sessionInfo()