# Chavan R. et al 2015
# This script is a simplified version of the one used to estimate differentially expressed genes from the RNA-Seq experiment.
# The analysis was performed in RStudio using Version 0.98.1083
# R version 3.1.1 (2014-07-10)
# Platform: x86_64-apple-darwin13.1.0 (64-bit)

########################################################
#
# Packages and Version information
#
########################################################

library("DESeq2") #Version 1.6.2
library("vsn") #Version 3.34.0
library("RColorBrewer") #Version 1.1-2
library("gplots") #Version 2.16.0
library("genefilter") #Version 1.48.1
library("Biobase") #Version 2.26.0
library("limma") #Version 3.22.4
library("car") #Version 2.0-24

########################################################
#
# Read files from HTseq-count output
#
########################################################

# Directory holding the htseq-count output files (one file per sample).
directory <- "../Samples"
sampleFiles <- list.files(path = directory)

# The condition vector below is hard-coded for exactly six samples: the first
# three files returned by list.files() are labelled C1 and the last three C2.
# Fail loudly on any other file count: data.frame() would otherwise recycle
# the shorter column when the lengths divide evenly and mislabel the samples.
stopifnot(length(sampleFiles) == 6L)

# "C2" is listed first in `levels`, which makes it the first (reference)
# level of the factor.
sampleCondition <- factor(c(rep("C1", 3), rep("C2", 3)),
                          levels = c("C2", "C1"))

# The file name doubles as the sample name.
sampleTable <- data.frame(sampleName = sampleFiles,
                          fileName = sampleFiles,
                          condition = sampleCondition)

# Single-factor design: expression modelled as a function of condition.
des <- formula(~ condition)
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
                                       directory = directory,
                                       design = des)

# Normalise the samples (size factors) and, on the normalised object,
# estimate the biological variance (dispersions).
ddsHTSeq <- estimateDispersions(estimateSizeFactors(ddsHTSeq))

# First look at the dispersion estimates
plotDispEsts(ddsHTSeq, ylim = c(1e-6, 10))

# Test for differential expression between the conditions (Wald test)
ddsHTSeq <- nbinomWaldTest(ddsHTSeq)

# Results for p-adjusted value
# Print the names of the available result sets, extract the results table and
# sort it by adjusted p-value (order() places NA values last).
resultsNames(ddsHTSeq)
res <- results(ddsHTSeq)
res <- res[order(res$padj), ]
head(res)

# MA plot, with horizontal reference lines drawn at y = -1.5 and y = 1.5
plotMA(ddsHTSeq, ylim = c(-3, 3),
       main = "Differential expression analysis between condictions")
abline(h = c(-1.5, 1.5), col = "dodgerblue", lwd = 2)

# Number of genes with an adjusted p-value below 0.05 (NA values are ignored)
sum(res$padj < .05, na.rm = TRUE)
#mcols(res, use.names=TRUE)

# which() drops the rows whose adjusted p-value is NA.
resSig <- res[which(res$padj < .05), ]

# Written to the working directory. The previous target ".../" referred to a
# directory literally named "...", which made write.table() fail unless such
# a directory existed. The file is tab-separated despite its .csv extension.
# NOTE(review): the file name spelling ("Sifnificant") is kept unchanged in
# case something else reads this file by name -- confirm before renaming.
write.table(resSig, "./Sifnificant_genes_C1_C2.csv",
            col.names = NA, quote = FALSE, sep = "\t")

########################################################
#
# Estimation of the influence of a data point
#
########################################################

# Plot the maximum Cook's distance of each gene against the rank of its test
# statistic, to spot genes whose result is driven by a single sample.
#
# `res` has been re-ordered by adjusted p-value earlier in this script, while
# the "cooks" assay is still in the row order of ddsHTSeq. Bring the statistic
# back into the dataset's row order so that W[i] and maxCooks[i] describe the
# same gene; pairing them positionally would mix up genes.
W <- res$stat[match(rownames(ddsHTSeq), rownames(res))]

# Largest Cook's distance over all samples, per gene (row).
maxCooks <- apply(assays(ddsHTSeq)[["cooks"]], 1, max)

# Genes without a test statistic are left out of the plot.
idx <- !is.na(W)
plot(rank(W[idx]), maxCooks[idx], xlab = "Rank W.S",
     ylab = "Distance per gene",
     ylim = c(0, 5), cex = .4, col = rgb(0, 0, 0, .3))

# Horizontal reference line at a distance of 1
abline(h = 1, col = "red")

########################################################
#
# Standard deviation across all samples
#
########################################################

# Transformed versions of the count data used by the plots below. These
# objects were referenced but never created, so the plots could not run.
rld <- rlog(ddsHTSeq)                              # regularized log
vsd <- varianceStabilizingTransformation(ddsHTSeq) # variance stabilizing

# Keep only genes with at least one read across all samples.
notAllZero <- (rowSums(counts(ddsHTSeq)) > 0)

# Per-gene standard deviation against the mean for three transformations:
# shifted log2 of the normalized counts, rlog, and VST.
meanSdPlot(log2(counts(ddsHTSeq, normalized = TRUE)[notAllZero, ] + 1),
           ylim = c(0, 2.5))
meanSdPlot(assay(rld[notAllZero, ]), ylim = c(0, 2.5))
meanSdPlot(assay(vsd[notAllZero, ]), ylim = c(0, 2.5))

########################################################
#
# Plot distance of samples
#
########################################################

# dist() works on rows, so transpose to get sample-to-sample distances on the
# rlog-transformed values.
distance <- dist(t(assay(rld)))
Dist_perf <- as.matrix(distance)

# Label rows and columns as "<condition> : <file name>".
rownames(Dist_perf) <- colnames(Dist_perf) <-
  with(colData(ddsHTSeq), paste(condition, sampleFiles, sep = " : "))

# 100-step colour ramp for the heatmap; it is passed reversed below.
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
heatmap.2(Dist_perf, trace = "none", col = rev(hmcol), margins = c(16, 16))

# PCA of the samples, grouped by condition
print(plotPCA(rld, intgroup = c("condition")))

########################################################
#
# Heatmap produced from the analysis
#
########################################################

# Heatmap of the two conditions of interest.
# Both input files are read as tab-separated tables with a header row,
# despite the .csv extension.
d1 <- read.csv("C1_Paj.csv", header = TRUE, sep = "\t")
d2 <- read.csv("C2_Paj.csv", header = TRUE, sep = "\t")

# Merge data: keep every identifier found in either file (all = TRUE); an
# identifier missing from one file gets NA in that file's column.
data <- merge(d1, d2, by = "X", incomparables = NA, all = TRUE)
rn <- data[, 1]
# Assumes each input file has exactly two columns (identifier "X" plus one
# value column), so the merged table has three.
colnames(data) <- c("Gene", "C1", "C2")

# Build the numeric matrix from the two value columns only. Converting the
# whole data frame would drag the Gene column along: where read.csv() returns
# strings as character (the default since R 4.0.0) the result would be a
# character matrix, which heatmap.2() rejects. Columns that are already
# numeric are passed through untouched; factor/character columns are parsed.
valueCols <- lapply(data[, c("C1", "C2")], function(x) {
  if (is.numeric(x)) x else as.numeric(as.character(x))
})
data2 <- do.call(cbind, valueCols)
rownames(data2) <- as.character(data[, 1])
write.table(data2, "./order.csv", sep = "\t")

# Rows and columns are drawn in table order (no clustering/dendrograms);
# values are scaled per column.
hm <- heatmap.2(data2, scale = "col", Rowv = FALSE, Colv = FALSE,
                symkey = FALSE, margins = c(8, 8), cexRow = 0.7, cexCol = 1.0,
                key = TRUE, keysize = 1.5, trace = "none",
                density.info = c("density"), tracecol = "blue",
                col = redgreen(100), main = "C1 vs\nC2")

# Example of the selection: heatmap of a fixed slice of the merged table
# (`data`: identifier in column 1, the two value columns in columns 2 and 3).
selRows <- 1350:1370

# Indexing past the end of a data frame yields all-NA rows rather than an
# error, so check the hard-coded range explicitly.
stopifnot(nrow(data) >= max(selRows))

slist <- data[selRows, c(2, 3)]

# Numeric columns are used as they are; factor/character columns are parsed
# to numbers. cbind() always returns a matrix, as heatmap.2() requires.
slist2 <- do.call(cbind, lapply(slist, function(x) {
  if (is.numeric(x)) x else as.numeric(as.character(x))
}))
rownames(slist2) <- as.character(data[selRows, 1])

# Rows and columns are drawn in table order (no clustering/dendrograms);
# values are scaled per column.
hm <- heatmap.2(slist2, scale = "col", Rowv = FALSE, Colv = FALSE,
                symkey = FALSE, margins = c(8, 8), cexRow = 0.7, cexCol = 1.0,
                key = TRUE, keysize = 1.5, trace = "none",
                density.info = c("density"), tracecol = "blue",
                col = redgreen(100), main = "FoldChange C1 vs. C2")