#! /usr/bin/Rscript
library(edgeR) #the package
# Data-source switch.
#   TRUE : query the counts from the "smallrna" PostgreSQL database, cache
#          them in counts.txt, and write the top tags back to the database at
#          the end of the script.
#   FALSE: read the previously cached counts.txt instead.
USEDB <- FALSE 
library(reshape)

# Load the per-sequence read counts into `counts`.
#   USEDB == TRUE : aggregate them from the "smallrna" PostgreSQL database
#                   (summed per first 20 characters of `id`) and cache the
#                   result in counts.txt.
#   USEDB == FALSE: read the cached counts.txt.
if (USEDB) {
    # library() stops immediately when the package is missing; require() would
    # only return FALSE and let the failure surface later at dbDriver().
    library("RPostgreSQL")
    drv <- dbDriver("PostgreSQL")
    con <- dbConnect(drv, dbname = "smallrna")

    counts <- dbGetQuery(con,"
                         SELECT 
                            substr(id,1,20) AS first20bases,
                            sum(count1) AS count1, 
                            sum(count2) AS count2, 
                            sum(count3) AS count3, 
                            sum(count4) AS count4, 
                            sum(count5) AS count5, 
                            sum(count6) AS count6, 
                            sum(count7) AS count7, 
                            sum(count8) AS count8
                        FROM 
                            seqhomomircount
                        GROUP BY 
                            first20bases
                        ORDER BY 
                            first20bases;")

    # Disabled query, kept for reference:
    # max_sw_score  <- dbGetQuery(con,"
    #                             SELECT
    #                                 substr(id,1,20) as first20bases,
    #                                 max(bitscore)
    #                             FROM
    #                                 mirbasesw
    #                             GROUP BY
    #                                 first20bases;
    #                             ")

    # Write data frame `x` to table `name` on connection `conn`, replacing the
    # table if it already exists. Returns the value of dbWriteTable().
    writeToDataBase <- function(x, name, conn) {
        if (dbExistsTable(conn, name)) {
            dbRemoveTable(conn, name)
        }
        dbWriteTable(conn, name, x)
    }

    # Alternative input that was used previously:
    # counts <- read.table("homology_counts_grouped_on_20_firstbases.txt",header=T,row.names=1)

    # Cache the query result so later runs can skip the database.
    write.table(counts, "counts.txt")
} else {
    # header/stringsAsFactors are spelled out so the cached table comes back
    # with the same column names and column types as the database result,
    # independent of header auto-detection and of the R version's default.
    counts <- read.table("counts.txt", header = TRUE, stringsAsFactors = FALSE)
}

# Original note: the entries here are sequences that have more than 20
# occurrences (that filtering is not done in this script -- presumably it
# happens upstream; verify against the data source).
#
# Use the sequence key as row names, keep a long-format copy of the full
# table, then reduce `counts` to the eight sample columns renamed s1..s8.
rownames(counts) <- counts$first20bases
counts_long_format <- melt(counts, id.vars = "first20bases")
counts <- counts[, paste0("count", 1:8)]
colnames(counts) <- paste0("s", 1:8)


# Keep only the tags whose read count summed over all samples exceeds 80 and
# save an MDS plot of the samples based on those tags.
more_than_80 <- DGEList(counts = counts[rowSums(counts) > 80, ])
pdf(file = "MDS_more_than_80.pdf")
plotMDS(more_than_80)
dev.off()

# Alternative filter: keep tags with more than 3 counts per million in more
# than 2 samples. Original note: this looks better based on the
# multidimensional scaling.
counts_cpm3 <- counts[rowSums(cpm(counts) > 3) > 2, ]
cpm3_exp2 <- DGEList(counts = counts_cpm3)
pdf(file = "cpm3_exp2.pdf")
plotMDS(cpm3_exp2)
dev.off()


# Build the analysis object straight from the CPM-filtered count table
# instead of re-wrapping the existing DGEList, which only worked through an
# implicit as.matrix() coercion of that object.
da <- DGEList(counts = counts_cpm3)

# Report how many tags go into the analysis and save the filtered counts
# (row names are written, the header line is not).
cat("Number of tags used",nrow(da),"\n")
write.table(counts_cpm3, file = "counts_cpm3.txt", quote = FALSE,
            col.names = FALSE)

# Compute the normalisation factors for the samples.
da <- calcNormFactors(da)
# Design matrix for the 8 samples: intercept, a 4-level factor cycling 1..4
# over the samples (the time term, columns T2..T4) and a 2-level factor that
# separates samples 1-4 from samples 5-8 (column morph).
design <- model.matrix(~ factor(rep(1:4, 2)) + factor(rep(0:1, each = 4)))
colnames(design) <- c("mu", paste0("T", 2:4), "morph")

# Estimate the dispersions in three steps (common, trended, tagwise); every
# step takes the previous result and the design matrix and reassigns `da`.
da <- estimateGLMCommonDisp(da, design = design, method = "deviance",
                            robust = TRUE, subset = NULL)
da <- estimateGLMTrendedDisp(da, design = design)
da <- estimateGLMTagwiseDisp(da, design = design)


# Fit the GLM with the design matrix defined above.
fit <- glmFit(da, design = design)

# Likelihood-ratio tests: each one drops the named coefficient(s).
lrtM <- glmLRT(fit, coef = ncol(design))  # morph term (last design column)
lrtT <- glmLRT(fit, coef = 2:4)           # time term (columns T2..T4)

# Smear plot for the morph test.
pdf(file = "smear_plot_morph.pdf")
plotSmear(lrtM)
dev.off()

# Save the 100 top-ranked tags of each test as CSV.
write.csv(topTags(lrtM, n = 100), file = "first20bases_morphDEC.csv")
write.csv(topTags(lrtT, n = 100), file = "first20bases_timeDEC.csv")

# When the database is in use, store the 100 top-ranked tags of each test in
# the tables "morphdec" and "timedec" (existing tables are replaced), then
# close the connection, which the script otherwise leaves open.
if (USEDB) {
    writeToDataBase(as.data.frame(topTags(lrtM, 100)), "morphdec", con)
    writeToDataBase(as.data.frame(topTags(lrtT, 100)), "timedec", con)
    dbDisconnect(con)
}