#! /usr/bin/Rscript
# Differential-expression analysis of small-RNA read counts with edgeR.
#
# Input : per-sequence counts for eight samples (count1..count8), grouped on
#         the first 20 bases of the sequence id. Read either from the
#         PostgreSQL database "smallrna" (USEDB = TRUE) or from the cached
#         file "counts.txt" (USEDB = FALSE).
# Output: MDS plots, a smear plot, the filtered count table, and the top 100
#         tags for the "morph" and "time" contrasts as CSV files (and as
#         database tables when USEDB = TRUE).

library(edgeR) # the package

USEDB <- FALSE

library(reshape)

if (USEDB) {
  # library() stops immediately when the package is missing, whereas
  # require() would only return FALSE and fail later at dbDriver().
  library(RPostgreSQL)
  drv <- dbDriver("PostgreSQL")
  con <- dbConnect(drv, dbname = "smallrna")

  counts <- dbGetQuery(con, "
    SELECT
      substr(id,1,20) AS first20bases,
      sum(count1) AS count1,
      sum(count2) AS count2,
      sum(count3) AS count3,
      sum(count4) AS count4,
      sum(count5) AS count5,
      sum(count6) AS count6,
      sum(count7) AS count7,
      sum(count8) AS count8
    FROM
      seqhomomircount
    GROUP BY
      first20bases
    ORDER BY
      first20bases;")

  # max_sw_score <- dbGetQuery(con,"
  # SELECT
  #   substr(id,1,20) as first20bases,
  #   max(bitscore)
  # FROM
  #   mirbasesw
  # GROUP BY
  #   first20bases;
  # ")

  # Write data frame `x` to table `name` over connection `conn`, replacing
  # any existing table of that name. Returns the result of dbWriteTable().
  writeToDataBase <- function(x, name, conn) {
    if (dbExistsTable(conn, name)) {
      dbRemoveTable(conn, name)
    }
    dbWriteTable(conn, name, x)
  }

  # READ in data
  # counts <- read.table("homology_counts_grouped_on_20_firstbases.txt",header=T,row.names=1)

  # Cache the query result so later runs can use USEDB = FALSE.
  write.table(counts, "counts.txt")
} else {
  counts <- read.table("counts.txt")
}

# NOTE(review): the original comment states that the entries here are
# sequences with more than 20 occurrences; that filter is not visible in this
# script, so confirm it against the upstream table.
rownames(counts) <- counts$first20bases

# Long-format copy of the counts (one row per sequence/sample pair).
counts_long_format <- melt(counts, id.vars = "first20bases")

# Keep only the eight count columns and rename them s1..s8.
counts <- counts[, paste0("count", 1:8)]
colnames(counts) <- paste0("s", 1:8)

# Keep tags whose total count over all samples exceeds 80.
more_than_80 <- DGEList(counts[rowSums(counts) > 80, ])
pdf("MDS_more_than_80.pdf")
plotMDS(more_than_80)
dev.off()

# Keep tags with more than 3 counts per million in more than 2 samples.
# NOTE(review): the name "exp2" may have meant "at least 2 samples"; the
# condition below requires at least 3 - confirm the intended cut-off.
counts_cpm3 <- counts[rowSums(cpm(counts) > 3) > 2, ]
cpm3_exp2 <- DGEList(counts_cpm3)
pdf("cpm3_exp2.pdf") # This looks better based on the multidimensional scaling
plotMDS(cpm3_exp2)
dev.off()

# cpm3_exp2 is already a DGEList built from counts_cpm3; reuse it rather than
# wrapping it in DGEList() a second time.
da <- cpm3_exp2
cat("Number of tags used", nrow(da), "\n")
write.table(counts_cpm3, "counts_cpm3.txt", quote = FALSE, col.names = FALSE)

da <- calcNormFactors(da)

# Design matrix: samples 1-4 and 5-8 are time points 1..4 within morph 0 and
# morph 1 respectively (additive time + morph model).
design <- model.matrix(~ factor(rep(1:4, 2)) + factor(rep(0:1, each = 4)))
colnames(design) <- c("mu", paste0("T", 2:4), "morph") # Name the columns

# Estimate dispersion
da <- estimateGLMCommonDisp(da, design, method = "deviance",
                            robust = TRUE, subset = NULL)
da <- estimateGLMTrendedDisp(da, design)
da <- estimateGLMTagwiseDisp(da, design)

# Fitting
fit <- glmFit(da, design)

# Testing
# glmLRT() tests the last design column by default, i.e. "morph".
lrtM <- glmLRT(fit) # Drop the morph term
lrtT <- glmLRT(fit, coef = 2:4) # Drop the time term

pdf("smear_plot_morph.pdf")
plotSmear(lrtM)
dev.off()

# Top 100 tags per contrast, computed once and reused for CSV and database.
top_morph <- topTags(lrtM, n = 100)
top_time <- topTags(lrtT, n = 100)

write.csv(top_morph, file = "first20bases_morphDEC.csv")
write.csv(top_time, file = "first20bases_timeDEC.csv")

if (USEDB) {
  writeToDataBase(as.data.frame(top_morph), "morphdec", con)
  writeToDataBase(as.data.frame(top_time), "timedec", con)
  # Release the database connection opened at the top of the script.
  dbDisconnect(con)
}