# plotting SOLiD reads quality
# read quality value distribution for SOLiD sequencing reads
# author : Yann Audic
# 2015
# not meant to be run as is

############# SOLiD quality value ##############
# The QV value is calculated using a phred like score
# q = -10 × log10 (p)
# where q is the quality value and (p) is the
# predicted probability that the color call is incorrect
################################################

# library required
library("bigmemory")

# DIRECTORY CLIPseq
INDIR <- "/private/staff/c/ic/yaudic/Documents/SCIENCE/CLIPCELF1/CLIPSEQ_input"
subdir <- c(
  "AIA0BOLS_reads", "AIA0BOLS_20091023_FC1", "AIA0BOLS_20091023_FC2",
  "AIA0AOLS_reads", "AIA0AOLS_20091023_FC1", "AIA0AOLS_20091023_FC2"
)

# defining the files required
allfiles <- dir(file.path(INDIR, subdir), full.names = TRUE)
# select quality files (the dot is escaped so that only a literal
# "QV.qual" suffix matches)
qual_files <- allfiles[grepl("QV\\.qual$", allfiles)]
# RNASEQ qual files
RNAseq <- "/private/staff/c/ic/yaudic/Documents/SCIENCE/CLIPCELF1/RNAseq_input/NICKEL_20100706_FC2/NICKEL_20100706_FC2_AIA_COLS1_F3_QV.qual"
qual_files <- c(qual_files, RNAseq)

## function definition
## convert_comment
# Remove every line containing '>' or '#' from a quality file so that the
# remaining lines can be read as a plain table.
#
# The result is written next to the input as "<infile>.trimmed.qual".
# Replacing '>' by '#' and then deleting all lines containing '#' keeps
# exactly the lines that contain neither character, so a single sed pass is
# used here and no transitory file is needed.
#
# infile : path of the quality file to convert (a single string)
# value  : the full path of the trimmed file ("<infile>.trimmed.qual")
# errors : stops if infile does not exist or if sed reports a failure
convert_comment <- function(infile) {
  stopifnot(is.character(infile), length(infile) == 1L, !is.na(infile))
  if (!file.exists(infile)) {
    stop("quality file not found: ", infile, call. = FALSE)
  }

  outfile <- paste0(infile, ".trimmed.qual")

  # Paths are quoted so that spaces or shell metacharacters cannot break (or
  # hijack) the command; path.expand() keeps '~' working despite the quoting.
  status <- system2(
    "sed",
    args = c("-e", shQuote("/[>#]/d"), shQuote(path.expand(infile))),
    stdout = path.expand(outfile)
  )
  if (!identical(as.integer(status), 0L)) {
    stop("sed failed (exit status ", status, ") on ", infile, call. = FALSE)
  }

  outfile
}
# usage: value <- convert_comment(infile)
## end convert_comment

## big.bxplot.stats
# function to compute
# boxplot stats from a bigfile
# we can calculate the boxplot stats for each column with
# boxplot.stats(x, coef = 1.5, do.conf = TRUE, do.out = TRUE)
# to plot a boxplot we need the stats and the n
#
# data_bxp : matrix-like object supporting dim() and [, i] column extraction
#            (the analysis below passes a big.matrix)
# value    : list(stats = 5 x ncol matrix of boxplot statistics,
#                 n     = number of non-NA values per column),
#            i.e. the values required to generate a boxplot with bxp()
big.bxplot.stats <- function(data_bxp) {
  # number of columns for boxplot
  num_col <- dim(data_bxp)[2]

  # preallocated results, one column / element per input column
  stats.mat <- matrix(nrow = 5, ncol = num_col)
  number <- integer(num_col)

  for (i in seq_len(num_col)) {
    # The column is taken from the argument, not from a global object.
    # Confidence intervals and outliers are not returned, so they are not
    # computed; 'stats' and 'n' do not depend on them.
    col_stats <- boxplot.stats(
      data_bxp[, i],
      coef = 1.5, do.conf = FALSE, do.out = FALSE
    )
    stats.mat[, i] <- col_stats$stats
    number[i] <- col_stats$n
  }

  # we return only the values required to generate boxplot with bxp
  list(stats = stats.mat, n = number)
}
## end big.bxplot.stats

############ ANALYSIS ###########
# file backing is required for such a big file
backing_dir <- "/private/staff/c/ic/yaudic/temp"
qv_col_names <- paste("N", c(1:50), sep = " ")

# No matrix is read before the loop: it would need a ".trimmed.qual" file
# that only exists after convert_comment() has run, and the loop would
# discard it immediately anyway.
statistics <- vector("list", length(qual_files))

# one entry per quality file
for (i in seq_along(qual_files)) {
  # drop the previous matrix (if any) before mapping the next one
  if (exists("w", inherits = FALSE)) {
    rm(w)
  }
  name <- convert_comment(qual_files[i]) # take some time for big files (7.5 Go)
  # name=paste(qual_files[1],".trimmed.qual",sep="")
  w <- read.big.matrix(
    name,
    col.names = qv_col_names,
    sep = " ",
    type = "integer",
    backingfile = paste("W", i, sep = "_"),
    backingpath = backing_dir
  ) # take some time also
  statistics[[i]] <- big.bxplot.stats(w)
}

# create boxplot
# OUTDIR is not defined anywhere in this script; if the session does not
# provide it, fall back to the working directory instead of failing here.
if (!exists("OUTDIR")) {
  OUTDIR <- getwd()
  message("OUTDIR is not defined, writing plots to ", OUTDIR)
}
for (i in seq_along(statistics)) {
  svg(filename = file.path(OUTDIR, paste(i, "quality.svg", sep = "_")))
  bxp(statistics[[i]])
  dev.off()
}
######### END ANALYSIS #########

######### ENVIRONMENT ##########
#R version 3.2.0 (2015-04-16)
#Platform: x86_64-suse-linux-gnu (64-bit)
#Running under: openSUSE 13.1 (Bottle) (x86_64)
#
#locale:
# [1] LC_CTYPE=fr_FR.UTF-8       LC_NUMERIC=C
# [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=fr_FR.UTF-8
# [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=fr_FR.UTF-8
# [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C
# [9] LC_ADDRESS=C               LC_TELEPHONE=C
#[11] LC_MEASUREMENT=fr_FR.UTF-8
#     LC_IDENTIFICATION=C
#
#attached base packages:
#[1] stats     graphics  grDevices utils     datasets  methods   base
#
#other attached packages:
#[1] bigmemory_4.5.8     bigmemory.sri_0.1.3
#
#loaded via a namespace (and not attached):
#[1] Rcpp_0.12.1