# Import all the gene expression data.
# These are output files from the seqmonk annotated probe report - log
# transformed sum of reads in CDS, normalised to largest datastore and length
# corrected.
# Note that the number of numeric columns on the end varies with the datafile.

# Read one annotated probe report (tab-delimited, with a header row).
#   path      - file to read
#   n_numeric - number of numeric columns at the end of this particular file
# All reports share the same 11 leading columns (2 character, 2 numeric,
# 7 character); only the count of trailing numeric columns differs, so the
# colClasses vector is built here instead of being typed out for every file.
# Returns the data frame produced by read.delim.
ReadSeqmonkReport <- function(path, n_numeric) {
  col_classes <- c(
    "character", "character", "numeric", "numeric",
    rep("character", 7),
    rep("numeric", n_numeric)
  )
  read.delim(path, header = TRUE, colClasses = col_classes)
}

RNA_Glyall     <- ReadSeqmonkReport("Rong Mullins YPGly CDS.txt", 4)
RNA_Selall     <- ReadSeqmonkReport("Arigo selenite CDs.txt", 3)
RNA_Meiall     <- ReadSeqmonkReport("zhu 2015 meiosis.txt", 2)
RNA_Mixedall   <- ReadSeqmonkReport("adhikari 2014 various CDS.txt", 5)
RNA_Nall       <- ReadSeqmonkReport("mayhew nitrogen CDS.txt", 3)
RNA_Cellall    <- ReadSeqmonkReport("lin cellubiose CDS.txt", 3)
RNA_H2O2all    <- ReadSeqmonkReport("baker peroxide CDS.txt", 3)
RNA_Heatall    <- ReadSeqmonkReport("wery 2016 heat ss.txt", 3)
RNA_Fermentall <- ReadSeqmonkReport("Carvalho_Netto_2015 ferment.txt", 8)
RNA_OurCuall   <- ReadSeqmonkReport("hull cu new cds.txt", 3)
PolII_Tall     <- ReadSeqmonkReport("grzechnik polii chip cds.txt", 5)
RNA_37all      <- ReadSeqmonkReport("wery 37deg.txt", 3)

# RNAseqProcess: where x is a dataset (a data frame with Chromosome and Probe
# columns). This converts the chromosome to numbers and fixes the IDs.
# Returns x with:
#   Chromosome - integer, converted from roman numerals ("mt" becomes 17)
#   ID         - the Probe name with its last four characters removed
RNAseqProcess <- function(x) {
  # sort the mitochondrial chromosome number so it ends up as 17
  x$Chromosome[x$Chromosome %in% "mt"] <- "XVII"
  x$Chromosome <- as.integer(as.roman(x$Chromosome))
  # convert ID column to contain the actual gene name as a useful search string
  # (substr rather than strtrim: strtrim stops with an error when a probe name
  # is NA or shorter than four characters)
  x$ID <- substr(x$Probe, 1, nchar(x$Probe) - 4)
  # restrict to chr VIII for programming
  # x <- x[x$Chromosome == 8, ]
  x
}

RNA_Gly <- RNAseqProcess(RNA_Glyall)
RNA_Sel     <- RNAseqProcess(RNA_Selall)
RNA_Mei     <- RNAseqProcess(RNA_Meiall)
RNA_Mixed   <- RNAseqProcess(RNA_Mixedall)
RNA_N       <- RNAseqProcess(RNA_Nall)
RNA_Cell    <- RNAseqProcess(RNA_Cellall)
RNA_H2O2    <- RNAseqProcess(RNA_H2O2all)
RNA_Heat    <- RNAseqProcess(RNA_Heatall)
RNA_Ferment <- RNAseqProcess(RNA_Fermentall)
RNA_OurCu   <- RNAseqProcess(RNA_OurCuall)
PolII_T     <- RNAseqProcess(PolII_Tall)
RNA_37      <- RNAseqProcess(RNA_37all)

#############################
# Now bring in the gene annotation data
# and make a subset of upstream regions called promoters.
# NOTE: in the SGD_features file there is a gene called B", which must be
# removed before this import works...
# Have also fixed this file so some names that were in lists of names are now
# single.
# This is dirty but don't know how to do it otherwise.
annotall <- read.delim(
  "SGD_features.txt",
  header = TRUE,
  colClasses = c(
    rep("character", 9),
    "numeric", "numeric",
    rep("character", 5)
  )
)
annot <- annotall
# annot <- annot[annot$chr == 8, ]

# start/end may be given in either order, so store the lower and upper
# coordinate of every feature explicitly
annot$realstart <- pmin(annot$start, annot$end)
annot$realend <- pmax(annot$start, annot$end)

# now analyse promoter overlaps
Promoters <- annot[annot$type == "ORF", ]
Promoter.down <- -1000
Promoter.up <- 1

# Replace each ORF's coordinates with those of its upstream window.
#   strand "C":  window runs from realend + Promoter.up
#                to realend - Promoter.down
#   otherwise:   window runs from realstart + Promoter.down
#                to realstart - Promoter.up
# Both new coordinates are computed from the ORF's original coordinates, which
# are copied first so that updating one column cannot affect the other.
on_crick <- Promoters$strand == "C"
# a missing strand cannot be assigned a window; fail here rather than carry NAs
stopifnot(!anyNA(on_crick))
orf_start <- Promoters$realstart
orf_end <- Promoters$realend
Promoters$realstart <- ifelse(
  on_crick,
  orf_end + Promoter.up,
  orf_start + Promoter.down
)
Promoters$realend <- ifelse(
  on_crick,
  orf_end - Promoter.down,
  orf_start - Promoter.up
)
head(Promoters, 10)

#######################################################################
# work out which promoters overlap yH2A peaks
yH2Apeaks <- read.delim(
  "wt filtered MACS2 peaks.txt",
  header = TRUE,
  colClasses = c("character", rep("numeric", 8))
)
# drop the first three characters of the chromosome name
# (presumably a "chr" prefix - confirm against the peak file)
yH2Apeaks$chr <- substr(yH2Apeaks$chr, 4, 10)
# Convert the roman-numeral chromosome names of the peaks to integers.
# as.roman() gives NA for anything that is not a roman numeral, so such peaks
# end up with an NA chromosome.
yH2Apeaks$chr <- as.integer(as.roman(yH2Apeaks$chr))

# Count, for every promoter window, the peaks on the same chromosome whose
# interval overlaps the window (peak end >= window start and peak start <=
# window end).
# The count is taken with sum(..., na.rm = TRUE): subsetting the data frame
# with a logical vector that contains NA returns extra all-NA rows, so
# counting rows would report peaks with an NA chromosome as overlaps.
Promoters$yH2Apeak <- vapply(
  seq_len(nrow(Promoters)),
  function(i) {
    overlaps <- yH2Apeaks$chr == Promoters$chr[i] &
      yH2Apeaks$end >= Promoters$realstart[i] &
      yH2Apeaks$start <= Promoters$realend[i]
    sum(overlaps, na.rm = TRUE)
  },
  integer(1)
)

##################################################################################
# save the processed data
write.table(
  Promoters,
  file = "Promoters.txt",
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)

##################################################################################
# load the processed data
# (the 16 annotation columns followed by realstart, realend and yH2Apeak)
Promoters <- read.delim(
  "Promoters.txt",
  header = TRUE,
  colClasses = c(
    rep("character", 9),
    "numeric", "numeric",
    rep("character", 5),
    "numeric", "numeric", "numeric"
  )
)

#########################################
# here, we will extract expression values for the gene attached to each promoter
# data is the data file
# col1 is the first column to process, col2 is the last.
# does all the ones in between
#
# PromExp: extract the expression values for the gene attached to each
# promoter and normalise them to the median + 9 (a constant that brings the
# complete datasets into positive).
#   data - data frame with an ID column and the value columns to extract
#   col1 - index of the first column of data to extract
#   col2 - index of the last column of data to extract (all columns from col1
#          to col2 are processed)
# Reads the global data frame Promoters and looks up Promoters$id.1 in data$ID.
# Returns a data frame with one row per row of Promoters and one column per
# extracted column, named as in data.
# Promoters whose id.1 is not found in data$ID keep the value 0; these zeros
# are included when the column medians are computed.
# If an ID occurs more than once in data, the first occurrence is used.
PromExp <- function(data, col1, col2) {
  cols <- col1:col2
  # row of data holding each promoter's gene; NA where there is none
  hit <- match(Promoters$id.1, data$ID, incomparables = NA)
  found <- !is.na(hit)

  values <- matrix(
    0,
    nrow = nrow(Promoters),
    ncol = length(cols),
    dimnames = list(NULL, colnames(data)[cols])
  )
  if (any(found)) {
    # drop = FALSE keeps a single-column request as a one-column table
    values[found, ] <- as.matrix(data[hit[found], cols, drop = FALSE])
  }

  # subtracting (median - 9) shifts every column so that its median is 9
  meds <- apply(values, 2, median) - 9
  as.data.frame(sweep(values, 2, meds))
}

# Annotation columns kept from Promoters, followed by the normalised values
# of every dataset in turn.
Alldata <- Promoters[, c(4, 5, 9, 10, 11, 12, 17, 18, 19)]
Alldata <- cbind(Alldata, PromExp(RNA_Gly, 13, 15))
Alldata <- cbind(Alldata, PromExp(RNA_Sel, 13, 14))
Alldata <- cbind(Alldata, PromExp(RNA_Mei, 13, 13))
Alldata <- cbind(Alldata, PromExp(RNA_Mixed, 13, 16))
Alldata <- cbind(Alldata, PromExp(RNA_N, 13, 14))
Alldata <- cbind(Alldata, PromExp(RNA_Cell, 13, 14))
Alldata <- cbind(Alldata, PromExp(RNA_H2O2, 13, 14))
Alldata <- cbind(Alldata, PromExp(RNA_Heat, 13, 14))
Alldata <- cbind(Alldata, PromExp(RNA_Ferment, 13, 19))
Alldata <- cbind(Alldata, PromExp(RNA_OurCu, 13, 14))
# NOTE(review): the PolII file is imported with numeric columns up to column
# 16, but only columns 13-14 are used here - confirm this is intended.
Alldata <- cbind(Alldata, PromExp(PolII_T, 13, 14))
Alldata <- cbind(Alldata, PromExp(RNA_37, 13, 14))

write.table(
  Alldata,
  file = "All promoter expression norm.txt",
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)

# split on whether at least one yH2A peak overlaps the promoter window
WithPeak <- Alldata[Alldata$yH2Apeak > 0, ]
WithoutPeak <- Alldata[Alldata$yH2Apeak == 0, ]

write.table(
  WithPeak,
  file = "Upstream yH2A peak norm.txt",
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)
write.table(
  WithoutPeak,
  file = "No upstream yH2A peak norm.txt",
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)