#import all the gene expression data
#these are output files from seqmonk annotated probe report - log transformed sum of reads in CDS, normalised to largest datastore and length corrected
#note that the number of numeric columns on the end varies with the datafile

#read one annotated probe report into a data frame
#path      : tab-delimited report file with a header row
#n_numeric : how many numeric columns follow the 11 leading columns; this is the only
#            part of the layout that differs between the files below
#the 11 leading columns are always 2 character, 2 numeric, then 7 character
ReadProbeReport<-function(path,n_numeric)
{
	leading<-c("character","character","numeric","numeric",rep("character",7))
	read.delim(path,header=TRUE,colClasses=c(leading,rep("numeric",n_numeric)))
}

RNA_Glyall<-ReadProbeReport("Rong Mullins YPGly CDS.txt",4)

RNA_Selall<-ReadProbeReport("Arigo selenite CDs.txt",3)

RNA_Meiall<-ReadProbeReport("zhu 2015 meiosis.txt",2)

RNA_Mixedall<-ReadProbeReport("adhikari 2014 various CDS.txt",5)

RNA_Nall<-ReadProbeReport("mayhew nitrogen CDS.txt",3)

RNA_Cellall<-ReadProbeReport("lin cellubiose CDS.txt",3)

RNA_H2O2all<-ReadProbeReport("baker peroxide CDS.txt",3)

RNA_Heatall<-ReadProbeReport("wery 2016 heat ss.txt",3)

RNA_Fermentall<-ReadProbeReport("Carvalho_Netto_2015 ferment.txt",8)

RNA_OurCuall<-ReadProbeReport("hull cu new cds.txt",3)

PolII_Tall<-ReadProbeReport("grzechnik polii chip cds.txt",5)

RNA_37all<-ReadProbeReport("wery 37deg.txt",3)


#tidy one imported expression table
#x : data frame with a Chromosome column (roman numerals, or "mt") and a Probe column
#returns x with Chromosome as an integer and ID rebuilt from Probe
RNAseqProcess<-function(x)
{
	#the mitochondrial chromosome is relabelled so that it becomes 17
	chrom<-x$Chromosome
	chrom[chrom=="mt"]<-"XVII"
	x$Chromosome<-as.integer(as.roman(chrom))

	#ID becomes the probe name without its last 4 characters, giving a usable search string
	probe<-x$Probe
	x$ID<-strtrim(probe,nchar(probe)-4)

	#restrict to chr VIII for programming
	#x<-x[x$Chromosome==8,]

	x
}

#run RNAseqProcess on every imported table
#each processed copy is stored under the import name without its "all" suffix;
#the imported *all tables themselves are not modified
RNA_Gly<-RNAseqProcess(RNA_Glyall)
RNA_Sel<-RNAseqProcess(RNA_Selall)
RNA_Mei<-RNAseqProcess(RNA_Meiall)
RNA_Mixed<-RNAseqProcess(RNA_Mixedall)
RNA_N<-RNAseqProcess(RNA_Nall)
RNA_Cell<-RNAseqProcess(RNA_Cellall)
RNA_H2O2<-RNAseqProcess(RNA_H2O2all)
RNA_Heat<-RNAseqProcess(RNA_Heatall)
RNA_Ferment<-RNAseqProcess(RNA_Fermentall)
RNA_OurCu<-RNAseqProcess(RNA_OurCuall)
PolII_T<-RNAseqProcess(PolII_Tall)
RNA_37<-RNAseqProcess(RNA_37all)

#############################

#now bring in the gene annotation data 
#and make a sub set of upstream regions called promoters

#NOTE in the SGD_features file there is a gene called B", which must be removed before this import works...
#have also fixed this file so some names that were in lists of names are now single
#this is dirty but don't know how to do it otherwise
#feature table layout: 9 character columns, 2 numeric columns, then 5 character columns
annotall<-read.delim("SGD_features.txt",header=TRUE,
	colClasses=c(rep("character",9),rep("numeric",2),rep("character",5)))

#annot is the working copy; annotall keeps the table as imported
annot<-annotall
#annot<-annot[annot$chr==8,]

#coordinate-ordered bounds: realstart is the smaller of start/end, realend the larger
#(appended in this order - later steps pick these columns by position)
annot$realstart<-pmin(annot$start,annot$end)
annot$realend<-pmax(annot$start,annot$end)

#now analyse promoter overlaps
#window offsets relative to the gene start, negative meaning upstream:
#the promoter window runs from Promoter.down (1000 bp upstream) to Promoter.up (1 bp upstream)
Promoter.down<- -1000
Promoter.up<-1

#one row per ORF, with realstart/realend overwritten by the promoter window
#computed on whole columns rather than row by row; temporaries stay inside local()
Promoters<-local({
	orfs<-annot[annot$type=="ORF",]

	#the row-by-row if() used previously stopped on a missing strand; keep failing early
	if(anyNA(orfs$strand))
	{
		stop("ORF with a missing strand in the annotation table")
	}

	crick<-orfs$strand=="C"
	geneStart<-orfs$realstart
	geneEnd<-orfs$realend

	#every strand other than "C": the window lies below the smaller coordinate
	orfs$realstart<-geneStart+Promoter.down
	orfs$realend<-geneStart-Promoter.up

	#strand "C": the window lies above the larger coordinate
	orfs$realstart[crick]<-geneEnd[crick]+Promoter.up
	orfs$realend[crick]<-geneEnd[crick]-Promoter.down

	orfs
})
#quick visual check of the first rows
head(Promoters,10)

#######################################################################
#work out which promoters overlap yH2A peaks
#peak table layout: chromosome label followed by 8 numeric columns
yH2Apeaks<-read.delim("wt filtered MACS2 peaks.txt",header=TRUE,
	colClasses=c("character",rep("numeric",8)))

#drop the first 3 characters of the chromosome label and read the rest as a roman numeral
#NOTE(review): a label that is not a roman numeral becomes NA, and "M" would parse as 1000,
#so mitochondrial peaks may never match a promoter chromosome - confirm against the peak file
yH2Apeaks$chr<-substr(yH2Apeaks$chr,4,10)
yH2Apeaks$chr<-as.integer(as.roman(yH2Apeaks$chr))

#number of peaks overlapping each promoter window on the same chromosome
#counted with sum(...,na.rm=TRUE): indexing a data frame with an NA condition returns an
#NA row, so nrow() would count a peak with an unparsed chromosome as an overlap
#NOTE(review): yH2Apeaks$chr is integer; if Promoters$chr is character the == relies on
#R coercing both sides to text ("8"=="8") - confirm the annotation uses plain numbers
Promoters$yH2Apeak<-local({
	peakChr<-yH2Apeaks$chr
	peakStart<-yH2Apeaks$start
	peakEnd<-yH2Apeaks$end
	vapply(seq_len(nrow(Promoters)),function(i)
	{
		sum(peakChr==Promoters$chr[i] &
			peakEnd>=Promoters$realstart[i] &
			peakStart<=Promoters$realend[i],na.rm=TRUE)
	},integer(1))
})

##################################################################################
#save the processed data 

#tab-separated, header row, no row names
write.table(Promoters,"Promoters.txt",sep="\t",col.names=TRUE,row.names=FALSE)

##################################################################################
#load the processed data

#the 16 annotation columns (9 character, 2 numeric, 5 character) plus the 3 numeric
#columns added above
Promoters<-read.delim("Promoters.txt",header=TRUE,
	colClasses=c(rep("character",9),rep("numeric",2),rep("character",5),rep("numeric",3)))

#########################################

#extract expression values for the gene attached to each promoter
#data      : processed expression table with an ID column
#col1,col2 : first and last column of data to extract; every column in between is included
#promoters : table with an id.1 column naming the gene of each promoter
#            (defaults to the global Promoters table)
#returns a data frame with one row per promoter and one column per extracted column
#
#genes are looked up with match(), so an ID that occurs more than once in data uses its
#first row; promoters whose gene is not in data keep the value 0
#each column is then shifted so that its median becomes 9 (a constant that brings the
#complete datasets into positive); the 0 placeholders are included in that median
PromExp<-function(data,col1,col2,promoters=Promoters)
{
	ncols<-col2-col1+1
	result<-as.data.frame(array(0,dim=c(nrow(promoters),ncols)))
	colnames(result)<-colnames(data[col1:col2])

	#row of data holding each promoter's gene, NA when the gene is absent
	hit<-match(promoters$id.1,data$ID,incomparables=NA)
	found<-!is.na(hit)
	for(j in seq_len(ncols))
	{
		result[[j]][found]<-data[[col1+j-1]][hit[found]]
	}

	#subtracting (median-9) moves each column's median to 9
	meds<-vapply(result,median,numeric(1))-9
	result<-sweep(result,2,meds)

	return(result)
}

#nine promoter columns (picked by position) followed by the normalised expression columns
#of every dataset, bound side by side in a single call
#for PolII_T only columns 13-14 are taken, although that table was imported with 16 columns
Alldata<-cbind(
	Promoters[,c(4,5,9,10,11,12,17,18,19)],
	PromExp(RNA_Gly,13,15),
	PromExp(RNA_Sel,13,14),
	PromExp(RNA_Mei,13,13),
	PromExp(RNA_Mixed,13,16),
	PromExp(RNA_N,13,14),
	PromExp(RNA_Cell,13,14),
	PromExp(RNA_H2O2,13,14),
	PromExp(RNA_Heat,13,14),
	PromExp(RNA_Ferment,13,19),
	PromExp(RNA_OurCu,13,14),
	PromExp(PolII_T,13,14),
	PromExp(RNA_37,13,14)
)

write.table(Alldata,file="All promoter expression norm.txt",sep="\t",col.names=TRUE,row.names=FALSE)

#split on whether at least one yH2A peak overlaps the promoter window
WithPeak<-Alldata[Alldata[["yH2Apeak"]]>0,]
WithoutPeak<-Alldata[Alldata[["yH2Apeak"]]==0,]

write.table(WithPeak,file="Upstream yH2A peak norm.txt",sep="\t",col.names=TRUE,row.names=FALSE)
write.table(WithoutPeak,file="No upstream yH2A peak norm.txt",sep="\t",col.names=TRUE,row.names=FALSE)