####################################################################
## 02/04/2019
## Codes by Hyejung Won: hyejung_won@med.unc.edu
## Selecting random regions the GC-content of which is matched with HARs
####################################################################
library(GenomicRanges);
library(GenomicFeatures);
library(biomaRt);
library(BSgenome.Hsapiens.UCSC.hg19);
options(stringsAsFactors=FALSE);

##File containing HARs
bedfile = read.table("HAR_coordinate.bed")
colnames(bedfile)[1:3] = c("chrom","chromStart","chromEnd")
HAR = GRanges(bedfile$chrom, IRanges(bedfile$chromStart,bedfile$chromEnd))
HAR = sort(sortSeqlevels(HAR))

##Get GC content
peakseqs = getSeq(Hsapiens,seqnames(HAR),start(HAR),end(HAR));
GCcontent = rowSums(letterFrequency(peakseqs,c("G","C"),as.prob=TRUE));
##Add GC content to each promoter genomic range
mcols(HAR)$GCcontent = GCcontent;
seqlengths(HAR) = seqlengths(Hsapiens)[1:23];

##Loop over each promoter and choose a random interval with the same decile of GC-content
##Within the same chromosome
goodrandproms = GRanges();
##Run the whole sampling 10 times to get enough random intervals for Hi-C
for (j in 1:10) {
  ##Now take many random samplings from the genome of the same size to determine how many have this many differentially open peaks
  for (i in 1:length(seqlevels(HAR))) {
    cat('chr',i,'\n');
    ##this chr promoter
    thischrprom = HAR[seqnames(HAR)==seqlevels(HAR)[i]];
    ##Flag to determine if all promoters within the chromosome have a GC matched random equivalent
    allpromsnotgoodflag = TRUE;
    ##Loop until all promoters within the chromosome have a GC matched random equivalent
    while (allpromsnotgoodflag) {
      cat('in redo loop\n');
      ##Get the number of ranges within this chromosome
      nranges = length(thischrprom);
      ##Get the maximum length of the beginning site on a chromosome
      maxseqlength = seqlengths(HAR)[i]-(max(width(thischrprom)));
      ##Get random interval starting point
      randstart = as.integer(runif(nranges, min=1,max=maxseqlength));
      ##Width of all promoters on this chromosome
      promwidths = width(thischrprom);
      ##Random promoters
      rand.gr = GRanges(seqnames=seqlevels(HAR)[i],IRanges(randstart,width=promwidths));
      ##Get GC content of these promoters
      randpromoterseqs = getSeq(Hsapiens, rand.gr);
      GCcontent = rowSums(letterFrequency(randpromoterseqs,c("G","C"),as.prob=TRUE));
      mcols(rand.gr)$GCcontent = GCcontent;
      ##Find where GC content is within 5% of the GC content of the original
      keepind = which(abs(mcols(rand.gr)$GCcontent - mcols(thischrprom)$GCcontent) <= 0.05);
      redoind = which(abs(mcols(rand.gr)$GCcontent - mcols(thischrprom)$GCcontent) > 0.05);
      ##The random promoters to keep
      if (length(keepind)>0)
        goodrandproms = c(goodrandproms,rand.gr[keepind]);
      ##If there are promoters which don't match GC, re-loop
      if (length(redoind)==0) {
        allpromsnotgoodflag = FALSE;
      } else {
        thischrprom = thischrprom[redoind];
        cat(length(redoind),'\n');
      }
    }
  }
}

goodrandpromsort = sort(goodrandproms)
olap = findOverlaps(goodrandpromsort,HAR)

goodrandpromsortrm = goodrandpromsort[c(-queryHits(olap))]
goodrandproms = goodrandpromsortrm

write.table(cbind(as.character(seqnames(goodrandproms)),start(goodrandproms),end(goodrandproms)),file="RandomGCMatchedHARs.bed",quote=FALSE,row.names=FALSE,col.names=FALSE);    

####################################################################
## 02/04/2019
## Codes by Hyejung Won: hyejung_won@med.unc.edu
## DHS enrichment analysis for HARs
####################################################################
options(stringsAsFactors=F)
library(GenomicRanges)
library(ggplot2)
load("phastCons.hg19.conservedregions.Rdata") # evolutionary conserved regions defined by phastCons score
load("DHS_celltype.rda") # coordinates for DHS in each cell type; saved as chromranges; downloaded from Roadmap Epigenomics: http://www.roadmapepigenomics.org/
bedfile = read.table("HAR_coordinate.bed")
colnames(bedfile)[1:3] = c("chrom","chromStart","chromEnd")

haranges = GRanges(seqnames=bedfile$Chr,ranges=IRanges(as.numeric(bedfile$Start),as.numeric(bedfile$End)))

olap = findOverlaps(haranges,chromranges);
harivalues = haranges[queryHits(olap)];
mcols(harivalues) = cbind(mcols(haranges[queryHits(olap)]), mcols(chromranges[subjectHits(olap)]))

olap = findOverlaps(conserved,chromranges);
conivalues = conserved[queryHits(olap)];
mcols(conivalues) = cbind(mcols(conserved[queryHits(olap)]), mcols(chromranges[subjectHits(olap)]))

chrommark = unique(chromranges$celltype)

pval = c()
enrichment = c()
harnum = length(unique(haranges))
connum = length(unique(conserved))
finaldat = data.frame(pval=rep(NA,length(chrommark)), or=rep(NA,length(chrommark)), ci1=rep(NA,length(chrommark)), ci2=rep(NA,length(chrommark)))
for(i in 1:length(chrommark)){
  harcell = unique(harivalues[harivalues$celltype==chrommark[i]])
  concell = unique(conivalues[conivalues$celltype==chrommark[i]])
  cont.mat = matrix(c(length(harcell), harnum-length(harcell), length(concell), connum-length(concell)), nrow=2) # contingency matrix
  fisherdat = fisher.test(cont.mat)
  
  finaldat[i,"pval"] = fisherdat$p.value
  finaldat[i,"or"] = fisherdat$estimate
  finaldat[i,"ci1"] = fisherdat$conf.int[1]
  finaldat[i,"ci2"] = fisherdat$conf.int[2]
  
  print(i)
}

pvalhisg = data.frame("EpigeneticMarks"=rownames(finaldat), "Pval"=-log10(finaldat$pval))
orhisg = data.frame("EpigeneticMarks"=rownames(finaldat), "OR"=finaldat$or, "ci1"=finaldat$ci1, "ci2"=finaldat$ci2)

pdf(file="HAR_enrichment_to_conserved_elements.pdf", width=15, height=4)
barpl = ggplot(pvalhisg, aes(x=EpigeneticMarks, y=Pval))
barpl + geom_bar(stat="identity",position="dodge", fill="midnightblue") + theme_bw() + labs(y="-log(P-val)", fill="", x="") + 
  theme(panel.grid.major =element_blank(), panel.grid.minor = element_blank(), axis.text.x=element_text(angle=90,hjust=1)) 
barpl = ggplot(orhisg, aes(x=EpigeneticMarks, y=OR))
barpl + geom_bar(stat="identity", position="dodge", fill="midnightblue") + theme_bw() + labs(y="OR", fill="", x="") + 
  geom_errorbar(aes(ymin=ci1, ymax=ci2), width=.2, position=position_dodge(0.9)) + 
  theme(panel.grid.major=element_blank(), panel.grid.minor = element_blank(), axis.text.x=element_text(angle=90,hjust=1)) 
dev.off()

## phastCons score matched genomic regions
library(phastCons100way.UCSC.hg19)

phast = phastCons100way.UCSC.hg19

phasthar = gscores(phast, haranges)
phasthar[phasthar$default==1]$default = 0.999
phasthar$wid = width(phasthar)

phastrange = seq(0,0.9,0.1)

harbg = vector(length=10000, mode="list")
for(j in 1:10000){
  phastdefault = c()
  for(i in 1:length(phastrange)){
    harwirange = phasthar[phasthar$default>=phastrange[i] & phasthar$default<(phastrange[i]+0.1)]
    load(paste0("./phastCons/phastCons.",(i-1),".hg19.conservedregions.Rdata"))
    conserved = conserved[seqnames(conserved) %in% c(paste0("chr", c(1:22,"X")))]
    permconserved = conserved[sample(length(conserved), length(harwirange), replace=F)]
    phastdefault = c(phastdefault, permconserved)
  }
  phastdefault = unlist(GRangesList(phastdefault))
  phastdefault = unique(phastdefault)
  print(length(phastdefault))
  harbg[[j]] = phastdefault
  print(j)
}

pval = c()
enrichment = c()
fisherP = fisherOR = bgratio = data.frame(matrix(NA, nrow=10000, ncol=length(chrommark)))
colnames(fisherP) = colnames(fisherOR) = colnames(bgratio) = chrommark

olap = findOverlaps(haranges,chromranges);
dhsivalues = haranges[queryHits(olap)];
mcols(dhsivalues) = cbind(mcols(haranges[queryHits(olap)]), mcols(chromranges[subjectHits(olap)]))

haratio = c() 
for(i in 1:length(chrommark)){
  harchrom = unique(dhsivalues[dhsivalues$celltype==chrommark[i]])
  haratio = c(haratio, length(harchrom)/length(haranges))
}
names(haratio) = chrommark

for(j in 1:10000){
  hartarget = harbg[[j]]
  olap = findOverlaps(hartarget,chromranges);
  dhsibg = hartarget[queryHits(olap)];
  mcols(dhsibg) = cbind(mcols(hartarget[queryHits(olap)]), mcols(chromranges[subjectHits(olap)]))
  
  for(i in 1:length(chrommark)){
    harchrom = unique(dhsivalues[dhsivalues$celltype==chrommark[i]])
    bgchrom = unique(dhsibg[dhsibg$celltype==chrommark[i]])
    
    bgratio[j,i] = length(bgchrom)/length(hartarget)
    
    fisher2result = fisher.test(matrix(c(length(harchrom), length(haranges), length(bgchrom), length(hartarget)),2,2))
    fisherP[j,i] = fisher2result$p.value
    fisherOR[j,i] = fisher2result$estimate
  }
  
  print(j)
}

enrichmentP = c()
for(i in 1:length(chrommark)){
  bgratio.cell = bgratio[,i]
  enrichmentP = c(enrichmentP, sum(haratio[i] < bgratio.cell)/length(bgratio.cell))
}

orhisg = data.frame("EpigeneticMarks"=chrommark, "OR"=colMeans(fisherOR), "ci1"=colMeans(fisherOR)-apply(fisherOR, 2, sd), "ci2"=colMeans(fisherOR)+apply(fisherOR, 2, sd))
pdf(file="HAR_enrichment_to_conserved_matched_elements.pdf", width=15, height=4)
barpl = ggplot(orhisg, aes(x=EpigeneticMarks, y=OR))
barpl + geom_bar(stat="identity", position="dodge", fill="midnightblue") + theme_bw() + labs(y="OR", fill="", x="") + 
  geom_errorbar(aes(ymin=ci1, ymax=ci2), width=.2, position=position_dodge(0.9)) + 
  theme(panel.grid.major=element_blank(), panel.grid.minor = element_blank(), axis.text.x=element_text(angle=90,hjust=1)) 
dev.off()

####################################################################
## 02/04/2019
## Codes by Hyejung Won: hyejung_won@med.unc.edu
## dN/dS comparison
####################################################################

options(stringsAsFactors=F)
FBfile = "HARgenes.txt" # Hi-C interacting genes for HARs
library(biomaRt)

FB = unlist(read.table(FBfile))

## Protein-coding genes from Gencode v19
getinfo = c("ensembl_gene_id","hgnc_symbol","chromosome_name","start_position","end_position","strand","band","gene_biotype")
mart = useMart(biomart="ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl",host="feb2014.archive.ensembl.org") # Using Gencode v19 annotations
geneAnno1 = getBM(attributes = getinfo,filters=c("chromosome_name"),values=c(seq(1,22,by=1),"X"),mart=mart)
geneAnno1 = geneAnno1[geneAnno1[,"gene_biotype"]=="protein_coding",]; geneAnno1 = geneAnno1[!duplicated(geneAnno1[,"hgnc_symbol"]) & geneAnno1[,"hgnc_symbol"]!="",] ## 19163 with hgnc symbols after removing duplicates. We use gene symbols here because the RDNV data comes in gene symbol format. Otherwise, I prefer to use ENSG IDs.

## Select out protein-coding genes 
FBpc = FB[FB %in% geneAnno1$ensembl_gene_id]
FBpc = geneAnno1[geneAnno1$ensembl_gene_id %in% FB, ]
FBensg = FBpc$ensembl_gene_id
FBhgnc = FBpc$hgnc_symbol

mart.hs = mart
listmarths = listAttributes(mart.hs)
hommart = listmarths[grep("homolog", listmarths$name),1] # homologs mart

allpc = geneAnno1[!(geneAnno1$ensembl_gene_id %in% FB), ]
allensg = allpc$ensembl_gene_id
allhgnc = allpc$hgnc_symbol

FBdnds_mouse = c()
FBdiffdnds = c()
for(i in 1:length(FBensg)){
  dn = getBM(attributes = "mmusculus_homolog_dn", filters = "ensembl_gene_id", values = FBensg[i], mart = mart.hs)
  ds = getBM(attributes = "mmusculus_homolog_ds", filters = "ensembl_gene_id", values = FBensg[i], mart = mart.hs)
  if(dim(dn)[1]!=0 & dim(ds)[1]!=0){
    if(dim(dn)[1]==dim(ds)[1]){
      FBdnds_mouse = rbind(FBdnds_mouse, c(FBensg[i], dn/ds))
      print(i)
    }else{
      FBdiffdnds = c(FBdiffdnds, i)
    }
  }
}

FBDdnds_mouse = c()
FBDdiffdnds = c()
for(i in 1:length(FBDensg)){
  dn = getBM(attributes = "mmusculus_homolog_dn", filters = "ensembl_gene_id", values = FBDensg[i], mart = mart.hs)
  ds = getBM(attributes = "mmusculus_homolog_ds", filters = "ensembl_gene_id", values = FBDensg[i], mart = mart.hs)
  if(dim(dn)[1]!=0 & dim(ds)[1]!=0){
    if(dim(dn)[1]==dim(ds)[1]){
      FBDdnds_mouse = rbind(FBDdnds_mouse, c(FBDensg[i], dn/ds))
      print(i)
    }else{
      FBDdiffdnds = c(FBDdiffdnds, i)
    }
  }
}

alldnds_mouse = c()
alldiffdnds = c()
for(i in 1:length(allensg)){
  dn = getBM(attributes = "mmusculus_homolog_dn", filters = "ensembl_gene_id", values = allensg[i], mart = mart.hs)
  ds = getBM(attributes = "mmusculus_homolog_ds", filters = "ensembl_gene_id", values = allensg[i], mart = mart.hs)
  if(dim(dn)[1]!=0 & dim(ds)[1]!=0){
    if(dim(dn)[1]==dim(ds)[1]){
      alldnds_mouse = rbind(alldnds_mouse, c(allensg[i], dn/ds))
      print(i)
    }else{
      alldiffdnds = c(alldiffdnds, i)
    }
  }
}

FBdnds = as.numeric(unlist(FBdnds_mouse[,2]))
FBdnds = FBdnds[!is.na(FBdnds)]; FBdnds = FBdnds[FBdnds!=0 & FBdnds!="Inf"]

Adnds = as.numeric(unlist(alldnds_mouse[,2]))
Adnds = Adnds[!is.na(Adnds)]; Adnds = Adnds[Adnds!=0 & Adnds!="Inf"]

logFBdnds = log2(FBdnds)
logAdnds = log2(Adnds)

ks.test(logFBdnds, logAdnds) 

densFB = density(log2(Pdnds)) 
densA = density(log2(Adnds))

xlim = range(-10,2); ylim = range(0,densFB$y, densA$y)
Hcol = adjustcolor("darkorange1", 0.6)
Lcol = adjustcolor("dodgerblue1", 0.6)

pdf(file="density_of_dnds_mouse.pdf", width=6, height=2.9)
par(mfrow=c(1,2),mar=c(3,3,1,1))
plot(densFB, xlim=xlim, ylim=ylim, main ='Distribution', xlab="", ylab="",cex.main=1.1, cex.axis=1.0, xaxt="n")
polygon(densFB, density = -1, col = Hcol)
polygon(densA, density = -1, col = Lcol)
axis(1, at=seq(-10,2,by=2), las=1)
mtext(side=1, text ='log2(dN/dS)', line=2, cex=1.1)
mtext(side=2, text ='Density', line=2, cex=1.1)
legend('topleft',c('HAR','All'),
       fill = c(Hcol, Lcol), bty = 'n', border = NA, cex=1.0)
dev.off()

####################################################################
## 02/04/2019
## Codes by Hyejung Won: hyejung_won@med.unc.edu
## Fisher's exact test
####################################################################
options(stringsAsFactors = FALSE)
library(biomaRt)
hargene = unlist(read.table("HARgene.txt")) # evolutionary gene sets in hgnc symbol
diseasegene = unlist(read.table("Diseasegene.txt")) # genes that are associated with a certain disorder
getinfo = c("ensembl_gene_id","hgnc_symbol","chromosome_name","start_position","end_position","strand","band","gene_biotype")
mart = useMart(biomart="ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl",host="feb2014.archive.ensembl.org") # Using Gencode v19 annotations
geneAnno1 = getBM(attributes = getinfo,filters=c("chromosome_name"),values=c(seq(1,22,by=1),"X"),mart=mart)
allgene= geneAnno1$hgnc_symbol
exomeLength = read.table("exomelength.txt", header=T) ## Downloaded from the Supplementary Table of Parikshak et al., 2013

metaMat = exomeLength[match(rownames(metaMat),exomeLength$hgnc),"exomelength"]
rownames(metaMat) = geneAnno1$hgnc_symbol
colnames(metaMat) = "exomeLength"

metaMat = cbind(metaMat, rep(NA, nrow(metaMat)))
listname = "HAR"
colnames(metaMat)[ncol(metaMat)] = listname
matchlist = match(rownames(metaMat), hargene)
metaMat[!is.na(matchlist), listname] = 1
metaMat[is.na(matchlist), listname] = 0 

metaMat = cbind(metaMat, rep(NA, nrow(metaMat)))
listname = "Disease"
colnames(metaMat)[ncol(metaMat)] = listname
matchlist = match(rownames(metaMat), diseasegene)
metaMat[!is.na(matchlist), listname] = 1
metaMat[is.na(matchlist), listname] = 0

glm.out = glm(metaMat[,3]~metaMat[,2]+metaMat[,1],family=binomial) 

####################################################################
## 02/04/2019
## Codes by Hyejung Won: hyejung_won@med.unc.edu
## Human vs. Rhesus expression data
## Files and codes downloaded from: https://github.com/AllenBrainAtlas/DevRhesusLMD.git. 
####################################################################
# Load libraries 
library(reshape2)
library(segmented)
library(ggplot2)
library(scatterplot3d)
library(RColorBrewer)
library(limma)

# Load functions 
source(file="../src/fReorderFactorLevels.R")
source(file="../src/fConvertPcdtoEventScore.R")

# Try loading cached fits
dev.pred.fn = "../cache/dev_expr_species/dev.expr_dev.pred_2hs.RData" 
try(load(dev.pred.fn), silent=TRUE)

# Get genes you would like to test for
evolgene = unlist(read.table("evol_gene.txt"))

# Calc species expression variation across development 
rh.var = apply(dev.pred[["macaque"]], 1, function(x) sd(x) / mean(x)) 
h.var = apply(dev.pred[["human"]], 1, function(x) sd(x) / mean(x))
rat.var = apply(dev.pred[["rat"]], 1, function(x) sd(x) / mean(x))

# Let's check whether the variance differ for the genes that you selected for
checkvar = function(genelist){
  h.var.test = h.var[names(h.var) %in% genelist]
  h.var.ntest = h.var[!(names(h.var) %in% genelist)]
  
  rh.var.test = rh.var[names(rh.var) %in% genelist]
  rh.var.ntest = rh.var[!(names(rh.var) %in% genelist)]
  
  rat.var.test = rat.var[names(rat.var) %in% genelist]
  rat.var.ntest = rat.var[!(names(rat.var) %in% genelist)]
  
  wch = wilcox.test(h.var.test, h.var.ntest)
  wcrh = wilcox.test(rh.var.test, rh.var.ntest)
  wcrat = wilcox.test(rat.var.test, rat.var.ntest)
  
  th = t.test(h.var.test, h.var.ntest)
  trh = t.test(rh.var.test, rh.var.ntest)
  trat = t.test(rat.var.test, rat.var.ntest)
  
  hrhvar.test = log(h.var.test/rh.var.test) 
  hrhvar.ntest = log(h.var.ntest/rh.var.ntest) 
  wchrh = wilcox.test(hrhvar.test, hrhvar.ntest)
  thrh = t.test(hrhvar.test, hrhvar.ntest)
  
  vartestlist = list(wch, wcrh, wcrat, wchrh, th, trh, trat, thrh)
  names(vartestlist) = c("var_human:wilcox", "var_rhesus:wilcox", "var_rat:wilcox", "var_human_vs_rhesus:wilcox", 
                         "var_human:t-test", "var_rhesus:t-test", "var_rat:t-test", "var_human_vs_rhesus:t-test")
  return(vartestlist)
  
}

evol.var = checkvar(evolgene) 

# Let's plot developmental expression trajectories for a gene list 
dev.Expr = dev.expr[dev.expr$species %in% c("human", "macaque"),]
dev.Expr$evol = as.factor(ifelse(dev.Expr$gene %in% evolgene, 1, 2))

## Let's select out the Z-scores in critical epochs
dev.expr.bound = dev.Expr
dev.expr.bound$estage = ifelse(dev.expr.bound$escore<0.5, 0, 1)
dev.expr.bound$estage = dev.expr.bound$estage + ifelse(dev.expr.bound$escore>0.5, 1, 0)
dev.expr.bound$estage = dev.expr.bound$estage + ifelse(dev.expr.bound$escore>1, 1, 0)

g2 = ggplot(dev.expr.bound, aes(x=evol, y=exprz)) + geom_boxplot(aes(fill=species)) + facet_wrap( ~ estage) + 
  theme_bw() +
  theme(panel.grid.minor = element_blank()) +
  scale_color_manual(values = pal1) +
  scale_fill_manual(values = pal1) +
  labs( y="Normalized expression Z-score", 
        x="",
        title = "Fetal Brain Developmental Expression: Evol vs. non-Evol")

# Calculate statistics
dev.expr.bound1 = dev.Expr[dev.Expr$escore>0.5 & dev.Expr$escore<1, ]
dev.expr.bound1$index = paste(dev.expr.bound1$species, dev.expr.bound1$evol, sep=":")
dev.expr.bound2 = dev.Expr[dev.Expr$escore<0.5, ]
dev.expr.bound2$index = paste(dev.expr.bound2$species, dev.expr.bound2$evol, sep=":")

aov1 = aov(exprz~index, data=dev.expr.bound1)
TukeyHSD(aov1)
aov2 = aov(exprz~index, data=dev.expr.bound2)
TukeyHSD(aov2)

g3 = ggplot(dev.expr.bound1, aes(exprz)) + geom_density(aes(fill=factor(index)), alpha=0.5) + 
  scale_fill_manual(values = pal1) +
  labs( y="Density", 
        x="Normalized expression Z-score", 
        title="Evol vs. non-Evol")

# Let's calculate Delta(Z) based on escore
dev.expr.mq = dev.expr[dev.expr$species=="macaque",]
dev.expr.hu = dev.expr[dev.expr$species=="human",]
timepoint.mq = c(0.46,0.51,0.77)
timepoint.hu = c(0.48,0.54,0.76) 
dev.expr.mq = dev.expr.mq[dev.expr.mq$escore %in% timepoint.mq, ]
dev.expr.hu = dev.expr.hu[dev.expr.hu$escore %in% timepoint.hu, ]
dev.mq = cbind(dev.expr.mq, dev.expr.hu)
sum(dev.mq[,2] != dev.mq[,8])
dev.mq$zdiff = dev.mq[,11] - dev.mq[,5]
dev.mq = dev.mq[,c(2,12,13)]
dev.mq$evol = ifelse(dev.mq$gene %in% evolgene, "evol", "nonevol")

g4 = ggplot(dev.mq, aes(zdiff)) + geom_density(aes(fill=factor(evol)), alpha=0.5) + 
  scale_fill_manual(values = pal1) +
  labs( y="Density", 
        x=expression(Delta*" Normalized expression Z-score (Human-Macaque)"), 
        title="Evol vs. non-Evol")

pdf("Evol_expression_traits.pdf", height=5, width=7)
plot(g1)
plot(g2)
plot(g3)
plot(g4)
dev.off()

# Load functions 
source(file="../src/fReorderFactorLevels.R")
source(file="../src/fConvertPcdtoEventScore.R")

# Load genes of interest
load("/proj/hyejunglab/chr/geneAnno_allgenes.rda")
hargene = unlist(read.table("HARgene.txt"))
hugaingene = unlist(read.table("Hugaingene.txt"))
hlegene = unlist(read.table("HLEgene.txt"))
hgegene = unlist(read.table("HGEgene.txt"))

# Load breakpoints for increasing genes in all species/regions
bp.df = read.csv(file = "../cache/dev_expr_species/species_region_bp.csv", 
                  stringsAsFactors = FALSE)  # SI Table 11 subset
bp.df$bpmin = mapply(ConvertPcdtoEventScore, log2(bp.df$bp..pcd. - 1.96 * bp.df$bpse), 
                      bp.df$species)
bp.df$bpmax = mapply(ConvertPcdtoEventScore, log2(bp.df$bp..pcd. + 1.96 * bp.df$bpse), 
                      bp.df$species)
bp.subset = subset(bp.df, species %in% c("macaque", "human")) # & region == "ACG")
bp.escore = dcast(bp.subset, region + gene ~ species, value.var = "bp..event.score.")
bp.min = dcast(bp.subset, region + gene ~ species, value.var = "bpmin")
bp.max = dcast(bp.subset, region + gene ~ species, value.var = "bpmax")
bp.diff = data.frame(bp.escore, bp.min[, 3:4], bp.max[, 3:4],
                      h_m_escore = bp.escore$human - bp.escore$macaque,
                      hmin_mmax = bp.min$human - bp.max$macaque,
                      hmax_mmin = bp.max$human - bp.min$macaque)

bp.diff$sig = apply(bp.diff, 1, function(x) (as.numeric(x["h_m_escore"]) > 0 & 
                                                min(as.numeric(x[c("hmin_mmax", "hmax_mmin")])) > 0) | 
                       (as.numeric(x["h_m_escore"]) < 0 & 
                          max(as.numeric(x[c("hmin_mmax", "hmax_mmin")])) < 0))

### This file now contains breakpoints for a given gene in different speceis
# Compare breakpoints based on used-defined groups
bp.diff$har = ifelse(bp.diff$gene %in% hargene, "HAR", "nHAR")
bp.diff$hugain = ifelse(bp.diff$gene %in% hugaingene, "HGE:FB", "nHGE")
bp.diff$hge = ifelse(bp.diff$gene %in% hgegene, "HGE:AB", "nHGE")
bp.diff$hle = ifelse(bp.diff$gene %in% hlegene, "HLE:AB", "nHLE")

wilcox.test(h_m_escore~har, data=bp.diff) 
wilcox.test(h_m_escore~hugain, data=bp.diff) 
wilcox.test(h_m_escore~hge, data=bp.diff)
wilcox.test(h_m_escore~hle, data=bp.diff)

# Summarize genes that have significantly different breakpoints between human and macaque based on event scores
diff.summary = data.frame()

# Human early genes
for (region1 in unique(bp.subset$region)) {
  genes = subset(bp.diff, region == region1 & sig == TRUE & h_m_escore < 0)$gene
  diff1 = with(subset(bp.subset, region == region1 & species == "human" & gene %in% genes),
                data.frame(set = "human_early", region1,  
                           Plateau = sum(! slope2sig), 
                           Decreasing = sum(slope2sig & slope2 < 0), 
                           Increasing = sum(slope2sig & slope2 > 0)))
  diff.summary = rbind(diff.summary, diff1)
}

humanearlygene = diffdec = diffinc = c()
for(region1 in c("V1","ACG")){
  genes = subset(bp.diff, region==region1 & sig==TRUE & h_m_escore<0)$gene
  humanearlygene = union(humanearlygene, genes)
  diffdec = union(diffdec, subset(bp.subset, region == region1 & species == "human" & gene %in% genes & slope2sig & slope2<0)$gene)
  diffinc = union(diffinc, subset(bp.subset, region == region1 & species == "human" & gene %in% genes & slope2sig & slope2>0)$gene)
}

# Run Fisher's exact test
bgenes = unique(bp.diff$gene) # These genes show distinct changes in expression trajectories, or breakpoints. A total of 179 increasing and 179 decreasing genes met these criteria in all three species

harbg = intersect(hargene, bgenes)
harhuearly = intersect(humanearlygene, hargene)
fisher.test(matrix(c(length(harhuearly), length(setdiff(harbg, harhuearly)), 
                     length(setdiff(humanearlygene, harhuearly)), length(setdiff(bgenes, union(harbg, humanearlygene)))),2,2))

hugainbg = intersect(hugaingene, bgenes)
hugainhuearly = intersect(humanearlygene, hugaingene)
fisher.test(matrix(c(length(hugainhuearly), length(setdiff(hugainbg, hugainhuearly)), 
                     length(setdiff(humanearlygene, hugainhuearly)), length(setdiff(bgenes, union(hugainbg, humanearlygene)))),2,2))

# Human late genes
for (region1 in unique(bp.subset$region)) {
  genes = subset(bp.diff, region == region1 & sig == TRUE & h_m_escore > 0)$gene
  diff1 = with(subset(bp.subset, region == region1 & species == "human" & gene %in% genes),
                data.frame(set = "human_late", region1,  
                           Plateau = sum(! slope2sig), 
                           Decreasing = sum(slope2sig & slope2 < 0), 
                           Increasing = sum(slope2sig & slope2 > 0)))
  diff.summary = rbind(diff.summary, diff1)
}

diff.summaryl = melt(diff.summary, id = c("set", "region1"))
colnames(diff.summaryl)[3:4] = c("slope_after_breakpoint", "num_genes")
num.genes = diff.summaryl[diff.summaryl$set == "human_early", "num_genes"] 
diff.summaryl[diff.summaryl$set == "human_early", "num_genes"] = -num.genes

diff.summaryl$region1 = factor(diff.summaryl$region1, 
                                levels = c("STR", "AM", "HP", "V1", "ACG"))