#################################################################### ## 02/04/2019 ## Codes by Hyejung Won: hyejung_won@med.unc.edu ## Selecting random regions the GC-content of which is matched with HARs #################################################################### library(GenomicRanges); library(GenomicFeatures); library(biomaRt); library(BSgenome.Hsapiens.UCSC.hg19); options(stringsAsFactors=FALSE); ##File containing HARs bedfile = read.table("HAR_coordinate.bed") colnames(bedfile)[1:3] = c("chrom","chromStart","chromEnd") HAR = GRanges(bedfile$chrom, IRanges(bedfile$chromStart,bedfile$chromEnd)) HAR = sort(sortSeqlevels(HAR)) ##Get GC content peakseqs = getSeq(Hsapiens,seqnames(HAR),start(HAR),end(HAR)); GCcontent = rowSums(letterFrequency(peakseqs,c("G","C"),as.prob=TRUE)); ##Add GC content to each promoter genomic range mcols(HAR)$GCcontent = GCcontent; seqlengths(HAR) = seqlengths(Hsapiens)[1:23]; ##Loop over each promoter and choose a random interval with the same decile of GC-content ##Within the same chromosome goodrandproms = GRanges(); ##Run the whole sampling 10 times to get enough random intervals for Hi-C for (j in 1:10) { ##Now take many random samplings from the genome of the same size to determine how many have this many differentially open peaks for (i in 1:length(seqlevels(HAR))) { cat('chr',i,'\n'); ##this chr promoter thischrprom = HAR[seqnames(HAR)==seqlevels(HAR)[i]]; ##Flag to determine if all promoters within the chromosome have a GC matched random equivalent allpromsnotgoodflag = TRUE; ##Loop until all promoters within the chromosome have a GC matched random equivalent while (allpromsnotgoodflag) { cat('in redo loop\n'); ##Get the number of ranges within this chromosome nranges = length(thischrprom); ##Get the maximum length of the beginning site on a chromosome maxseqlength = seqlengths(HAR)[i]-(max(width(thischrprom))); ##Get random interval starting point randstart = as.integer(runif(nranges, min=1,max=maxseqlength)); ##Width of all promoters on this chromosome promwidths = width(thischrprom); ##Random promoters rand.gr = GRanges(seqnames=seqlevels(HAR)[i],IRanges(randstart,width=promwidths)); ##Get GC content of these promoters randpromoterseqs = getSeq(Hsapiens, rand.gr); GCcontent = rowSums(letterFrequency(randpromoterseqs,c("G","C"),as.prob=TRUE)); mcols(rand.gr)$GCcontent = GCcontent; ##Find where GC content is within 5% of the GC content of the original keepind = which(abs(mcols(rand.gr)$GCcontent - mcols(thischrprom)$GCcontent) <= 0.05); redoind = which(abs(mcols(rand.gr)$GCcontent - mcols(thischrprom)$GCcontent) > 0.05); ##The random promoters to keep if (length(keepind)>0) goodrandproms = c(goodrandproms,rand.gr[keepind]); ##If there are promoters which don't match GC, re-loop if (length(redoind)==0) { allpromsnotgoodflag = FALSE; } else { thischrprom = thischrprom[redoind]; cat(length(redoind),'\n'); } } } } goodrandpromsort = sort(goodrandproms) olap = findOverlaps(goodrandpromsort,HAR) goodrandpromsortrm = goodrandpromsort[c(-queryHits(olap))] goodrandproms = goodrandpromsortrm write.table(cbind(as.character(seqnames(goodrandproms)),start(goodrandproms),end(goodrandproms)),file="RandomGCMatchedHARs.bed",quote=FALSE,row.names=FALSE,col.names=FALSE); #################################################################### ## 02/04/2019 ## Codes by Hyejung Won: hyejung_won@med.unc.edu ## DHS enrichment analysis for HARs #################################################################### options(stringsAsFactors=F) library(GenomicRanges) library(ggplot2) load("phastCons.hg19.conservedregions.Rdata") # evolutionary conserved regions defined by phastCons score load("DHS_celltype.rda") # coordinates for DHS in each cell type; saved as chromranges; downloaded from Roadmap Epigenomics: http://www.roadmapepigenomics.org/ bedfile = read.table("HAR_coordinate.bed") colnames(bedfile)[1:3] = c("chrom","chromStart","chromEnd") haranges = GRanges(seqnames=bedfile$Chr,ranges=IRanges(as.numeric(bedfile$Start),as.numeric(bedfile$End))) olap = findOverlaps(haranges,chromranges); harivalues = haranges[queryHits(olap)]; mcols(harivalues) = cbind(mcols(haranges[queryHits(olap)]), mcols(chromranges[subjectHits(olap)])) olap = findOverlaps(conserved,chromranges); conivalues = conserved[queryHits(olap)]; mcols(conivalues) = cbind(mcols(conserved[queryHits(olap)]), mcols(chromranges[subjectHits(olap)])) chrommark = unique(chromranges$celltype) pval = c() enrichment = c() harnum = length(unique(haranges)) connum = length(unique(conserved)) finaldat = data.frame(pval=rep(NA,length(chrommark)), or=rep(NA,length(chrommark)), ci1=rep(NA,length(chrommark)), ci2=rep(NA,length(chrommark))) for(i in 1:length(chrommark)){ harcell = unique(harivalues[harivalues$celltype==chrommark[i]]) concell = unique(conivalues[conivalues$celltype==chrommark[i]]) cont.mat = matrix(c(length(harcell), harnum-length(harcell), length(concell), connum-length(concell)), nrow=2) # contingency matrix fisherdat = fisher.test(cont.mat) finaldat[i,"pval"] = fisherdat$p.value finaldat[i,"or"] = fisherdat$estimate finaldat[i,"ci1"] = fisherdat$conf.int[1] finaldat[i,"ci2"] = fisherdat$conf.int[2] print(i) } pvalhisg = data.frame("EpigeneticMarks"=rownames(finaldat), "Pval"=-log10(finaldat$pval)) orhisg = data.frame("EpigeneticMarks"=rownames(finaldat), "OR"=finaldat$or, "ci1"=finaldat$ci1, "ci2"=finaldat$ci2) pdf(file="HAR_enrichment_to_conserved_elements.pdf", width=15, height=4) barpl = ggplot(pvalhisg, aes(x=EpigeneticMarks, y=Pval)) barpl + geom_bar(stat="identity",position="dodge", fill="midnightblue") + theme_bw() + labs(y="-log(P-val)", fill="", x="") + theme(panel.grid.major =element_blank(), panel.grid.minor = element_blank(), axis.text.x=element_text(angle=90,hjust=1)) barpl = ggplot(orhisg, aes(x=EpigeneticMarks, y=OR)) barpl + geom_bar(stat="identity", position="dodge", fill="midnightblue") + theme_bw() + labs(y="OR", fill="", x="") + geom_errorbar(aes(ymin=ci1, ymax=ci2), width=.2, position=position_dodge(0.9)) + theme(panel.grid.major=element_blank(), panel.grid.minor = element_blank(), axis.text.x=element_text(angle=90,hjust=1)) dev.off() ## phastCons score matched genomic regions library(phastCons100way.UCSC.hg19) phast = phastCons100way.UCSC.hg19 phasthar = gscores(phast, haranges) phasthar[phasthar$default==1]$default = 0.999 phasthar$wid = width(phasthar) phastrange = seq(0,0.9,0.1) harbg = vector(length=10000, mode="list") for(j in 1:10000){ phastdefault = c() for(i in 1:length(phastrange)){ harwirange = phasthar[phasthar$default>=phastrange[i] & phasthar$default<(phastrange[i]+0.1)] load(paste0("./phastCons/phastCons.",(i-1),".hg19.conservedregions.Rdata")) conserved = conserved[seqnames(conserved) %in% c(paste0("chr", c(1:22,"X")))] permconserved = conserved[sample(length(conserved), length(harwirange), replace=F)] phastdefault = c(phastdefault, permconserved) } phastdefault = unlist(GRangesList(phastdefault)) phastdefault = unique(phastdefault) print(length(phastdefault)) harbg[[j]] = phastdefault print(j) } pval = c() enrichment = c() fisherP = fisherOR = bgratio = data.frame(matrix(NA, nrow=10000, ncol=length(chrommark))) colnames(fisherP) = colnames(fisherOR) = colnames(bgratio) = chrommark olap = findOverlaps(haranges,chromranges); dhsivalues = haranges[queryHits(olap)]; mcols(dhsivalues) = cbind(mcols(haranges[queryHits(olap)]), mcols(chromranges[subjectHits(olap)])) haratio = c() for(i in 1:length(chrommark)){ harchrom = unique(dhsivalues[dhsivalues$celltype==chrommark[i]]) haratio = c(haratio, length(harchrom)/length(haranges)) } names(haratio) = chrommark for(j in 1:10000){ hartarget = harbg[[j]] olap = findOverlaps(hartarget,chromranges); dhsibg = hartarget[queryHits(olap)]; mcols(dhsibg) = cbind(mcols(hartarget[queryHits(olap)]), mcols(chromranges[subjectHits(olap)])) for(i in 1:length(chrommark)){ harchrom = unique(dhsivalues[dhsivalues$celltype==chrommark[i]]) bgchrom = unique(dhsibg[dhsibg$celltype==chrommark[i]]) bgratio[j,i] = length(bgchrom)/length(hartarget) fisher2result = fisher.test(matrix(c(length(harchrom), length(haranges), length(bgchrom), length(hartarget)),2,2)) fisherP[j,i] = fisher2result$p.value fisherOR[j,i] = fisher2result$estimate } print(j) } enrichmentP = c() for(i in 1:length(chrommark)){ bgratio.cell = bgratio[,i] enrichmentP = c(enrichmentP, sum(haratio[i] < bgratio.cell)/length(bgratio.cell)) } orhisg = data.frame("EpigeneticMarks"=chrommark, "OR"=colMeans(fisherOR), "ci1"=colMeans(fisherOR)-apply(fisherOR, 2, sd), "ci2"=colMeans(fisherOR)+apply(fisherOR, 2, sd)) pdf(file="HAR_enrichment_to_conserved_matched_elements.pdf", width=15, height=4) barpl = ggplot(orhisg, aes(x=EpigeneticMarks, y=OR)) barpl + geom_bar(stat="identity", position="dodge", fill="midnightblue") + theme_bw() + labs(y="OR", fill="", x="") + geom_errorbar(aes(ymin=ci1, ymax=ci2), width=.2, position=position_dodge(0.9)) + theme(panel.grid.major=element_blank(), panel.grid.minor = element_blank(), axis.text.x=element_text(angle=90,hjust=1)) dev.off() #################################################################### ## 02/04/2019 ## Codes by Hyejung Won: hyejung_won@med.unc.edu ## dN/dS comparison #################################################################### options(stringsAsFactors=F) FBfile = "HARgenes.txt" # Hi-C interacting genes for HARs library(biomaRt) FB = unlist(read.table(FBfile)) ## Protein-coding genes from Gencode v19 getinfo = c("ensembl_gene_id","hgnc_symbol","chromosome_name","start_position","end_position","strand","band","gene_biotype") mart = useMart(biomart="ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl",host="feb2014.archive.ensembl.org") # Using Gencode v19 annotations geneAnno1 = getBM(attributes = getinfo,filters=c("chromosome_name"),values=c(seq(1,22,by=1),"X"),mart=mart) geneAnno1 = geneAnno1[geneAnno1[,"gene_biotype"]=="protein_coding",]; geneAnno1 = geneAnno1[!duplicated(geneAnno1[,"hgnc_symbol"]) & geneAnno1[,"hgnc_symbol"]!="",] ## 19163 with hgnc symbols after removing duplicates. We use gene symbols here because the RDNV data comes in gene symbol format. Otherwise, I prefer to use ENSG IDs. ## Select out protein-coding genes FBpc = FB[FB %in% geneAnno1$ensembl_gene_id] FBpc = geneAnno1[geneAnno1$ensembl_gene_id %in% FB, ] FBensg = FBpc$ensembl_gene_id FBhgnc = FBpc$hgnc_symbol mart.hs = mart listmarths = listAttributes(mart.hs) hommart = listmarths[grep("homolog", listmarths$name),1] # homologs mart allpc = geneAnno1[!(geneAnno1$ensembl_gene_id %in% FB), ] allensg = allpc$ensembl_gene_id allhgnc = allpc$hgnc_symbol FBdnds_mouse = c() FBdiffdnds = c() for(i in 1:length(FBensg)){ dn = getBM(attributes = "mmusculus_homolog_dn", filters = "ensembl_gene_id", values = FBensg[i], mart = mart.hs) ds = getBM(attributes = "mmusculus_homolog_ds", filters = "ensembl_gene_id", values = FBensg[i], mart = mart.hs) if(dim(dn)[1]!=0 & dim(ds)[1]!=0){ if(dim(dn)[1]==dim(ds)[1]){ FBdnds_mouse = rbind(FBdnds_mouse, c(FBensg[i], dn/ds)) print(i) }else{ FBdiffdnds = c(FBdiffdnds, i) } } } FBDdnds_mouse = c() FBDdiffdnds = c() for(i in 1:length(FBDensg)){ dn = getBM(attributes = "mmusculus_homolog_dn", filters = "ensembl_gene_id", values = FBDensg[i], mart = mart.hs) ds = getBM(attributes = "mmusculus_homolog_ds", filters = "ensembl_gene_id", values = FBDensg[i], mart = mart.hs) if(dim(dn)[1]!=0 & dim(ds)[1]!=0){ if(dim(dn)[1]==dim(ds)[1]){ FBDdnds_mouse = rbind(FBDdnds_mouse, c(FBDensg[i], dn/ds)) print(i) }else{ FBDdiffdnds = c(FBDdiffdnds, i) } } } alldnds_mouse = c() alldiffdnds = c() for(i in 1:length(allensg)){ dn = getBM(attributes = "mmusculus_homolog_dn", filters = "ensembl_gene_id", values = allensg[i], mart = mart.hs) ds = getBM(attributes = "mmusculus_homolog_ds", filters = "ensembl_gene_id", values = allensg[i], mart = mart.hs) if(dim(dn)[1]!=0 & dim(ds)[1]!=0){ if(dim(dn)[1]==dim(ds)[1]){ alldnds_mouse = rbind(alldnds_mouse, c(allensg[i], dn/ds)) print(i) }else{ alldiffdnds = c(alldiffdnds, i) } } } FBdnds = as.numeric(unlist(FBdnds_mouse[,2])) FBdnds = FBdnds[!is.na(FBdnds)]; FBdnds = FBdnds[FBdnds!=0 & FBdnds!="Inf"] Adnds = as.numeric(unlist(alldnds_mouse[,2])) Adnds = Adnds[!is.na(Adnds)]; Adnds = Adnds[Adnds!=0 & Adnds!="Inf"] logFBdnds = log2(FBdnds) logAdnds = log2(Adnds) ks.test(logFBdnds, logAdnds) densFB = density(log2(Pdnds)) densA = density(log2(Adnds)) xlim = range(-10,2); ylim = range(0,densFB$y, densA$y) Hcol = adjustcolor("darkorange1", 0.6) Lcol = adjustcolor("dodgerblue1", 0.6) pdf(file="density_of_dnds_mouse.pdf", width=6, height=2.9) par(mfrow=c(1,2),mar=c(3,3,1,1)) plot(densFB, xlim=xlim, ylim=ylim, main ='Distribution', xlab="", ylab="",cex.main=1.1, cex.axis=1.0, xaxt="n") polygon(densFB, density = -1, col = Hcol) polygon(densA, density = -1, col = Lcol) axis(1, at=seq(-10,2,by=2), las=1) mtext(side=1, text ='log2(dN/dS)', line=2, cex=1.1) mtext(side=2, text ='Density', line=2, cex=1.1) legend('topleft',c('HAR','All'), fill = c(Hcol, Lcol), bty = 'n', border = NA, cex=1.0) dev.off() #################################################################### ## 02/04/2019 ## Codes by Hyejung Won: hyejung_won@med.unc.edu ## Fisher's exact test #################################################################### options(stringsAsFactors = FALSE) library(biomaRt) hargene = unlist(read.table("HARgene.txt")) # evolutionary gene sets in hgnc symbol diseasegene = unlist(read.table("Diseasegene.txt")) # genes that are associated with a certain disorder getinfo = c("ensembl_gene_id","hgnc_symbol","chromosome_name","start_position","end_position","strand","band","gene_biotype") mart = useMart(biomart="ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl",host="feb2014.archive.ensembl.org") # Using Gencode v19 annotations geneAnno1 = getBM(attributes = getinfo,filters=c("chromosome_name"),values=c(seq(1,22,by=1),"X"),mart=mart) allgene= geneAnno1$hgnc_symbol exomeLength = read.table("exomelength.txt", header=T) ## Downloaded from the Supplementary Table of Parikshak et al., 2013 metaMat = exomeLength[match(rownames(metaMat),exomeLength$hgnc),"exomelength"] rownames(metaMat) = geneAnno1$hgnc_symbol colnames(metaMat) = "exomeLength" metaMat = cbind(metaMat, rep(NA, nrow(metaMat))) listname = "HAR" colnames(metaMat)[ncol(metaMat)] = listname matchlist = match(rownames(metaMat), hargene) metaMat[!is.na(matchlist), listname] = 1 metaMat[is.na(matchlist), listname] = 0 metaMat = cbind(metaMat, rep(NA, nrow(metaMat))) listname = "Disease" colnames(metaMat)[ncol(metaMat)] = listname matchlist = match(rownames(metaMat), diseasegene) metaMat[!is.na(matchlist), listname] = 1 metaMat[is.na(matchlist), listname] = 0 glm.out = glm(metaMat[,3]~metaMat[,2]+metaMat[,1],family=binomial) #################################################################### ## 02/04/2019 ## Codes by Hyejung Won: hyejung_won@med.unc.edu ## Human vs. Rhesus expression data ## Files and codes downloaded from: https://github.com/AllenBrainAtlas/DevRhesusLMD.git. #################################################################### # Load libraries library(reshape2) library(segmented) library(ggplot2) library(scatterplot3d) library(RColorBrewer) library(limma) # Load functions source(file="../src/fReorderFactorLevels.R") source(file="../src/fConvertPcdtoEventScore.R") # Try loading cached fits dev.pred.fn = "../cache/dev_expr_species/dev.expr_dev.pred_2hs.RData" try(load(dev.pred.fn), silent=TRUE) # Get genes you would like to test for evolgene = unlist(read.table("evol_gene.txt")) # Calc species expression variation across development rh.var = apply(dev.pred[["macaque"]], 1, function(x) sd(x) / mean(x)) h.var = apply(dev.pred[["human"]], 1, function(x) sd(x) / mean(x)) rat.var = apply(dev.pred[["rat"]], 1, function(x) sd(x) / mean(x)) # Let's check whether the variance differ for the genes that you selected for checkvar = function(genelist){ h.var.test = h.var[names(h.var) %in% genelist] h.var.ntest = h.var[!(names(h.var) %in% genelist)] rh.var.test = rh.var[names(rh.var) %in% genelist] rh.var.ntest = rh.var[!(names(rh.var) %in% genelist)] rat.var.test = rat.var[names(rat.var) %in% genelist] rat.var.ntest = rat.var[!(names(rat.var) %in% genelist)] wch = wilcox.test(h.var.test, h.var.ntest) wcrh = wilcox.test(rh.var.test, rh.var.ntest) wcrat = wilcox.test(rat.var.test, rat.var.ntest) th = t.test(h.var.test, h.var.ntest) trh = t.test(rh.var.test, rh.var.ntest) trat = t.test(rat.var.test, rat.var.ntest) hrhvar.test = log(h.var.test/rh.var.test) hrhvar.ntest = log(h.var.ntest/rh.var.ntest) wchrh = wilcox.test(hrhvar.test, hrhvar.ntest) thrh = t.test(hrhvar.test, hrhvar.ntest) vartestlist = list(wch, wcrh, wcrat, wchrh, th, trh, trat, thrh) names(vartestlist) = c("var_human:wilcox", "var_rhesus:wilcox", "var_rat:wilcox", "var_human_vs_rhesus:wilcox", "var_human:t-test", "var_rhesus:t-test", "var_rat:t-test", "var_human_vs_rhesus:t-test") return(vartestlist) } evol.var = checkvar(evolgene) # Let's plot developmental expression trajectories for a gene list dev.Expr = dev.expr[dev.expr$species %in% c("human", "macaque"),] dev.Expr$evol = as.factor(ifelse(dev.Expr$gene %in% evolgene, 1, 2)) ## Let's select out the Z-scores in critical epochs dev.expr.bound = dev.Expr dev.expr.bound$estage = ifelse(dev.expr.bound$escore<0.5, 0, 1) dev.expr.bound$estage = dev.expr.bound$estage + ifelse(dev.expr.bound$escore>0.5, 1, 0) dev.expr.bound$estage = dev.expr.bound$estage + ifelse(dev.expr.bound$escore>1, 1, 0) g2 = ggplot(dev.expr.bound, aes(x=evol, y=exprz)) + geom_boxplot(aes(fill=species)) + facet_wrap( ~ estage) + theme_bw() + theme(panel.grid.minor = element_blank()) + scale_color_manual(values = pal1) + scale_fill_manual(values = pal1) + labs( y="Normalized expression Z-score", x="", title = "Fetal Brain Developmental Expression: Evol vs. non-Evol") # Calculate statistics dev.expr.bound1 = dev.Expr[dev.Expr$escore>0.5 & dev.Expr$escore<1, ] dev.expr.bound1$index = paste(dev.expr.bound1$species, dev.expr.bound1$evol, sep=":") dev.expr.bound2 = dev.Expr[dev.Expr$escore<0.5, ] dev.expr.bound2$index = paste(dev.expr.bound2$species, dev.expr.bound2$evol, sep=":") aov1 = aov(exprz~index, data=dev.expr.bound1) TukeyHSD(aov1) aov2 = aov(exprz~index, data=dev.expr.bound2) TukeyHSD(aov2) g3 = ggplot(dev.expr.bound1, aes(exprz)) + geom_density(aes(fill=factor(index)), alpha=0.5) + scale_fill_manual(values = pal1) + labs( y="Density", x="Normalized expression Z-score", title="Evol vs. non-Evol") # Let's calculate Delta(Z) based on escore dev.expr.mq = dev.expr[dev.expr$species=="macaque",] dev.expr.hu = dev.expr[dev.expr$species=="human",] timepoint.mq = c(0.46,0.51,0.77) timepoint.hu = c(0.48,0.54,0.76) dev.expr.mq = dev.expr.mq[dev.expr.mq$escore %in% timepoint.mq, ] dev.expr.hu = dev.expr.hu[dev.expr.hu$escore %in% timepoint.hu, ] dev.mq = cbind(dev.expr.mq, dev.expr.hu) sum(dev.mq[,2] != dev.mq[,8]) dev.mq$zdiff = dev.mq[,11] - dev.mq[,5] dev.mq = dev.mq[,c(2,12,13)] dev.mq$evol = ifelse(dev.mq$gene %in% evolgene, "evol", "nonevol") g4 = ggplot(dev.mq, aes(zdiff)) + geom_density(aes(fill=factor(evol)), alpha=0.5) + scale_fill_manual(values = pal1) + labs( y="Density", x=expression(Delta*" Normalized expression Z-score (Human-Macaque)"), title="Evol vs. non-Evol") pdf("Evol_expression_traits.pdf", height=5, width=7) plot(g1) plot(g2) plot(g3) plot(g4) dev.off() # Load functions source(file="../src/fReorderFactorLevels.R") source(file="../src/fConvertPcdtoEventScore.R") # Load genes of interest load("/proj/hyejunglab/chr/geneAnno_allgenes.rda") hargene = unlist(read.table("HARgene.txt")) hugaingene = unlist(read.table("Hugaingene.txt")) hlegene = unlist(read.table("HLEgene.txt")) hgegene = unlist(read.table("HGEgene.txt")) # Load breakpoints for increasing genes in all species/regions bp.df = read.csv(file = "../cache/dev_expr_species/species_region_bp.csv", stringsAsFactors = FALSE) # SI Table 11 subset bp.df$bpmin = mapply(ConvertPcdtoEventScore, log2(bp.df$bp..pcd. - 1.96 * bp.df$bpse), bp.df$species) bp.df$bpmax = mapply(ConvertPcdtoEventScore, log2(bp.df$bp..pcd. + 1.96 * bp.df$bpse), bp.df$species) bp.subset = subset(bp.df, species %in% c("macaque", "human")) # & region == "ACG") bp.escore = dcast(bp.subset, region + gene ~ species, value.var = "bp..event.score.") bp.min = dcast(bp.subset, region + gene ~ species, value.var = "bpmin") bp.max = dcast(bp.subset, region + gene ~ species, value.var = "bpmax") bp.diff = data.frame(bp.escore, bp.min[, 3:4], bp.max[, 3:4], h_m_escore = bp.escore$human - bp.escore$macaque, hmin_mmax = bp.min$human - bp.max$macaque, hmax_mmin = bp.max$human - bp.min$macaque) bp.diff$sig = apply(bp.diff, 1, function(x) (as.numeric(x["h_m_escore"]) > 0 & min(as.numeric(x[c("hmin_mmax", "hmax_mmin")])) > 0) | (as.numeric(x["h_m_escore"]) < 0 & max(as.numeric(x[c("hmin_mmax", "hmax_mmin")])) < 0)) ### This file now contains breakpoints for a given gene in different speceis # Compare breakpoints based on used-defined groups bp.diff$har = ifelse(bp.diff$gene %in% hargene, "HAR", "nHAR") bp.diff$hugain = ifelse(bp.diff$gene %in% hugaingene, "HGE:FB", "nHGE") bp.diff$hge = ifelse(bp.diff$gene %in% hgegene, "HGE:AB", "nHGE") bp.diff$hle = ifelse(bp.diff$gene %in% hlegene, "HLE:AB", "nHLE") wilcox.test(h_m_escore~har, data=bp.diff) wilcox.test(h_m_escore~hugain, data=bp.diff) wilcox.test(h_m_escore~hge, data=bp.diff) wilcox.test(h_m_escore~hle, data=bp.diff) # Summarize genes that have significantly different breakpoints between human and macaque based on event scores diff.summary = data.frame() # Human early genes for (region1 in unique(bp.subset$region)) { genes = subset(bp.diff, region == region1 & sig == TRUE & h_m_escore < 0)$gene diff1 = with(subset(bp.subset, region == region1 & species == "human" & gene %in% genes), data.frame(set = "human_early", region1, Plateau = sum(! slope2sig), Decreasing = sum(slope2sig & slope2 < 0), Increasing = sum(slope2sig & slope2 > 0))) diff.summary = rbind(diff.summary, diff1) } humanearlygene = diffdec = diffinc = c() for(region1 in c("V1","ACG")){ genes = subset(bp.diff, region==region1 & sig==TRUE & h_m_escore<0)$gene humanearlygene = union(humanearlygene, genes) diffdec = union(diffdec, subset(bp.subset, region == region1 & species == "human" & gene %in% genes & slope2sig & slope2<0)$gene) diffinc = union(diffinc, subset(bp.subset, region == region1 & species == "human" & gene %in% genes & slope2sig & slope2>0)$gene) } # Run Fisher's exact test bgenes = unique(bp.diff$gene) # These genes show distinct changes in expression trajectories, or breakpoints. A total of 179 increasing and 179 decreasing genes met these criteria in all three species harbg = intersect(hargene, bgenes) harhuearly = intersect(humanearlygene, hargene) fisher.test(matrix(c(length(harhuearly), length(setdiff(harbg, harhuearly)), length(setdiff(humanearlygene, harhuearly)), length(setdiff(bgenes, union(harbg, humanearlygene)))),2,2)) hugainbg = intersect(hugaingene, bgenes) hugainhuearly = intersect(humanearlygene, hugaingene) fisher.test(matrix(c(length(hugainhuearly), length(setdiff(hugainbg, hugainhuearly)), length(setdiff(humanearlygene, hugainhuearly)), length(setdiff(bgenes, union(hugainbg, humanearlygene)))),2,2)) # Human late genes for (region1 in unique(bp.subset$region)) { genes = subset(bp.diff, region == region1 & sig == TRUE & h_m_escore > 0)$gene diff1 = with(subset(bp.subset, region == region1 & species == "human" & gene %in% genes), data.frame(set = "human_late", region1, Plateau = sum(! slope2sig), Decreasing = sum(slope2sig & slope2 < 0), Increasing = sum(slope2sig & slope2 > 0))) diff.summary = rbind(diff.summary, diff1) } diff.summaryl = melt(diff.summary, id = c("set", "region1")) colnames(diff.summaryl)[3:4] = c("slope_after_breakpoint", "num_genes") num.genes = diff.summaryl[diff.summaryl$set == "human_early", "num_genes"] diff.summaryl[diff.summaryl$set == "human_early", "num_genes"] = -num.genes diff.summaryl$region1 = factor(diff.summaryl$region1, levels = c("STR", "AM", "HP", "V1", "ACG"))