# Script detailing crucial parts of the statistical analysis of the supplementary data for # One size fits all? # Direct evidence for the heterogeneity of genetic drift throughout the genome. # Biology Letters, 2016 # Script to be deposited together with dataset as supplementary material. # A pdf version of the script and output is also made available for check. # Thomas Bataillon & Belen Jimenez Mena, June 16th 2016. # Updated March 29th 2015 by TB to add regression analysis & visual checks. # Updated April 14th 2015 by TB to add last checks on the data and main text stats. # Updated Sep 2015 by TB and BJM to add extra tests. # Updated April-May 2016 by TB and BJM to add extra tests. #### Importing the dataset of Ne per window#### # Please specify the path to the directory containing the data below setwd("/Users/tbata/Dropbox/Research/Ne_CowGenome/4. Resubmission_BiologyLetters_042016/dataset_and_analysis/100SNPs_analysis/") setwd("/Users/bjm/Dropbox/Ne_CowGenome/5. Final resubmission_02062016/updated_dryad/") #### Importing DATA of bins from the 1995-2000, 2000-2005 and 1995-2005 time intervals#### cowBins=read.table(file = "NeGenomicWindows100SNPs.txt", sep=" ", header=T) head(cowBins) # a few lines from the data names(cowBins) # names of columns, see description below #### Meta-Data for the dataset of Ne per window #### # chrm: autosome or origin for the window (1 to 29) # bin: genomic window number (dummy variable not relevant for the analysis) # SizeBin: size of the window in numbers of nucleotides (bp) # Ne_1995_2000: realized drift effective size in the window for the time period 1995->2000 # SE_1995_2000: SE around the Ne estimate (based on 10000 bootstraps) (see ESM for more information) # Ne_2000_2005: realized drift effective size in the window for the time period 2000->2005 # SE_2000_2005: SE around the Ne estimate (based on 10000 bootstraps) (see ESM for more information) # Ne_1995_2005: realized drift effective size in the window for the time period 1995->2005 # SE_1995_2005: SE around the Ne estimate (based on 10000 bootstraps) (see ESM for more information) # No_SS: Number of selective sweeps in the window 0, No sweep, 1, 2, 3 sweeps reported in a window # Length_SS: Length of selective sweeps in window in numbers of nucleotides (bp) # GeneContent_bp: Number of bp that consist in predicted genes in the window # LRR: local recombination rate (centi Morgan per Megabase) # Code_QTL: code describing if a QTL was detected in the window for economic traits (see supp methods), # 0: no QTL, 1,2,3 resp QTL detected for 1,2,3 traits # taking out chromosome 30 (sexual chromosome) from the analysis: cowBins <- cowBins[cowBins$Chromosome!=30,] tail(cowBins) # returns the last bit of the matrix # Making sure chromosome variable is a factor cowBins$Chromosome= as.factor(cowBins$Chromosome) #### Data checks and Table 2#### dim(cowBins) # dimension of the data summary(cowBins$Ne_1995_2000) # media and median of Ne for this specific interval summary(cowBins$SE_1995_2000) # SE for this specific interval summary(cowBins$Ne_2000_2005) summary(cowBins$SE_2000_2005) summary(cowBins$Ne_1995_2005) summary(cowBins$SE_1995_2005) table(cowBins$Chromosome) summary(lm(Ne_1995_2000~Chromosome, data=cowBins)) summary(lm(Ne_2000_2005~Chromosome, data=cowBins)) summary(lm(Ne_1995_2005~Chromosome, data=cowBins)) boxplot(Ne_1995_2000~Chromosome, data=cowBins) summary(cowBins$SizeBin/(10^6)) boxplot(logNe~Chromosome, data=cowBins) abline(h=median(cowBins$logNe)) #### Relation between SE and estimated Ne #### plot(cowBins$Ne_1995_2000,cowBins$SE_1995_2000, xlab= "Realized Ne in a Bin", ylab= "SE on realized Ne", xlim=c(0, 150), ylim=c(0,50), pch=19, cex=0.2, col="grey30", cex.axis=1.4) LocalRegNeSE=lowess(cowBins$Ne_1995_2000,cowBins$SE_1995_2000, f = 0.2) lines(LocalRegNeSE, lwd=2, lty=1, col="magenta") #### Rough local linear fit subsetNe=subset(cowBins,cowBins$Ne_1995_2000 <70 ) #<70 SE are much smaller dim(subsetNe) #325 bins out of 472 VeryLocalreg=lm(SE_1995_2000~Ne_1995_2000, data= subsetNe) VeryLocalreg$coefficients #abline(VeryLocalreg, col="cornflowerblue",lwd=2) predictSEs=function(paramNe) { #returns the expected SE as a function of underlying Ne return(max(5,(-2.24 +0.3*paramNe))) ##SE is at least 5 } xs<-seq(15, 120, by = 1) predictSEs(62) ys=rep(0,length(xs)) for (i in 1:length(xs)) ys[i]=predictSEs(xs[i]) lines(xs,ys, col="cornflowerblue", lwd=3, lty=1) ####simulation of variation between bins #### #data hist(cowBins$Ne_1995_2000,breaks = 25, freq = F, ylim=c(0,0.025), xlim=c(0,300), xlab="Realized Ne in bins with 100 SNps", main="") # H0: 472 bins simulated with median Ne and corresponding SE summary(cowBins$Ne_1995_2000) mean(cowBins$Ne_1995_2000) # mean of the dataset median(cowBins$Ne_1995_2000) # median of the dataset - reported in main text sqrt(var(cowBins$Ne_1995_2000)) # sd of the dataset - reported in main text predictSEs(60.37) # SE corresponding to the mean of the dataset # We generate a distribution with the same characteristics as my dataset # and we test how it fits lines(density(rnorm(n = 447,mean = 60.37, sd = 16),bw = 16), lwd=2, lty=3) ## H0 single Ne with sampling variance estimated from the data observedSDAmongBins=sqrt(var(cowBins$Ne_1995_2000)) # H0Mean=mean(cowBins$Ne_1995_2000) median(cowBins$Ne_1995_2000) H0sd=37 ##generous and chosen given the empirical relationship derived between realized Ne and SE on Ne. #15 #Simulation of the null distribution for the observed SD among bins H0SD=rep(-1,100000) for (i in 1: 100000) { myPseudoBins=rnorm(n = 447, mean=H0Mean, sd=16.3 ) H0SD[i]=sqrt(var(myPseudoBins)) } summary(H0SD) hist(H0SD) # The distribution of the SD I obtained from 10000 simulations ##HA generating Ne in realized bins with some extra heterogeneity # (true underlying Ne variation) # generating variation in true underlying Ne values # From the Ne estimated from my model from the data, we add two diff error noises: # 1) we estimate a SE associated with the Ne(obtained from the empirical relationship Ne vs SE) # 2) + another SE that has a distribution that follows a Normal # We add these two "noises" to the estimation of Ne #NeDisPerbins=rnorm(n = 447, mean = 60, sd = 20) ##Normal distribution #hist(NeDisPerbins) nSims= 1000000 ##number of simulated bins to approximate the true distribution NeDisPerbins= rlnorm(nSims, meanlog = 3.74, sdlog = 0.55) # underlying true variation in Ne hist(NeDisPerbins) summary(NeDisPerbins) # mean and median quite similar to the data; just to confirm sqrt(var(NeDisPerbins) ) mean(NeDisPerbins) #obtaining the expected error on each bin (following the empirical relationship Ne vs SE) SEsForEachBins=rep(-1,nSims) for(i in 1:nSims) SEsForEachBins[i]=predictSEs(NeDisPerbins[i]) #generating the pseudo data as the sum of true Ne + sampling error SimulatedNePerBins=NeDisPerbins+SEsForEachBins sqrt(var(SimulatedNePerBins)) mean(SimulatedNePerBins) (observedSDAmongBins=sqrt(var(cowBins$Ne_1995_2000))) (mean(cowBins$Ne_1995_2000)) #graphical overview of the fit of H0 and adjusted distribution + empirical dis #### hist(cowBins$Ne_1995_2000,breaks = 40,freq = F, ylim=c(0,0.025), xlim=c(0,300), xlab="Estimated Ne in windows with 100 SNPs", main="", col="grey70", cex.axis=0.8, cex.lab=1, lty=0, las=1, ylab="Density") HAf=density(SimulatedNePerBins,bw = 4) lines(HAf$x,HAf$y, lwd=3, lty=1, col="grey20", xlim=c(70,200)) ## HA var in Ne with sampling variance estimated from the data # Adding H0 distribution single Ne with sampling variance estimated from the data HoDeviates=rnorm(n = nSims,mean = 60.37, sd = 16) summary(HoDeviates) xs=seq(from=0,to=200, by=0.1) yH0=dnorm(x=xs,mean = 60.37, sd = 16) lines(xs,yHo, lwd=2.5,lty=2) lines(density(HoDeviates,bw = 4), lwd=3, lty=2, xlim=c(150,200)) #### Linear models to explore variation in Ne #### cowBins$logNe_1995_2000 <- log(cowBins$Ne_1995_2000) # working on the log of the Ne to do regressions cowBins$logNe_2000_2005 <- log(cowBins$Ne_2000_2005) cowBins$logNe_1995_2005 <- log(cowBins$Ne_1995_2005) hist(cowBins$logNe_1995_2000) shapiro.test(cowBins$logNe_1995_2000) ## it is NOT normal m0<-lm(logNe_1995_2000~1, data= cowBins) m1<-lm(logNe_1995_2000~Chromosome, data= cowBins) summary(m1) anova(m0,m1) # pvalue and F estimate for Table 1 anova(m0,m1,test = "Chisq") plot(m1) shapiro.test(m1$residuals) head(cowBins) #### Regression analysis accounting for the heteroscedasticity of sampling variance #### # contrasting model with all explaining variable with sub models # Test for each interval (1995-2000, 2000-2005, 1995-2005) cowBins$propGene <- cowBins$GeneContent_bp/cowBins$SizeBin # proportion of genes in window cowBins$propSS <- cowBins$Length_SS/cowBins$SizeBin # proportion of selective sweep in window # filter the data to remove the value (only 1) that is NA in LRR: cowBins <- subset(cowBins,!is.na(LRR)) dim(cowBins) # 1995-2000 m0_95_00_null <-lm(logNe_1995_2000~1, data= cowBins, weights=1/(cowBins$SE_1995_2000)) m1_95_00_chr <-lm(logNe_1995_2000~Chromosome, data= cowBins, weights=1/(cowBins$SE_1995_2000)) anova(m0_95_00_null, m1_95_00_chr) # TABLE S2 -> pvalue # Table 1 -> pvalue and F test for chromosome effect #Full model including chrm, local rec rate (LRR), gene density (propGene), past sweeps, QTLs m3_95_00_other <- lm(logNe_1995_2000~Chromosome+ LRR + SizeBin + propGene + No_SS + Code_QTL, data= cowBins, weights=1/(cowBins$SE_1995_2000)) summary(m3_95_00_other) # SS is significant # Table 1 -> estimate and SE anova(m3_95_00_other,m1_95_00_chr) # TABLE S2-> pvalue #2000-2005 m0_00_05_null <-lm(logNe_2000_2005~1, data= cowBins, weights=1/(cowBins$SE_2000_2005)) m1_00_05_chr <-lm(logNe_2000_2005~Chromosome, data= cowBins, weights=1/(cowBins$SE_2000_2005)) anova(m0_00_05_null, m1_00_05_chr) # TABLE S2 #Full model including chrm, local rec rate (LRR), gene density (propGene), past sweeps, QTLs m3_00_05_other <- lm(logNe_2000_2005~Chromosome + LRR + SizeBin+ propGene + No_SS+ Code_QTL, data= cowBins, weights=1/(cowBins$SE_2000_2005)) summary(m3_00_05_other) anova(m3_00_05_other,m1_00_05_chr) # TABLE S2 #1995-2005 m0_95_05_null <-lm(logNe_1995_2005~1, data= cowBins, weights=1/(cowBins$SE_1995_2005)) m1_95_05_chr <-lm(logNe_1995_2005~Chromosome, data= cowBins, weights=1/(cowBins$SE_1995_2005)) anova(m0_95_05_null, m1_95_05_chr) # TABLE S2 #Full model including chrm, local rec rate (LRR), gene density (propGene), past sweeps, QTLs m3_95_05_other <- lm(logNe_1995_2005~Chromosome+ LRR+ SizeBin+ propGene + No_SS+ Code_QTL, data= cowBins, weights=1/(cowBins$SE_1995_2005)) summary(m3_95_05_other) anova(m3_95_05_other, m1_95_05_chr) # TABLE S2 #using the m1 model as background to test specific extra factors # Exploring each covariate alone: # null model: model0_95_00 <- lm(logNe_1995_2000~Chromosome, data = cowBins, weights=1/(cowBins$SE_1995_2000)) # SelectiveSweep model1SS_95_00 <- lm(logNe_1995_2000~Chromosome+ No_SS, data = cowBins, weights=1/(cowBins$SE_1995_2000)) anova(model1SS_95_00, model0_95_00) # Table 1 -> pvalue and F test #LRR model1LRR_95_00 <- lm(logNe_1995_2000~Chromosome + LRR, data = cowBins, weights=1/(cowBins$SE_1995_2000)) anova(model1LRR_95_00, model0_95_00) # Table 1 -> pvalue and F test #GeneContent model1Gene_95_00 <- lm(logNe_1995_2000~Chromosome+ propGene, data = cowBins, weights=1/(cowBins$SE_1995_2000)) anova(model1Gene_95_00, model0_95_00) # Table 1 -> pvalue and F test #SizeBin model1size_95_00 <- lm(logNe_1995_2000~Chromosome+ SizeBin, data = cowBins, weights=1/(cowBins$SE_1995_2000)) anova(model1size_95_00, model0_95_00) # Table 1 -> pvalue and F test #QTL: model1QTL_95_00 <- lm(logNe_1995_2000~Chromosome+ Code_QTL, data = cowBins, weights=1/(cowBins$SE_1995_2000)) anova(model1QTL_95_00, model0_95_00) # Table 1 -> pvalue and F test #### Selective sweeps and Ne #### table(cowBins$No_SS) #contrasting the Ne in zero versus 1, 2 or more SS cowBins$No_SS <- replace(cowBins$No_SS, cowBins$No_SS>3, 3) # there are No_SS=4, so we add them to the category SS>3 table(cowBins$No_SS) boxplot(logNe~No_SS, data=cowBins) hist(cowBins$logNe, class=20, xlab="Realized Ne per window") NeWithSS=subset(cowBins, cowBins$No_SS>0) NeNoSS <- subset(cowBins, cowBins$No_SS==0) wilcox.test(NeNoSS$Ne_1995_2000, NeWithSS$Ne_1995_2000, alternative = "less") wilcox.test(NeNoSS$logNe, NeWithSS$logNe, alternative = "less") # pvalue = 0.56 boxplot(logNe~No_SS, data=cowBins, xlab="Number of Selective sweeps per window", ylab="log (realized Ne)", varwidth = TRUE, xaxt="n") axis(1, at=c(1:4), labels=c(expression("0", '1','2',paste(phantom(0) >= phantom(0),"3")))) # phantom is used to create the symbol ">=" # Correlation with QTL: #### QTL: Distribution of Ne in bins with / without QTL #### table(cowBins$Code_QTL) sum(table(cowBins$Code_QTL)) boxplot(logNe~Code_QTL, data=cowBins, xlab="Number of QTLs per bin", ylab="log Ne", varwidth = TRUE) table(cowBins$Code_QTL) hist(cowBins$logNe, class=20, xlab="Realized Ne per window") bin3=subset(cowBins, cowBins$Code_QTL==3) dim(bin3) head(bin3) allbinsBut3=subset(cowBins, cowBins$Code_QTL<3) dim(allbinsBut3) head(allbinsBut3) rug(bin3$logNe, lwd=2, col="red") wilcox.test(bin3$logNe, allbinsBut3$logNe, alternative = "less") # same analysis but with presence of >=2 QTL bin1orMore=subset(cowBins, cowBins$Code_QTL>=1) dim(bin1orMore) head(bin1orMore) allbinsBut2orMore=subset(cowBins, cowBins$Code_QTL<1) dim(allbinsBut2orMore) head(allbinsBut2orMore) rug(bin2orMore$logNe, lwd=2, col="red") wilcox.test(bin1orMore$logNe, allbinsBut2orMore$logNe, alternative = "less") #### LRR and Ne #### mLRR_95_00 <- lm(Ne_1995_2000~LRR, data=cowBins) summary(mLRR_95_00) cor.test(cowBins$LRR,cowBins$Ne_1995_2000) # Figure S2 plot(Ne_1995_2000~LRR, data=cowBins, xlab="Local Recombination Rate per window", ylab = "Estimated Ne per window", cex=0.3) mLRR_95_00$coefficients abline(mLRR_95_00, col="cornflowerblue",lwd=2) #### PropGene and Ne #### mGene_95_00 <- lm(Ne_1995_2000~propGene, data=cowBins) summary(mGene_95_00) cor.test(cowBins$propGene,cowBins$Ne_1995_2000) plot(Ne_1995_2000~propGene, data=cowBins, xlab="Proportion of genes per window", ylab = "Estimated Ne per window", cex=0.3) plot(Ne_1995_2000~propGene, data=cowBins) mGene_95_00$coefficients abline(mGene_95_00, col="cornflowerblue",lwd=2) #### Ne chromosome variation: #### cowChrm = read.table(file = "NeGenomicChromosome.txt", sep=" ", header=T) #### Meta-Data for the dataset of Ne chromosome variation #### # Chromosome: autosome or origin for the window (1 to 29) # SizeBin: size of the chromosome analysed in numbers of nucleotides (bp) # Ne_1995_2000: realized drift effective size in the chromosome for the time period 1995->2000 # SE_1995_2000: SE around the Ne estimate (based on 10000 bootstraps) # Ne_2000_2005: realized drift effective size in the chromosome for the time period 2000->2005 # SE_2000_2005: SE around the Ne estimate (based on 10000 bootstraps) # Ne_1995_2005: realized drift effective size in the chromosome for the time period 1995->2005 # SE_1995_2005: SE around the Ne estimate (based on 10000 bootstraps) head(cowChrm) # taking out chromosome 30: cowChrm <- cowChrm[cowChrm$Chromosome!=30,] summary(cowChrm$Ne_1995_2000) # main text - mean, median, max, min #### Session info to maximize reproducibility of the analysis #### sessionInfo()