# Script detailing crucial parts of the statistical analysis of the supplementary data for 
# One size fits all?
# Direct evidence for the heterogeneity of genetic drift throughout the genome. 
# Biology Letters, 2016

# Script to be deposited together with dataset as supplementary material.
# A pdf version of the script and output is also made available for check. 

# Thomas Bataillon & Belen Jimenez Mena, June 16th 2016.
# Updated March 29th 2015 by TB to add regression analysis & visual checks.
# Updated April 14th 2015 by TB to add last checks on the data and main text stats.
# Updated Sep 2015 by TB and BJM to add extra tests.
# Updated April-May 2016 by TB and BJM to add extra tests.

#### Importing the dataset of Ne per window####
# Please specify the path to the directory containing the data below
setwd("/Users/tbata/Dropbox/Research/Ne_CowGenome/4. Resubmission_BiologyLetters_042016/dataset_and_analysis/100SNPs_analysis/")
setwd("/Users/bjm/Dropbox/Ne_CowGenome/5. Final resubmission_02062016/updated_dryad/")


#### Importing DATA of bins from the 1995-2000, 2000-2005 and 1995-2005 time intervals####
cowBins=read.table(file = "NeGenomicWindows100SNPs.txt", 
                   sep=" ", header=T)

head(cowBins) # a few lines from the data
names(cowBins) # names of columns, see description below

#### Meta-Data for the dataset of Ne per window ####

# chrm: autosome or origin for the window (1  to 29)
# bin: genomic window number (dummy variable not relevant for the analysis)
# SizeBin: size of the window in numbers of nucleotides (bp)
# Ne_1995_2000: realized drift effective size in the window for the time period 1995->2000
# SE_1995_2000: SE around the Ne estimate (based on 10000 bootstraps) (see ESM for more information)
# Ne_2000_2005: realized drift effective size in the window for the time period 2000->2005
# SE_2000_2005: SE around the Ne estimate (based on 10000 bootstraps) (see ESM for more information)
# Ne_1995_2005: realized drift effective size in the window for the time period 1995->2005
# SE_1995_2005: SE around the Ne estimate (based on 10000 bootstraps) (see ESM for more information)
# No_SS: Number of selective sweeps in the window 0, No sweep, 1, 2, 3 sweeps reported in a window
# Length_SS: Length of selective sweeps in window in numbers of nucleotides (bp)
# GeneContent_bp: Number of bp that consist in predicted genes in the window
# LRR: local recombination rate (centi Morgan per Megabase)
# Code_QTL: code describing if a QTL was detected in the window for economic traits (see supp methods), 
# 0: no QTL, 1,2,3 resp QTL detected for 1,2,3 traits


# taking out chromosome 30 (sexual chromosome) from the analysis:
cowBins <- cowBins[cowBins$Chromosome!=30,]
tail(cowBins) # returns the last bit of the matrix

# Making sure chromosome variable is a factor
cowBins$Chromosome= as.factor(cowBins$Chromosome)

#### Data checks and Table 2####
dim(cowBins) # dimension of the data

summary(cowBins$Ne_1995_2000) # media and median of Ne for this specific interval
summary(cowBins$SE_1995_2000) # SE for this specific interval
summary(cowBins$Ne_2000_2005)
summary(cowBins$SE_2000_2005)
summary(cowBins$Ne_1995_2005)
summary(cowBins$SE_1995_2005)

table(cowBins$Chromosome)

summary(lm(Ne_1995_2000~Chromosome, data=cowBins))
summary(lm(Ne_2000_2005~Chromosome, data=cowBins))
summary(lm(Ne_1995_2005~Chromosome, data=cowBins))

boxplot(Ne_1995_2000~Chromosome, data=cowBins)

summary(cowBins$SizeBin/(10^6))

boxplot(logNe~Chromosome, data=cowBins)
abline(h=median(cowBins$logNe))

#### Relation between SE and estimated Ne ####

plot(cowBins$Ne_1995_2000,cowBins$SE_1995_2000, xlab= "Realized Ne in a Bin", ylab= "SE on realized Ne", xlim=c(0, 150), ylim=c(0,50), pch=19, cex=0.2, col="grey30", cex.axis=1.4)
LocalRegNeSE=lowess(cowBins$Ne_1995_2000,cowBins$SE_1995_2000, f = 0.2)
lines(LocalRegNeSE, lwd=2, lty=1, col="magenta")

#### Rough local linear fit
subsetNe=subset(cowBins,cowBins$Ne_1995_2000 <70 ) #<70 SE are much smaller
dim(subsetNe) #325 bins out of 472
VeryLocalreg=lm(SE_1995_2000~Ne_1995_2000, data= subsetNe)
VeryLocalreg$coefficients
#abline(VeryLocalreg, col="cornflowerblue",lwd=2)

predictSEs=function(paramNe) { #returns the expected SE as a function of underlying Ne
  return(max(5,(-2.24 +0.3*paramNe))) ##SE is at least 5
}
xs<-seq(15, 120, by = 1)
predictSEs(62)
ys=rep(0,length(xs))
for (i in 1:length(xs)) ys[i]=predictSEs(xs[i])
lines(xs,ys, col="cornflowerblue", lwd=3, lty=1)


####simulation of variation between bins ####
#data
hist(cowBins$Ne_1995_2000,breaks = 25, freq = F, ylim=c(0,0.025), xlim=c(0,300), xlab="Realized Ne in bins with 100 SNps", main="")
# H0: 472 bins simulated with median Ne and corresponding SE

summary(cowBins$Ne_1995_2000)
mean(cowBins$Ne_1995_2000) # mean of the dataset
median(cowBins$Ne_1995_2000) # median of the dataset - reported in main text
sqrt(var(cowBins$Ne_1995_2000)) # sd of the dataset - reported in main text
predictSEs(60.37) # SE corresponding to the mean of the dataset

# We generate a distribution with the same characteristics as my dataset
# and we test how it fits
lines(density(rnorm(n = 447,mean = 60.37, sd = 16),bw = 16), lwd=2, lty=3) 
## H0 single Ne with sampling variance estimated from the data

observedSDAmongBins=sqrt(var(cowBins$Ne_1995_2000)) # 
H0Mean=mean(cowBins$Ne_1995_2000)
median(cowBins$Ne_1995_2000)
H0sd=37 ##generous and chosen given the empirical relationship derived between realized Ne and SE on Ne. #15

#Simulation of the null distribution for the observed SD among bins
H0SD=rep(-1,100000)
for (i in 1: 100000) {
  myPseudoBins=rnorm(n = 447, mean=H0Mean, sd=16.3 ) 
  H0SD[i]=sqrt(var(myPseudoBins)) 
}
summary(H0SD)
hist(H0SD) # The distribution of the SD I obtained from 10000 simulations 

##HA generating Ne in realized bins with some extra heterogeneity 
# (true underlying Ne variation)
# generating variation in true underlying Ne values
# From the Ne estimated from my model from the data, we add two diff error noises:
# 1) we estimate a SE associated with the Ne(obtained from the empirical relationship Ne vs SE)
# 2) + another SE that has a distribution that follows a Normal
# We add these two "noises" to the estimation of Ne

#NeDisPerbins=rnorm(n = 447, mean = 60, sd = 20) ##Normal distribution
#hist(NeDisPerbins)
nSims= 1000000 ##number of simulated bins to approximate the true distribution
NeDisPerbins= rlnorm(nSims, meanlog = 3.74, sdlog = 0.55) # underlying true variation in Ne
hist(NeDisPerbins)
summary(NeDisPerbins) # mean and median quite similar to the data; just to confirm
sqrt(var(NeDisPerbins) )
mean(NeDisPerbins)
#obtaining the expected error on each bin (following the empirical relationship Ne vs SE)
SEsForEachBins=rep(-1,nSims)
for(i in 1:nSims) SEsForEachBins[i]=predictSEs(NeDisPerbins[i]) 
#generating the pseudo data as the sum of true Ne + sampling error
SimulatedNePerBins=NeDisPerbins+SEsForEachBins
sqrt(var(SimulatedNePerBins))
mean(SimulatedNePerBins)
(observedSDAmongBins=sqrt(var(cowBins$Ne_1995_2000)))
(mean(cowBins$Ne_1995_2000))



#graphical overview of the fit of H0 and adjusted distribution + empirical dis ####
hist(cowBins$Ne_1995_2000,breaks = 40,freq = F, ylim=c(0,0.025), xlim=c(0,300),
     xlab="Estimated Ne in windows with 100 SNPs", main="", 
     col="grey70", cex.axis=0.8, cex.lab=1, lty=0, las=1, ylab="Density")
HAf=density(SimulatedNePerBins,bw = 4)
lines(HAf$x,HAf$y, lwd=3, lty=1, col="grey20", xlim=c(70,200)) 

## HA var in Ne with sampling variance estimated from the data

# Adding H0 distribution single Ne with sampling variance estimated from the data
HoDeviates=rnorm(n = nSims,mean = 60.37, sd = 16)
summary(HoDeviates)
xs=seq(from=0,to=200, by=0.1)
yH0=dnorm(x=xs,mean  = 60.37, sd = 16)
lines(xs,yHo, lwd=2.5,lty=2)
lines(density(HoDeviates,bw = 4), lwd=3, lty=2, xlim=c(150,200))


#### Linear models to explore variation in Ne ####

cowBins$logNe_1995_2000 <- log(cowBins$Ne_1995_2000) # working on the log of the Ne to do regressions
cowBins$logNe_2000_2005 <- log(cowBins$Ne_2000_2005)
cowBins$logNe_1995_2005 <- log(cowBins$Ne_1995_2005)


hist(cowBins$logNe_1995_2000) 
shapiro.test(cowBins$logNe_1995_2000) ## it is NOT normal

m0<-lm(logNe_1995_2000~1, data= cowBins)
m1<-lm(logNe_1995_2000~Chromosome, data= cowBins)
summary(m1)

anova(m0,m1) # pvalue and F estimate for Table 1
anova(m0,m1,test = "Chisq")

plot(m1)
shapiro.test(m1$residuals)  

head(cowBins)

#### Regression analysis accounting for the heteroscedasticity of sampling variance ####
# contrasting model with all explaining variable with sub models
# Test for each interval (1995-2000, 2000-2005, 1995-2005)

cowBins$propGene <- cowBins$GeneContent_bp/cowBins$SizeBin # proportion of genes in window
cowBins$propSS <- cowBins$Length_SS/cowBins$SizeBin # proportion of selective sweep in window

# filter the data to remove the value (only 1) that is NA in LRR:
cowBins <- subset(cowBins,!is.na(LRR))

dim(cowBins)

# 1995-2000
m0_95_00_null <-lm(logNe_1995_2000~1, data= cowBins, weights=1/(cowBins$SE_1995_2000))
m1_95_00_chr <-lm(logNe_1995_2000~Chromosome, data= cowBins, weights=1/(cowBins$SE_1995_2000)) 
anova(m0_95_00_null, m1_95_00_chr) # TABLE S2 -> pvalue
                                   # Table 1 -> pvalue and F test for chromosome effect

#Full model including chrm, local rec rate (LRR), gene density (propGene), past sweeps, QTLs
m3_95_00_other <- lm(logNe_1995_2000~Chromosome+ LRR + SizeBin + propGene + No_SS + Code_QTL, data= cowBins, weights=1/(cowBins$SE_1995_2000))
summary(m3_95_00_other) # SS is significant
                        # Table 1 -> estimate and SE
anova(m3_95_00_other,m1_95_00_chr) # TABLE S2-> pvalue


#2000-2005
m0_00_05_null <-lm(logNe_2000_2005~1, data= cowBins, weights=1/(cowBins$SE_2000_2005))
m1_00_05_chr <-lm(logNe_2000_2005~Chromosome, data= cowBins, weights=1/(cowBins$SE_2000_2005)) 
anova(m0_00_05_null, m1_00_05_chr) # TABLE S2

#Full model including chrm, local rec rate (LRR), gene density (propGene), past sweeps, QTLs
m3_00_05_other <- lm(logNe_2000_2005~Chromosome + LRR + SizeBin+ propGene + No_SS+ Code_QTL, data= cowBins, weights=1/(cowBins$SE_2000_2005))
summary(m3_00_05_other)
anova(m3_00_05_other,m1_00_05_chr) # TABLE S2

#1995-2005
m0_95_05_null <-lm(logNe_1995_2005~1, data= cowBins, weights=1/(cowBins$SE_1995_2005))
m1_95_05_chr <-lm(logNe_1995_2005~Chromosome, data= cowBins, weights=1/(cowBins$SE_1995_2005)) 
anova(m0_95_05_null, m1_95_05_chr) # TABLE S2
#Full model including chrm, local rec rate (LRR), gene density (propGene), past sweeps, QTLs
m3_95_05_other <- lm(logNe_1995_2005~Chromosome+ LRR+ SizeBin+ propGene + No_SS+ Code_QTL, data= cowBins, weights=1/(cowBins$SE_1995_2005))
summary(m3_95_05_other)
anova(m3_95_05_other, m1_95_05_chr) # TABLE S2


#using the m1 model as background to test specific extra factors
# Exploring each covariate alone: 
# null model:
model0_95_00 <- lm(logNe_1995_2000~Chromosome, data = cowBins, weights=1/(cowBins$SE_1995_2000))

# SelectiveSweep
model1SS_95_00 <- lm(logNe_1995_2000~Chromosome+ No_SS, data = cowBins, weights=1/(cowBins$SE_1995_2000))
anova(model1SS_95_00, model0_95_00) # Table 1 -> pvalue and F test
#LRR 
model1LRR_95_00 <- lm(logNe_1995_2000~Chromosome + LRR, data = cowBins, weights=1/(cowBins$SE_1995_2000))
anova(model1LRR_95_00, model0_95_00) # Table 1 -> pvalue and F test
#GeneContent
model1Gene_95_00 <- lm(logNe_1995_2000~Chromosome+ propGene, data = cowBins, weights=1/(cowBins$SE_1995_2000))
anova(model1Gene_95_00, model0_95_00) # Table 1 -> pvalue and F test
#SizeBin
model1size_95_00 <- lm(logNe_1995_2000~Chromosome+ SizeBin, data = cowBins, weights=1/(cowBins$SE_1995_2000))
anova(model1size_95_00, model0_95_00) # Table 1 -> pvalue and F test
#QTL:
model1QTL_95_00 <- lm(logNe_1995_2000~Chromosome+ Code_QTL, data = cowBins, weights=1/(cowBins$SE_1995_2000))
anova(model1QTL_95_00, model0_95_00) # Table 1 -> pvalue and F test


#### Selective sweeps and Ne ####
table(cowBins$No_SS)

#contrasting the Ne in zero versus 1, 2 or more SS

cowBins$No_SS <- replace(cowBins$No_SS, cowBins$No_SS>3, 3) # there are No_SS=4, so we add them to the category SS>3
table(cowBins$No_SS)
boxplot(logNe~No_SS, data=cowBins)

hist(cowBins$logNe, class=20, xlab="Realized Ne per window")

NeWithSS=subset(cowBins, cowBins$No_SS>0)
NeNoSS <- subset(cowBins, cowBins$No_SS==0)

wilcox.test(NeNoSS$Ne_1995_2000, NeWithSS$Ne_1995_2000, alternative = "less")
wilcox.test(NeNoSS$logNe, NeWithSS$logNe, alternative = "less") # pvalue = 0.56

boxplot(logNe~No_SS, data=cowBins, xlab="Number of Selective sweeps per window",
        ylab="log (realized Ne)", varwidth = TRUE,  xaxt="n")
axis(1, at=c(1:4), labels=c(expression("0", '1','2',paste(phantom(0) >= phantom(0),"3")))) # phantom is used to create the symbol ">="

# Correlation with QTL:
#### QTL: Distribution of Ne in bins with / without QTL ####
table(cowBins$Code_QTL)

sum(table(cowBins$Code_QTL))
boxplot(logNe~Code_QTL, data=cowBins, xlab="Number of QTLs per bin", ylab="log Ne", varwidth = TRUE)

table(cowBins$Code_QTL)
hist(cowBins$logNe, class=20, xlab="Realized Ne per window")
bin3=subset(cowBins, cowBins$Code_QTL==3)
dim(bin3)
head(bin3)
allbinsBut3=subset(cowBins, cowBins$Code_QTL<3)
dim(allbinsBut3)
head(allbinsBut3)
rug(bin3$logNe,  lwd=2, col="red")
wilcox.test(bin3$logNe, allbinsBut3$logNe, alternative = "less")

# same analysis but with presence of >=2 QTL
bin1orMore=subset(cowBins, cowBins$Code_QTL>=1)
dim(bin1orMore)
head(bin1orMore)
allbinsBut2orMore=subset(cowBins, cowBins$Code_QTL<1)
dim(allbinsBut2orMore)
head(allbinsBut2orMore)
rug(bin2orMore$logNe,  lwd=2, col="red")
wilcox.test(bin1orMore$logNe, allbinsBut2orMore$logNe, alternative = "less")


#### LRR and Ne ####
mLRR_95_00 <- lm(Ne_1995_2000~LRR, data=cowBins)
summary(mLRR_95_00)
cor.test(cowBins$LRR,cowBins$Ne_1995_2000)

# Figure S2
plot(Ne_1995_2000~LRR, data=cowBins, xlab="Local Recombination Rate per window",
     ylab = "Estimated Ne per window", cex=0.3)
mLRR_95_00$coefficients
abline(mLRR_95_00, col="cornflowerblue",lwd=2)


#### PropGene and Ne ####
mGene_95_00 <- lm(Ne_1995_2000~propGene, data=cowBins)
summary(mGene_95_00)
cor.test(cowBins$propGene,cowBins$Ne_1995_2000)
plot(Ne_1995_2000~propGene, data=cowBins, xlab="Proportion of genes per window",
     ylab = "Estimated Ne per window", cex=0.3)
plot(Ne_1995_2000~propGene, data=cowBins)
mGene_95_00$coefficients
abline(mGene_95_00, col="cornflowerblue",lwd=2)


#### Ne chromosome variation: ####


cowChrm = read.table(file = "NeGenomicChromosome.txt", 
                   sep=" ", header=T)

#### Meta-Data for the dataset of Ne chromosome variation ####

# Chromosome: autosome or origin for the window (1 to 29)
# SizeBin: size of the chromosome analysed in numbers of nucleotides (bp)
# Ne_1995_2000: realized drift effective size in the chromosome for the time period 1995->2000
# SE_1995_2000: SE around the Ne estimate (based on 10000 bootstraps)
# Ne_2000_2005: realized drift effective size in the chromosome for the time period 2000->2005
# SE_2000_2005: SE around the Ne estimate (based on 10000 bootstraps)
# Ne_1995_2005: realized drift effective size in the chromosome for the time period 1995->2005
# SE_1995_2005: SE around the Ne estimate (based on 10000 bootstraps)

head(cowChrm)

# taking out chromosome 30:
cowChrm <- cowChrm[cowChrm$Chromosome!=30,]

summary(cowChrm$Ne_1995_2000) # main text - mean, median, max, min

#### Session info to maximize reproducibility of the analysis ####
sessionInfo()