## Supporting information S2 File

## This is the R code for the analysis of the following research article: 
## Title: "Who infects Whom": Reconstructing infection chains of Mycobacterium avium subsp. paratuberculosis in an 
##        endemically infected dairy herd by use of genomic data
## by: Annette Nigsch, Suelee Robbe-Austerman, Tod Stuber, Paulina D. Pavinski Bitar, Yrjö Gröhn and Ynte H. Schukken



## 1.) PREPARATIONS

## for details on SeqTrack, see SeqTrack helpfile and Jombart et al. (2011), Heredity 106, 383-390.

## load packages
library("ape")
library("adegenet")
library("igraph")
library("ggplot2")
library("plyr")

## set working directory on your computer 
## setwd("C:/Users/.../MAP")

## read the SNP sequence data (FASTA file)
snp <- read.dna(file = "MAP.fasta", format = "fa")


## read the case file
# columns:  id = UID, label = sequence ID, cow = cow ID, birthd = birth date of cow,
#           shedd = potential start of the (genotype-specific) infectious period
cases <- read.csv(file = "MAP_cases.CSV")


## convert the date columns to POSIXct; tz = "GMT" is Universal Time, Coordinated (UTC)
shed <- as.POSIXct(cases$shedd, tz = "GMT")
birth <- as.POSIXct(cases$birthd, tz = "GMT")


## pairwise distance matrix (model "N", pairwise deletion), also saved as csv file
distmat <- dist.dna(x = snp, model = "N", pairwise.deletion = TRUE, as.matrix = TRUE)
write.csv(x = distmat, file = "DistMat_pairwDeletion.csv")


## mutation rate mu and length of nucleotide sequence (number of alignment columns)
mu_MAP <- (2 * 2.91e-08) / 365    ## 0.25 substitutions per genome per year
nbNucl <- dim(as.matrix(snp))[2]


## read the weighting tables for the Exposure [E] and Susceptibility [S] scenarios
# MAP_E.csv: cells A1:DX1 = isolate ID, cells A2:DX129 = number of exposure days between pairs of isolates [E]
# MAP_S.csv: cells A1:DX1 = isolate ID, cells A2:DX129 = weights from 6 to 0 for seven social network pattern [S]
Matrix_S <- read.csv(file = "MAP_S.csv")
Matrix_E <- read.csv(file = "MAP_E.csv")


## turn both tables into matrices
M_S <- as.matrix(Matrix_S)
M_E <- as.matrix(Matrix_E)




## 2.) RECONSTRUCTION OF MAP TRANSMISSION TREES WITH SEQTRACK

## run SeqTrack for both time stamps in one pass: nothing has to be re-run by hand, and the
## "shed" summary files can no longer end up as silent copies of the "birth" ones.
x <- 6  # ancestries with weight > x are replaced with NA. In the article we used x = 6.

# run_seqtrack(): [Basic], [E] and [S] scenarios for one time stamp (Date = birth or shed);
# uses distmat, cases, M_E, M_S, mu_MAP and nbNucl from section 1. Returns a list with
# res (seqTrack results), ares (ancestries) and ap (p-values of statistical support).
run_seqtrack <- function(Date, max_weight) {
  fit <- function(prox = NULL) {
    seqTrack(distmat, x.names = cases$label, x.dates = Date, prox.mat = prox,
             mu = mu_MAP, haplo.length = nbNucl)
  }
  res <- list(res_Basic = fit(), res_E = fit(M_E), res_S = fit(M_S))
  # statistical support is calculated before ancestries above the cut-off are removed
  p <- lapply(res, function(r) get.likelihood(r, mu = mu_MAP, haplo.length = nbNucl))
  res <- lapply(res, function(r) {
    r$ances[r$weight > max_weight] <- NA
    r
  })
  ares <- data.frame(res_Basic.ances = res$res_Basic$ances, res_E.ances = res$res_E$ances,
                     res_S.ances = res$res_S$ances)
  ap <- data.frame(p_Basic = p$res_Basic, p_E = p$res_E, p_S = p$res_S)
  list(res = res, ares = ares, ap = ap)
}
runs <- list(birth = run_seqtrack(birth, x), shed = run_seqtrack(shed, x))

## time stamp-specific summary files: "ares" contains ancestries, "ap" contains p-values
ares_birth <- runs$birth$ares
ap_birth <- runs$birth$ap
ares_shed <- runs$shed$ares
ap_shed <- runs$shed$ap

## trees under the original object names (used for plotting); use runs$shed for the shed trees
res_Basic <- runs$birth$res$res_Basic
res_E <- runs$birth$res$res_E
res_S <- runs$birth$res$res_S


## graphs
## Fig. 2 in article: histogram of the pairwise genomic distances between all snp sequences
# NOTE(review): the whole matrix is passed, so each pair is counted twice and the diagonal
# (distance of every sequence to itself) is included - confirm this is intended.
hist <- hist(
  distmat,
  nclass = 250, col = "lightgrey", xlim = c(0, 250), ylim = c(0, 1200),
  xlab = "Number of differing SNPs", main = "Distribution of pairwise genomic distances"
)

## Fig. 3, Fig. 4, Fig. S.1 in article: SeqTrack transmission trees of the three scenarios;
## draw them once with "birth" and once with "shed" as time stamp.
tkplot(as.igraph(res_Basic), vertex.color = "white")  # tree is laid out in a newly opened window
tkplot(as.igraph(res_E), vertex.color = "white")
tkplot(as.igraph(res_S), vertex.color = "white")


## 3.) ANALYSIS OF RECONSTRUCTED MAP TRANSMISSION TREES

## create tables with number of descending isolates and number of offspring.
## Both time stamps ("birth" and "shed") are processed by the same function in one pass, so no
## lines have to be re-run by hand and the "shed" tables cannot be copies of the "birth" tables.

# count_offspring(): number of descending isolates per isolate and per cow, by scenario.
#   a:      summary table of ancestries (ares_birth or ares_shed) with the columns
#           res_Basic.ances, res_E.ances and res_S.ances
#   prefix: "b" (birth) or "s" (shed); used to name the cow-level columns, e.g. b_Basic
# Uses the case file "cases" (columns id and cow). Returns a list with
#   cas:      isolate-level table (cow, MAP_id, freq_Basic, freq_E, freq_S)
#   cow_freq: cow-level table (cow, <prefix>_Basic, <prefix>_E, <prefix>_S)
count_offspring <- function(a, prefix) {
  scenarios <- c("Basic", "E", "S")
  ances_cols <- paste0("res_", scenarios, ".ances")
  # exact column names are required; a missing column would otherwise be counted as "no offspring"
  stopifnot(all(ances_cols %in% names(a)))

  cas <- data.frame(cow = cases$cow, MAP_id = cases$id)
  for (i in seq_along(scenarios)) {
    ancesfreq <- as.data.frame(table(a[[ances_cols[i]]]))
    # gives number of descending isolates at isolate level
    cas[[paste0("freq_", scenarios[i])]] <- ancesfreq$Freq[match(cases$id, ancesfreq$Var1)]
  }
  cas[is.na(cas)] <- 0                              # replaces all "NA" by 0

  # gives number of descending isolates at cow level
  per_cow <- lapply(paste0("freq_", scenarios), function(wt) plyr::count(cas, "cow", wt))
  cow_freq <- data.frame(cow = per_cow[[1]]$cow, per_cow[[1]]$freq, per_cow[[2]]$freq,
                         per_cow[[3]]$freq)
  names(cow_freq)[-1] <- paste(prefix, scenarios, sep = "_")
  list(cas = cas, cow_freq = cow_freq)
}

# identical ancestry tables usually mean that the "shed" SeqTrack run was skipped
if (identical(ares_birth, ares_shed)) {
  warning("ares_birth and ares_shed are identical: was SeqTrack run with 'shed' as time stamp?",
          call. = FALSE)
}

offspring_birth <- count_offspring(ares_birth, "b")
offspring_shed <- count_offspring(ares_shed, "s")

cow_freqb <- offspring_birth$cow_freq
cas_birth <- offspring_birth$cas
cow_freqs <- offspring_shed$cow_freq
cas_shed <- offspring_shed$cas

## save results for data handling and further analysis in R or in another software
# data handling outside R: account for potential right and left censoring of data by using
# a subsample of all isolates to investigate the role of an individual cow in MAP spread (see 4.)
cow_freqbs <- data.frame(cow_freqb, cow_freqs)
# NOTE(review): the prefix "e" marks the "shed" time stamp here, not scenario [E]; it is kept
# so that the column names of cas_bs.csv stay unchanged.
cas_bs <- data.frame(b = cas_birth, e = cas_shed)

write.csv(cas_bs, "cas_bs.csv")
write.csv(cow_freqbs, "cow_freqbs.csv")


## 4.) CORRELATION BETWEEN RECONSTRUCTED NUMBER OF OFFSPRING AND DISEASE PHENOTYPES

## load cow disease phenotypes and concatenate them to final (adapted) table with number of offspring per cow.
# format of table with cow disease phenotypes: "cow_id", "disease phenotypes" in binary or categorical coding.
# make sure that both tables are sorted in the same order of cows.
pheno <- read.csv("MAP_disease_phenotypes.CSV")
cow_freqbsx <- read.csv("Cow_freqbs_adapt.CSV")
# the tables are combined by row position only, so they must at least have the same number of rows
stopifnot(nrow(pheno) == nrow(cow_freqbsx))
phenodf <- data.frame(pheno, cow_freqbsx)
write.csv(phenodf, "Cow_freqbsx_phenodf.csv")

# columns with number of offspring per cow: b_ = "birth", s_ = "shed" time stamp, by scenario
scenarios <- c("b_Basic", "b_E", "b_S", "s_Basic", "s_E", "s_S")
stopifnot(all(c(scenarios, "Sheddinglevel", "SeroStatus") %in% names(phenodf)))

# boxplot_offspring(): one boxplot of number of offspring per scenario, grouped by "phenotype"
# (name of a column of phenodf); the first panel carries the title and the y-axis label.
boxplot_offspring <- function(phenotype, title) {
  for (scen in scenarios) {
    f <- reformulate(phenotype, response = scen)
    if (scen == scenarios[1]) {
      boxplot(f, data = phenodf, main = title, ylab = "Offspring (n)")
    } else {
      boxplot(f, data = phenodf)
    }
  }
}

## multifactorial analysis: correlate number of offspring (by scenario) with disease phenotype
# results are printed explicitly so that they are also shown when the script is run with source()
# categorical coded phenotype - example: "shedding level"
for (scen in scenarios) {
  cat("\nSpearman correlation:", scen, "and Sheddinglevel\n")
  print(cor.test(phenodf[[scen]], phenodf$Sheddinglevel, method = "spearman"))
}

# boxplots with numbers of offspring produced by individual cows, by disease phenotype and scenario (Fig. 5 in article)
old_par <- par(mfrow = c(1, 6))   # six panels per row; previous layout is restored at the end
boxplot_offspring("Sheddinglevel", "Shedding Level")

# binary coded phenotype - example: "SeroStatus"
for (scen in scenarios) {
  cat("\nt-test:", scen, "by SeroStatus\n")
  print(t.test(reformulate("SeroStatus", response = scen), data = phenodf))
}

boxplot_offspring("SeroStatus", "Serostatus")
par(old_par)


# end