## Get necessary packages install.packages(c("devtools", "ggplot2", "gtools", "plyr", "gdata")) library(devtools) install_github(user="behuang", repo="mpMap") library(mpMap) library(gtools) library(plyr) library(ggplot2) library(gdata) ## Functions for computing map uncertainty whichrow<-function(mat,x){ # Helper function, returns which row of a matrix match a given vector which(apply(mat,1,function(y){all(y==x)})) } get_geno <- function(obj){ if (class(obj)[1]=="ri4self"){ N <- nrow(obj$geno[[1]]$data) founders <- obj$founderGeno n.mar <- ncol(founders) geno <- obj$geno[[1]]$truegeno finals <- sapply(1:n.mar, function(m) founders[geno[,m],m]) } else if (class(obj)[1]=="mpcross"){ finals <- obj$finals } return(finals) } gen_N <- function(object, markers=1:3){ if (class(object)[1]=="riself"){ finals <- pull.geno(object)-1 } else { finals <- get_geno(object) } finals <- finals[,markers] finals.df <- as.data.frame(finals) finals.df <- na.omit(finals.df) padding <- as.data.frame(permutations(2, 3, c(1,0), repeats=TRUE)) names(padding) <- names(finals.df) finals.df <- rbind(finals.df, padding) count.df <- count(finals.df, names(finals.df)) N <- count.df$freq-1 return(N) } pt3_v2 <- function(g, theta){ # Return the three point probability of a particular genotype # eg pt3(g=c("A", "B", "C"), theta=c(0.1,0.2)) returns the probability # of observing A-B-C given theta_12 = 0.1 and theta_23 = 0.2 x <- as.numeric(mapvalues(g, c("A", "B", "C", "D"), c(0,0,1,1), warn_missing=FALSE)) # whether the selfing process produces a recombinant genotype or not # Ie treating the data as a 2-way RIL, grouping {A and B} and {C and D} ri <- abs(diff(x)) # whether or not recombination occured in the initial cross x2 <- as.numeric(mapvalues(g, c("A", "B", "C", "D"), 1:4, warn_missing=FALSE)) rb <- as.numeric(diff(x2)!=0) R <- 2*theta/(1+2*theta) if (all(ri == c(0,0))){ p <- 1/4*(2-R[1]-R[2]-R[3])*(1-theta[1])^{1-rb[1]}* (theta[1]^{rb[1]})*(theta[2]^{rb[2]})*(1-theta[2])^{1-rb[2]}*1/2 } else if (all(ri==c(0,1))){ p <- 1/4*(-R[1]+R[2]+R[3])*(1-theta[1])^{1-rb[1]}*theta[1]^{rb[1]}*1/4 } else if (all(ri==c(1,0))){ p <- 1/4*(R[1]-R[2]+R[3])*(1-theta[2])^{1-rb[2]}*theta[2]^{rb[2]}*1/4 } else if (all(ri==c(1,1))){ d <- as.numeric(g[1]!=g[3]) p <- 1/4*(R[1]+R[2]-R[3])*(1-theta[3])^{1-d}*(theta[3])^{d}*1/4 } else{ stop('Genotype Not Recognised') } return(p) } prob_array <- function(founders, theta){ prob.out <- array(0, c(8, nrow(theta))) perms <- permutations(2, 3, c(0,1), repeats=TRUE) geno <- permutations(4, 3, LETTERS[1:4], repeats=T) theta <- cbind(theta, theta[,1]+theta[,2]-2*theta[,1]*theta[,2]) for (i in 1:nrow(geno)){ g <- geno[i,] prob <- apply(theta, 1, function(val){pt3_v2(g, val)}) snp <- sapply(1:3, function(k){founders[g[k], k]}) addto <- whichrow(perms, snp) prob.out[addto, ] <- prob.out[addto, ]+prob } return (prob.out) } loglk3 <- function(theta,N, founders){ probs <- as.vector(prob_array(founders, t(theta))) loglike <- sum(N*log(probs), na.rm=T) return(loglike) } like3 <- function(obj, markers, rf.hat){ # Maximises 3 point likelihood for a triple of markers # rf.hat is a matrix of pairwise recombination fractions needed for # the initial estimates in the optim function # Returns a vector of length 3 # first two elements are the MLEs of theta_adjacent # last element is the maximised likelihood N <- gen_N(obj, markers) if (class(obj)[1]=="ri4self"){ founders <- obj$founderGeno[,markers] } else if (class(obj)[1]=="mpcross"){ founders <- obj$founders[,markers] } initial <-c(rf.hat[markers[1], markers[2]], rf.hat[markers[2], markers[3]]) # Function limits have to be set to 0 < theta < 0.5 (note 0 <= theta <= 0.5 resuts in errors) opt.obj <- optim(initial, loglk3, N=N, founders=founders, control=list(fnscale=-1), method="L-BFGS-B", lower=c(0.00001,0.00001), upper=c(0.49999,0.49999)) out <- c(opt.obj$par, opt.obj$value) return(out) } gen_coef <- function(sdp.1, sdp.2){ phen <- permutations(2, 2, c(0,1), repeats=TRUE) A <- numeric(4) B <- numeric(4) for (i in 1:nrow(phen)){ p <- phen[i,] G1 <- which(sdp.1 == p[1]) G2 <- which(sdp.2 == p[2]) is.recombinant <- outer(G1, G2, "==") A[i] <- sum(is.recombinant) B[i] <- length(is.recombinant)- A[i] } out <- cbind(A, B) colnames(out) <- c("A", "B") return(out) } prob3 <- function(obj, mrks, theta=NULL){ # Returns a data frame giving the probability of observing each # three point SNP phenotype at a triple of markers # Input object can be a mpcross object an Rqtl ri4self object or a matrix of founder SDPS if (class(obj)[1]=="mpcross"){ obj <- subset(obj, markers=mrks) map <- unlist(obj$map) founders <- obj$founders } else if(class(obj)[1]=="ri4self"){ map <- unlist(obj$geno[[1]]$map) obj <- pull.markers(obj,names(map)[mrks]) founders <- obj$founderGeno[,] rownames(founders) <- LETTERS[1:4] } else if (class(obj)[1]=="matrix"){ founders <- obj rownames(founders) <- LETTERS[1:4] } if (is.null(theta) & class(obj)[1] %in% c("mpcross", "ri4self")){ D <- dist(map) theta <- haldaneX2R(as.matrix(D)/100) } if (class(theta)!="matrix"){ theta.mat <- matrix(0, 3,3) theta.mat[rbind(c(1,2), c(2,1))] <- theta[1] theta.mat[rbind(c(2,3), c(3,2))] <- theta[2] theta.mat[rbind(c(1,3), c(3,1))] <- theta[1]+theta[2]-2*theta[1]*theta[2] theta <- theta.mat } perms <- permutations(2, 3, c(0,1), repeats=TRUE) perms <- apply(perms, 1,paste, collapse="") prob.df <- data.frame(pattern=perms, prob=0) geno <- permutations(4, 3, LETTERS[1:4], repeats=T) for (i in 1:nrow(geno)){ g <- geno[i,] prob <- pt3(g, theta) snp <- paste(sapply(1:3, function(k){founders[g[k], k]}), collapse="") addto <- match(snp, prob.df$pattern) prob.df[addto, "prob"] <- prob.df[addto, "prob"] + prob } return(prob.df) } pt3 <- function(g, theta){ # Return the three point probability of a particular genotype # eg pt3(g=c("A", "B", "C"), theta=c(0.1,0.2)) returns the probability # of observing A-B-C given theta_12 = 0.1 and theta_23 = 0.2 x <- as.numeric(mapvalues(g, c("A", "B", "C", "D"), c(0,0,1,1), warn_missing=FALSE)) # whether or not "recombination" occured in the selfing process # Ie treating the data as a 2-way RIL, grouping {A and B} and {C and D} ri <- abs(diff(x)) x2 <- as.numeric(mapvalues(g, c("A", "B", "C", "D"), 1:4, warn_missing=FALSE)) # whether or not recombination occured in the initial cross rb <- as.numeric(diff(x2)!=0) R <- 2*theta/(1+2*theta) if (all(ri == c(0,0))){ p <- 1/4*(2-R[1,2]-R[2,3]-R[1,3])*(1-theta[1,2])^{1-rb[1]}* (theta[1,2]^{rb[1]})*(theta[2,3]^{rb[2]})*(1-theta[2,3])^{1-rb[2]}*1/2 } else if (all(ri==c(0,1))){ p <- 1/4*(-R[1,2]+R[2,3]+R[1,3])*(1-theta[1,2])^{1-rb[1]}*theta[1,2]^{rb[1]}*1/4 } else if (all(ri==c(1,0))){ p <- 1/4*(R[1,2]-R[2,3]+R[1,3])*(1-theta[2,3])^{1-rb[2]}*theta[2,3]^{rb[2]}*1/4 } else if (all(ri==c(1,1))){ d <- as.numeric(g[1]!=g[3]) p <- 1/4*(R[1,2]+R[2,3]-R[1,3])*(1-theta[1,3])^{1-d}*(theta[1,3])^{d}*1/4 } else{ stop('Genotype Not Recognised') } return(p) } ## Calculation of covariance for recombination fractions within a triplet gen_cov_v2 <- function(obj, markers, theta.mat){ if (class(obj)[1]=="mpcross"){ obj <- subset(obj, markers=markers) founders <- obj$founders[, 1:3] map <- obj$map } else if(class(obj)[1]=="ri4self"){ map <- pull.map(obj) obj <- pull.markers(obj, names(map[[1]])[markers]) founders <- obj$founderGeno[,1:3] } else if (class(obj)[1]=="matrix"){ founders <- obj } n.mar <- 3 coef.12 <- gen_coef(founders[,1], founders[,2]) A_12 <- coef.12[,1] B_12 <- coef.12[,2] coef.23 <- gen_coef(founders[,2], founders[,3]) A_23 <- coef.23[,1] B_23 <- coef.23[,2] coef.13 <- gen_coef(founders[,1], founders[,3]) A_13 <- coef.13[,1] B_13 <- coef.13[,2] theta <- theta.mat[rbind(1:2,2:3, c(1,3))] t_12 <- theta[1] t_23 <- theta[2] t_13 <- theta[3] K_12_11 <- (3*A_12[4]-B_12[4])/((1+2*t_12)*((-1+t_12)*A_12[4]-t_12*B_12[4])) K_12_10 <- (3*A_12[3]-B_12[3])/((1+2*t_12)*((-1+t_12)*A_12[3]-t_12*B_12[3])) K_12_01 <- (3*A_12[2]-B_12[2])/((1+2*t_12)*((-1+t_12)*A_12[2]-t_12*B_12[2])) K_12_00 <- (3*A_12[1]-B_12[1])/((1+2*t_12)*((-1+t_12)*A_12[1]-t_12*B_12[1])) K_23_11 <- (3*A_23[4]-B_23[4])/((1+2*t_23)*((-1+t_23)*A_23[4]-t_23*B_23[4])) K_23_10 <- (3*A_23[3]-B_23[3])/((1+2*t_23)*((-1+t_23)*A_23[3]-t_23*B_23[3])) K_23_01 <- (3*A_23[2]-B_23[2])/((1+2*t_23)*((-1+t_23)*A_23[2]-t_23*B_23[2])) K_23_00 <- (3*A_23[1]-B_23[1])/((1+2*t_23)*((-1+t_23)*A_23[1]-t_23*B_23[1])) K_13_11 <- (3*A_13[4]-B_13[4])/((1+2*t_13)*((-1+t_13)*A_13[4]-t_13*B_13[4])) K_13_10 <- (3*A_13[3]-B_13[3])/((1+2*t_13)*((-1+t_13)*A_13[3]-t_13*B_13[3])) K_13_01 <- (3*A_13[2]-B_13[2])/((1+2*t_13)*((-1+t_13)*A_13[2]-t_13*B_13[2])) K_13_00 <- (3*A_13[1]-B_13[1])/((1+2*t_13)*((-1+t_13)*A_13[1]-t_13*B_13[1])) H_12_11 <- -(3*A_12[4]-B_12[4])*((-1+4*t_12)*A_12[4]-(1+4*t_12)*B_12[4])/((1+2*t_12)^2*((-1+t_12)*A_12[4]-t_12*B_12[4])^2) H_12_10 <- -(3*A_12[3]-B_12[3])*((-1+4*t_12)*A_12[3]-(1+4*t_12)*B_12[3])/((1+2*t_12)^2*((-1+t_12)*A_12[3]-t_12*B_12[3])^2) H_12_01 <- -(3*A_12[2]-B_12[2])*((-1+4*t_12)*A_12[2]-(1+4*t_12)*B_12[2])/((1+2*t_12)^2*((-1+t_12)*A_12[2]-t_12*B_12[2])^2) H_12_00 <- -(3*A_12[1]-B_12[1])*((-1+4*t_12)*A_12[1]-(1+4*t_12)*B_12[1])/((1+2*t_12)^2*((-1+t_12)*A_12[1]-t_12*B_12[1])^2) H_23_11 <- -(3*A_23[4]-B_23[4])*((-1+4*t_23)*A_23[4]-(1+4*t_23)*B_23[4])/((1+2*t_23)^2*((-1+t_23)*A_23[4]-t_23*B_23[4])^2) H_23_10 <- -(3*A_23[3]-B_23[3])*((-1+4*t_23)*A_23[3]-(1+4*t_23)*B_23[3])/((1+2*t_23)^2*((-1+t_23)*A_23[3]-t_23*B_23[3])^2) H_23_01 <- -(3*A_23[2]-B_23[2])*((-1+4*t_23)*A_23[2]-(1+4*t_23)*B_23[2])/((1+2*t_23)^2*((-1+t_23)*A_23[2]-t_23*B_23[2])^2) H_23_00 <- -(3*A_23[1]-B_23[1])*((-1+4*t_23)*A_23[1]-(1+4*t_23)*B_23[1])/((1+2*t_23)^2*((-1+t_23)*A_23[1]-t_23*B_23[1])^2) H_13_11 <- -(3*A_13[4]-B_13[4])*((-1+4*t_13)*A_13[4]-(1+4*t_13)*B_13[4])/((1+2*t_13)^2*((-1+t_13)*A_13[4]-t_13*B_13[4])^2) H_13_10 <- -(3*A_13[3]-B_13[3])*((-1+4*t_13)*A_13[3]-(1+4*t_13)*B_13[3])/((1+2*t_13)^2*((-1+t_13)*A_13[3]-t_13*B_13[3])^2) H_13_01 <- -(3*A_13[2]-B_13[2])*((-1+4*t_13)*A_13[2]-(1+4*t_13)*B_13[2])/((1+2*t_13)^2*((-1+t_13)*A_13[2]-t_13*B_13[2])^2) H_13_00 <- -(3*A_13[1]-B_13[1])*((-1+4*t_13)*A_13[1]-(1+4*t_13)*B_13[1])/((1+2*t_13)^2*((-1+t_13)*A_13[1]-t_13*B_13[1])^2) r_12 <- t_12/(4*(1+2*t_12)) r_23 <- t_23/(4*(1+2*t_23)) r_13 <- t_13/(4*(1+2*t_13)) nr_12 <- (1-t_12)/(4*(1+2*t_12)) nr_23 <- (1-t_23)/(4*(1+2*t_23)) nr_13 <- (1-t_13)/(4*(1+2*t_13)) P_12_11 <- A_12[4]*nr_12+B_12[4]*r_12 P_12_10 <- A_12[3]*nr_12+B_12[3]*r_12 P_12_01 <- A_12[2]*nr_12+B_12[2]*r_12 P_12_00 <- A_12[1]*nr_12+B_12[1]*r_12 P_23_11 <- A_23[4]*nr_23+B_23[4]*r_23 P_23_10 <- A_23[3]*nr_23+B_23[3]*r_23 P_23_01 <- A_23[2]*nr_23+B_23[2]*r_23 P_23_00 <- A_23[1]*nr_23+B_23[1]*r_23 P_13_11 <- A_13[4]*nr_13+B_13[4]*r_13 P_13_10 <- A_13[3]*nr_13+B_13[3]*r_13 P_13_01 <- A_13[2]*nr_13+B_13[2]*r_13 P_13_00 <- A_13[1]*nr_13+B_13[1]*r_13 R_12 <- 8*r_12 R_13 <- 8*r_13 R_23 <- 8*r_23 prob.df <- prob3(obj, 1:3, theta.mat) P_000 <- subset(prob.df, pattern=="000", select=prob, drop=TRUE) P_001 <- subset(prob.df, pattern=="001", select=prob, drop=TRUE) P_010 <- subset(prob.df, pattern=="010", select=prob, drop=TRUE) P_011 <- subset(prob.df, pattern=="011", select=prob, drop=TRUE) P_100 <- subset(prob.df, pattern=="100", select=prob, drop=TRUE) P_101 <- subset(prob.df, pattern=="101", select=prob, drop=TRUE) P_110 <- subset(prob.df, pattern=="110", select=prob, drop=TRUE) P_111 <- subset(prob.df, pattern=="111", select=prob, drop=TRUE) c_12_23 <- K_12_11*K_23_11*(P_111-P_12_11*P_23_11) + K_12_11*K_23_10*(P_110-P_12_11*P_23_10) + K_12_11*K_23_01*(-P_12_11*P_23_01) + K_12_11*K_23_00*(-P_12_11*P_23_00) + K_12_10*K_23_11*(-P_12_10*P_23_11)+ K_12_10*K_23_10*(-P_12_10*P_23_10) + K_12_10*K_23_01*(P_101-P_12_10*P_23_01) + K_12_10*K_23_00*(P_100-P_12_10*P_23_00) + K_12_01*K_23_11*(P_011-P_12_01*P_23_11) + K_12_01*K_23_10*(P_010-P_12_01*P_23_10) + K_12_01*K_23_01*(-P_12_01*P_23_01) + K_12_01*K_23_00*(-P_12_01*P_23_00) + K_12_00*K_23_11*(-P_12_00*P_23_11) + K_12_00*K_23_10*(-P_12_00*P_23_10) + K_12_00*K_23_01*(P_001-P_12_00*P_23_01) + K_12_00*K_23_00*(P_000-P_12_00*P_23_00) c_12_13 <- K_12_11*K_13_11*(P_111-P_12_11*P_13_11)+K_12_11*K_13_10*(P_110-P_12_11*P_13_10) + K_12_11*K_13_01*(-P_12_11*P_13_01)+ K_12_11*K_13_00*(-P_12_11*P_13_00)+ K_12_10*K_13_11*(P_101-P_12_10*P_13_11) + K_12_10*K_13_10*(P_100-P_12_10*P_13_10)+ K_12_10*K_13_01*(-P_12_10*P_13_01)+K_12_10*K_13_00*(-P_12_10*P_13_00) + K_12_01*K_13_11*(-P_12_01*P_13_11) + K_12_01*K_13_10*(-P_12_01*P_13_10) + K_12_01*K_13_01*(P_011-P_12_01*P_13_01) + K_12_01*K_13_00*(P_010-P_12_01*P_13_00) + K_12_00*K_13_11*(-P_12_00*P_13_11) +K_12_00*K_13_10*(-P_12_00*P_13_10) + K_12_00*K_13_01*(P_001-P_12_00*P_13_01) + K_12_00*K_13_00*(P_000-P_12_00*P_13_00) c_13_23 <- K_13_11*K_23_11*(P_111-P_13_11*P_23_11)+K_13_11*K_23_10*(-P_13_11*P_23_10) + K_13_11*K_23_01*(P_101 - P_13_11*P_23_01) + K_13_11*K_23_00*(-P_13_11*P_23_00) + K_13_10*K_23_11*(-P_13_10*P_23_11) + K_13_10*K_23_10*(P_110-P_13_10*P_23_10) + K_13_10*K_23_01*(-P_13_10*P_23_01) + K_13_10*K_23_00*(P_100 - P_13_10*P_23_00) + K_13_01*K_23_11*(P_011-P_13_01*P_23_11) + K_13_01*K_23_10*(-P_13_01*P_23_10) + K_13_01*K_23_01*(P_001-P_13_01*P_23_01) + K_13_01*K_23_00*(-P_13_01*P_23_00) + K_13_00*K_23_11*(-P_13_00*P_23_11) + K_13_00*K_23_10*(P_010-P_13_00*P_23_10) + K_13_00*K_23_01*(-P_13_00*P_23_01) + K_13_00*K_23_00*(P_000 - P_13_00*P_23_00) V_12 <- K_12_11^2*(P_12_11-P_12_11^2) + K_12_10^2*(P_12_10-P_12_10^2) + K_12_01^2*(P_12_01-P_12_01^2) + K_12_00^2*(P_12_00-P_12_00^2) + 2*(K_12_11*K_12_10*(-P_12_11*P_12_10)+K_12_11*K_12_01*(-P_12_11*P_12_01)+ K_12_11*K_12_00*(-P_12_11*P_12_00)+K_12_10*K_12_01*(-P_12_10*P_12_01)+ K_12_10*K_12_00*(-P_12_10*P_12_00)+K_12_01*K_12_00*(-P_12_01*P_12_00)) V_23 <- K_23_11^2*(P_23_11-P_23_11^2) + K_23_10^2*(P_23_10-P_23_10^2) + K_23_01^2*(P_23_01-P_23_01^2) + K_23_00^2*(P_23_00-P_23_00^2) + 2*(K_23_11*K_23_10*(-P_23_11*P_23_10)+K_23_11*K_23_01*(-P_23_11*P_23_01)+ K_23_11*K_23_00*(-P_23_11*P_23_00)+K_23_10*K_23_01*(-P_23_10*P_23_01)+ K_23_10*K_23_00*(-P_23_10*P_23_00)+K_23_01*K_23_00*(-P_23_01*P_23_00)) V_13 <- K_13_11^2*(P_13_11-P_13_11^2) + K_13_10^2*(P_13_10-P_13_10^2) + K_13_01^2*(P_13_01-P_13_01^2) + K_13_00^2*(P_13_00-P_13_00^2) + 2*(K_13_11*K_13_10*(-P_13_11*P_13_10)+K_13_11*K_13_01*(-P_13_11*P_13_01)+ K_13_11*K_13_00*(-P_13_11*P_13_00)+K_13_10*K_13_01*(-P_13_10*P_13_01)+ K_13_10*K_13_00*(-P_13_10*P_13_00)+K_13_01*K_13_00*(-P_13_01*P_13_00)) V <- matrix(0, ncol=3, nrow=3) V[1,1] <- V_12 V[2,2] <- V_23 V[3,3] <- V_13 V[rbind(c(1,2), c(2,1))] <- c_12_23 V[rbind(c(1,3), c(3,1))] <- c_12_13 V[rbind(c(2,3), c(3,2))] <- c_13_23 J <- matrix(0, ncol=3, nrow=3) H_12 <- H_12_11*P_12_11 + H_12_10*P_12_10 + H_12_01*P_12_01 + H_12_00*P_12_00 H_23 <- H_23_11*P_23_11 + H_23_10*P_23_10 + H_23_01*P_23_01 + H_23_00*P_23_00 H_13 <- H_13_11*P_13_11 + H_13_10*P_13_10 + H_13_01*P_13_01 + H_13_00*P_13_00 J[1,1] <- H_12 J[2,2] <- H_23 J[3,3] <- H_13 J <- (-1)*J Sigma <- (solve(J) %*% V %*% solve(J)) return(Sigma) } VuongTest <- function(object, triple, chr=1){ # Perform Vuong's Closeness Test at a triple of markers # Function returns a vector with OSS, test statistic, p-value and second best ordering # p-value is for a two-sided test, ie models are different # It saves time if the input object has a rf$theta component with estimated recombination fractions # Argument triple is taken to be estimated order # Best model (from 3pt likelihood) from alternate orderings is chosen as the comparison model # In the output, if alternate==1 it means that the best alternate order was triple[c(1,3,2)] # In the output, if alternate==2 it means that the best alternate order was triple[c(3,1,2)] object <- subset(object, chr=chr) if (class(object)[1]=="mpcross"){ founders <- object$founders } else if(class(object)[1]=="ri4self"){ founders <- object$founderGeno rownames(founders) <- LETTERS[1:4] } if (is.null(object$rf$theta)) rf.hat <- estim.rf(object) else rf.hat <- object$rf$theta # Calculating OSS L.alt <- numeric(2) mod1 <- like3(object, triple[1:3], rf.hat) mod2 <- like3(object, triple[c(1,3,2)], rf.hat) mod3 <- like3(object, triple[c(3,1,2)], rf.hat) L.est <- mod1[3] L.alt <- c(mod2[3], mod3[3]) OSS <- (L.est - max(L.alt)) * log(exp(1))/log(10) theta1 <- mod1[1:2] # Which is the second best model modelb <- which.max(L.alt) if (modelb==1){ theta2 <- mod2[1:2] altord <-c(1,3,2) } else { theta2 <- mod3[1:2] altord <- c(3,1,2) } # Likelihood Ratio for numerator of test statistic LR <- (L.est - max(L.alt)) # Calculating denominator of test statistic f1 <- prob3(founders[, triple], mrks=triple, theta=theta1) f2 <- prob3(founders[, triple[altord]], mrks=triple[altord], theta=theta2) perms <- permutations(2, 3, c(0,1), repeats=TRUE) geno <- get_geno(object) geno <- geno[, triple] geno <- as.data.frame(geno) geno <- na.omit(geno) N <- nrow(geno) prob1 <- numeric(N) prob2 <- numeric(N) for (i in 1:N){ yi1 <- paste0(geno[i,], collapse="") yi2 <- paste0(geno[i,altord], collapse="") ycat1 <- match(yi1, f1$pattern) ycat2 <- match(yi2, f2$pattern) prob1[i] <- f1[ycat1, "prob"] prob2[i] <- f2[ycat2, "prob"] } # Estimated standard deviation omega <- sd(log(prob1/prob2)) test.stat <- LR/(sqrt(N)*omega) p.value <- 2*pnorm(abs(test.stat), lower.tail=FALSE) output <- c(OSS, LR, test.stat, p.value, modelb) names(output) <- c("OSS", "LR", "test.stat", "p.value", "alternate") return(output) } ## Read in genotype and map data dat4 <- read.csv("FileS2.csv", skip=1, h=T, row.names=1) ## Create object for analysis ped = sim.mpped(4, 1, nrow(dat4)-5) finals=dat4[6:nrow(dat4),] founders=dat4[2:5,] mp = mpcross(finals=finals, founders=founders, pedigree=ped, id=which(ped[,4]==1), fid=1:4) mp$map=list() mp$map[["3B"]]=as.numeric(dat4[1,]) names(mp$map[["3B"]])=colnames(dat4) ## Now run the analysis for all triples on the chromosome. subobj <- mp # Taking one marker from each bin subobj <- subset(subobj, markers=which(!duplicated(subobj$map[[1]]))) subobj <- mpestrf(subobj, r=c(seq(0, 0.1, by=0.0001), seq(0.1,0.5, by=0.01))) subobj <- mpimputerf(subobj) rf.hat <- subobj$rf$theta rf.hat[rf.hat==0] <- .0001 n.mar <- ncol(subobj$finals) first.mar <- 1:(n.mar-2) triplets <- cbind(first.mar, first.mar+1, first.mar+2) supscore <- rep(NA, nrow(triplets)) post.probs <- array(0, c(nrow(triplets), 3)) p.values <- rep(NA, nrow(triplets)) L.alt <- numeric(2) n.trip <- nrow(triplets) vuong <- list() for (j in 1:3){ print(j) triple <- triplets[j,] # Calculating order support score # like3 function returns rf estimates as well as the likelihood L.cor <- like3(subobj, triple[1:3], rf.hat)[3] L.alt[1]<- like3(subobj, triple[c(1,3,2)], rf.hat)[3] L.alt[2] <- like3(subobj, triple[c(3,1,2)], rf.hat)[3] supscore[j] <- (L.cor - max(L.alt)) * log(exp(1))/log(10) # Calculating p-value # This might require estimating the recombination fractions on a very fine grid to work properly theta.hat <- c(rf.hat[triple[1],triple[2]], rf.hat[triple[2],triple[3]], rf.hat[triple[1],triple[3]]) G <- cbind(c(-1+2*theta.hat[2], -1+2*theta.hat[1], 1)) theta.rest <- rf.hat[triple, triple] theta.rest[rbind(c(1,3), c(3,1))] <- rf.hat[triple[1],triple[2]]+ rf.hat[triple[2],triple[3]]- 2*rf.hat[triple[1],triple[2]]*rf.hat[triple[2],triple[3]] n.ind <- sum(gen_N(subobj, triple)) V <- try(gen_cov_v2(subobj, triple,theta.rest)/n.ind, silent=TRUE) # If triplet of founders includes a combination where the rf is inestimable if (class(V)=="try-error") next g <- rf.hat[triple[1],triple[3]]-rf.hat[triple[1],triple[2]]-rf.hat[triple[2],triple[3]]+2*rf.hat[triple[1],triple[2]]*rf.hat[triple[2],triple[3]] test.stat <- g * (t(G) %*% V %*% G)^{-1} * g p.value <- pchisq(test.stat, 1, lower.tail=FALSE) p.values[j] <- p.value vuong[[j]] <- VuongTest(subobj, triple) } central.marker <- unlist(subobj$map)[triplets[,2]] mat <- data.frame(MarkerPos=central.marker, OSS=supscore, vuongp=unlist(lapply(vuong, function(x) x[4])), pval=p.values) ############ Generate plots from results library(ggplot2) ## Figure 3A dat <- data.frame(Position=central.marker, MMU=-log10(p.values)) col <- rep("Certain", nrow(dat)) col[p.values < .05/111] <- "Uncertain" col[p.values < 1e-20] <- "Very Uncertain" dat$cat <- factor(col, levels=c("Certain", "Uncertain", "Very Uncertain")) dat2 <- data.frame(Position=central.marker, MMU=loess(-log10(p.values)~central.marker, span=.15)$fitted, cat=factor(rep("Very Uncertain", 111))) jpeg("Fig3A.jpg", height=5, width=5, units="in", res=600) p <- ggplot(dat, aes(x=Position, y=MMU, col=cat))+geom_point(size=3)+theme_bw()+geom_line(data=dat2) p <- p + labs(colour="Uncertainty")+xlab("Genetic Map (cM)")+theme(legend.position=c(.8, .8)) p <- p+scale_colour_manual(values=c("#a6d96a", "#fdae61", "#d7191c")) p dev.off() ## Also want to plot the p-value against the distance to closest marker or average distance mindist <- vector(length=length(poss)) for (i in 1:n.trip) mindist[i] <- min(diff(obj$map[[1]][i+0:2])) avgdist <- vector(length=length(poss)) for (i in 1:n.trip) avgdist[i] <- mean(diff(obj$map[[1]][i+0:2])) dat <- data.frame(MinDist=avgdist, MMU=-log10(p.values)) dat2 <- data.frame(MinDist=avgdist, MMU=loess(-log10(p.values)~avgdist, span=.55)$fitted, cat=factor(rep("Very Uncertain", 111))) col <- rep("Certain", nrow(dat)) col[poss<.05/111 & p.values< .05/111] <- "Uncertain MMU & OSS" col[poss>.05/111 & p.values < .05/111] <- "Uncertain MMU" col[poss<.05/111 & p.values > .05/111] <- "Uncertain OSS" dat$cat <- factor(col, levels=c("Certain", "Uncertain MMU", "Uncertain OSS", "Uncertain MMU & OSS")) dat$avgdist <- avgdist jpeg("Fig3B.jpg", height=5, width=5, units="in", res=600) p <- ggplot(dat, aes(x=MinDist, y=MMU, col=cat))+geom_point(size=3)+theme_bw()+scale_colour_manual(values=c("#a6d96a", "#fdae61", "#6baed6", "#d7191c")) p <- p + labs(colour="Uncertainty")+xlab("Average distance between markers (cM)")+theme(legend.position=c(.7, .8)) p+scale_x_log10() dev.off() ### Read in data from TableS2.xls seq3b <- list() for (i in 1:4) seq3b[[i]] <- read.xls("TableS2.xls", sheet=i) pos <- do.call("rbind", lapply(seq3b, function(x) x[,c(1,3)])) pos <- pos[!duplicated(pos[,1]),] mrk <- intersect(pos[,1], colnames(mp$finals)) index <- sapply(mp$map[[1]][match(mrk, names(mp$map[[1]]))], function(x) which.min(abs(central.marker-x))) dat <- data.frame(PhysMap=pos[match(mrk, pos[,1]),2], GenMap=mp$map[[1]][match(mrk, names(mp$map[[1]]))], Signif=-log10(p.values[index])) SigCat <- vector(length=nrow(dat)) SigCat[dat$Signif < -log10(.05/111)] <- 1 SigCat[dat$Signif > -log10(.05/111) & dat$Signif < 20] <- 2 SigCat[dat$Signif > 20] <- 3 ## Need to mark markers where genetic and physical positions do not match up. index <- sapply(c(64656, 19180, 40595, 497469, 19982, 17082, 491667, 108114, 16378, 19778, 1379957, 444579), function(x) grep(x, rownames(dat))) incon <- rep(0, nrow(dat)) incon[index] <- 1 dat$Inconsistency <- factor(incon) dat$Uncertainty <- factor(SigCat) dat2 <- dat[!is.na(dat[,3]),] jpeg("Fig4.jpg", height=5, width=5, units="in", res=600) p <- ggplot(dat2, aes(y=PhysMap, x=GenMap, col=Uncertainty, shape=Inconsistency))+theme_bw()+geom_point(size=3)+scale_colour_manual(values=c("#a6d96a", "#fdae61", "#d7191c"), labels=c("MMU<5", "520"))+scale_shape_manual(values=c(19, 8)) p <- p+theme(legend.position=c(.8, .6))+scale_x_continuous("Genetic Map (cM)") + scale_y_continuous("Physical Map (bp)") p <- p+guides(shape=F) p dev.off()