#### PART 1: set up covariance matrix ##################################

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Haplo2Geno
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# CALL: Haplo2Geno( inpMat )
# ARGUMENTS: inpMat: [MATRIX] Haplotype matrix; two consecutive rows form one individual.
# TODO: use Rcpp to speed up the row-pairing loop.
Haplo2Geno = function( inpMat ) {
  n = dim( inpMat )[1]
  p = dim( inpMat )[2]
  outMat = matrix( NA, nrow = n/2, ncol = p )
  # Sum each pair of haplotype rows into one genotype row (0/1/2)
  for( i in 1:(n/2) ) outMat[ i, ] = inpMat[ (2*i - 1), ] + inpMat[ 2*i, ]
  colnames(outMat) = colnames(inpMat) # Retain original column names
  return( outMat )
}

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Expectation matrices
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Read each entry of the genotype matrix of the sires and apply the following replacement rule:
# 1. If A is coded as 1, then 2 (AA) -> 0.5, 1 (AB, BA) -> 0, 0 (BB) -> -0.5, or
# 2. if A is coded as 0, then 0 (AA) -> 0.5, 1 (AB, BA) -> 0, 2 (BB) -> -0.5.
# CALL: ExpectMat( inMat )
# ARGUMENT(S): 1. inMat:    [MATRIX, VECTOR] The paternal genotype matrix or vector.
#              2. A_coding: [0, 1] Coding of allele A.
ExpectMat = function( inMat, A_coding = 1 ) {
  if( A_coding == 1 ) {
    rule = function( index ) ifelse( index == 2, 0.5, ifelse( index == 0, -0.5, 0 ) )
  } else rule = function( index ) ifelse( index == 0, 0.5, ifelse( index == 2, -0.5, 0 ) )
  # is.matrix()/is.numeric() instead of class(inMat) == "matrix": since R 4.0.0 a matrix
  # has class c("matrix", "array"), so the class() comparison no longer has length 1
  if( is.matrix( inMat ) ) ExP.Fa = apply( inMat, 2, rule )
  else if( is.numeric( inMat ) ) ExP.Fa = sapply( inMat, rule )
  return( ExP.Fa )
}
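# --- Usage sketch (illustrative only; toy data, not from the original pipeline) ---
# Two individuals x four markers: haplotype rows come in pairs per individual.
if( FALSE ) {
  H = matrix( rbinom( 4 * 4, 1, 0.5 ), nrow = 4,
              dimnames = list( NULL, paste0( "M", 1:4 ) ) )
  G = Haplo2Geno( H )  # 2 x 4 genotype matrix with entries 0/1/2
  E = ExpectMat( G )   # 2 -> 0.5, 1 -> 0, 0 -> -0.5 (A coded as 1)
}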
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# LD matrices
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# === === === CALCULATE THE LINKAGE DISEQUILIBRIUM MATRIX FOR THE DAMS === === ===
# CALL: LDdam( inMat, pos_chr )
# ARGUMENT(S): 1. inMat:   [MATRIX] The maternal HAPLOTYPE matrix.
#              2. pos_chr: [LIST] The positions of the markers on each chromosome.
# The function generates a block-diagonal sparse matrix based on Matrix::bdiag.
# Use as.matrix() to obtain a regular one.
# To do list:
#  1. Progress bar would be nice :)
LDdam = function( inMat, pos_chr ) {
  # Check and, if necessary, install the Matrix package
  if( !require( "Matrix" ) ) install.packages( "Matrix", dependencies = TRUE )
  length_chr = vector( mode = "list", length = length( pos_chr ) )
  inMatByChr = vector( mode = "list", length = length( pos_chr ) )
  Dd         = vector( mode = "list", length = length( pos_chr ) )
  for( nc in seq_along( pos_chr ) ) {
    length_chr[[ nc ]] = length( pos_chr[[ nc ]] )
    # Split inMat by chromosome
    if( nc == 1 ) inMatByChr[[ nc ]] = inMat[ , 1:sum( unlist( length_chr[ 1:nc ] ) ) ]
    else inMatByChr[[ nc ]] = inMat[ , ( sum( unlist( length_chr[ 1:( nc - 1 ) ] ) ) + 1 ):sum( unlist( length_chr[ 1:nc ] ) ) ]
    n = dim( inMatByChr[[ nc ]] )[1]
    p = dim( inMatByChr[[ nc ]] )[2]
    haplo.freq = array( numeric(), dim = c( 2, 2, p, p ) )
    Dd[[ nc ]] = matrix( NA, nrow = p, ncol = p )
    for( j in 1:p ) {
      for( k in j:p ) { # Calculates the upper triangular part plus diagonal
        # 2 x 2 table of haplotype counts at loci j and k
        zz = sum( !inMatByChr[[ nc ]][ , j ] & !inMatByChr[[ nc ]][ , k ] ) # zero-zero
        oz = sum(  inMatByChr[[ nc ]][ , j ] & !inMatByChr[[ nc ]][ , k ] ) # one-zero
        zo = sum( !inMatByChr[[ nc ]][ , j ] &  inMatByChr[[ nc ]][ , k ] ) # zero-one
        oo = sum(  inMatByChr[[ nc ]][ , j ] &  inMatByChr[[ nc ]][ , k ] ) # one-one
        haplo.freq[ , , j, k ] = 1/n * matrix( c( zz, oz, zo, oo ), ncol = 2 )
        # D = f00 * f11 - f01 * f10, i.e. the determinant of the frequency table
        Dd[[ nc ]][ j, k ] = det( haplo.freq[ , , j, k ] )
      }
    }
    Dd[[ nc ]] = Matrix::forceSymmetric( Dd[[ nc ]] ) # The whole matrix
  }
  return( Reduce( Matrix::bdiag, Dd ) ) # Creates a block-diagonal sparse matrix
}
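# --- Usage sketch (illustrative only; toy data, not from the original pipeline) ---
# Maternal LD from 100 haplotypes on two chromosomes with 3 markers each;
# note that LDdam() uses the positions only to count markers per chromosome.
if( FALSE ) {
  Hm  = matrix( rbinom( 100 * 6, 1, 0.5 ), nrow = 100, ncol = 6 )
  pos = list( c( 0.00, 0.10, 0.20 ), c( 0.00, 0.05, 0.15 ) ) # one vector per chromosome
  Dd  = LDdam( Hm, pos ) # 6 x 6 block-diagonal sparse matrix (two 3 x 3 blocks)
  as.matrix( Dd )
}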
# === === === LINKAGE DISEQUILIBRIUM FOR THE SIRES === === ===
# CALL: LDsire( inMat, pos_chr, family, map_fun )
# ARGUMENT(S): 1. inMat:   [MATRIX] Haplotype matrix of the sires for all chromosomes.
#              2. pos_chr: [LIST] The positions of the markers on each chromosome.
#              3. family:  [VECTOR] Which family (sire) should be processed?
#                 Vector with consecutive entries of the form 1:2, 3:4, 5:6 and so on,
#                 but not of the form 2:3, 4:5 and so on (the two haplotype rows of one sire).
#              4. map_fun: ["haldane", "kosambi"] The mapping function applied.
# The function generates a block-diagonal sparse matrix based on Matrix::bdiag.
# Use as.matrix() to obtain a regular one.
LDsire = function( inMat, pos_chr, family, map_fun = "haldane" ) {
  # Check and, if necessary, install the Matrix package
  if( !require( "Matrix" ) ) install.packages( "Matrix", dependencies = TRUE )
  # Mapping functions: map distance between loci j and k -> recombination rate theta
  # 1. Haldane
  if( map_fun == "haldane" ) theta = function( pos_chr, nc, j, k ) {
    0.5 * ( 1 - exp( -2 * abs( pos_chr[[ nc ]][k] - pos_chr[[ nc ]][j] ) ) )
  }
  # 2. Kosambi
  if( map_fun == "kosambi" ) theta = function( pos_chr, nc, j, k ) {
    0.5 * tanh( 2 * abs( pos_chr[[ nc ]][k] - pos_chr[[ nc ]][j] ) )
  }
  if( length( family ) != 2 ) stop( "Number of rows is not correct." )
  inMat = inMat[ family, ]
  length_chr = vector( mode = "list", length = length( pos_chr ) )
  inMatByChr = vector( mode = "list", length = length( pos_chr ) )
  Ds         = vector( mode = "list", length = length( pos_chr ) )
  for( nc in seq_along( pos_chr ) ) {
    length_chr[[ nc ]] = length( pos_chr[[ nc ]] )
    # Split inMat by chromosome
    if( nc == 1 ) inMatByChr[[ nc ]] = inMat[ , 1:length( pos_chr[[ nc ]] ) ]
    else inMatByChr[[ nc ]] = inMat[ , ( sum( unlist( length_chr[ 1:( nc - 1 ) ] ) ) + 1 ):sum( unlist( length_chr[ 1:nc ] ) ) ]
    p = dim( inMatByChr[[ nc ]] )[2]
    Ds[[ nc ]] = matrix( NA, nrow = p, ncol = p )
    for( j in 1:p ) {
      for( k in j:p ) { # Calculates the upper triangular part plus diagonal
        # Non-zero LD only if the sire is heterozygous at both loci
        if( sum( inMatByChr[[ nc ]][ , j ] ) == 1 & sum( inMatByChr[[ nc ]][ , k ] ) == 1 ) {
          if( sum( inMatByChr[[ nc ]][ 1, j ], inMatByChr[[ nc ]][ 1, k ] ) == 1 )
            Ds[[ nc ]][ j, k ] = -0.25 * ( 1 - 2 * theta( pos_chr, nc, j, k ) ) # repulsion phase
          else
            Ds[[ nc ]][ j, k ] =  0.25 * ( 1 - 2 * theta( pos_chr, nc, j, k ) ) # coupling phase
        } else Ds[[ nc ]][ j, k ] = 0
      }
    }
    Ds[[ nc ]] = Matrix::forceSymmetric( Ds[[ nc ]] ) # The whole matrix
  }
  return( Reduce( Matrix::bdiag, Ds ) ) # Creates a block-diagonal sparse matrix
}
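# --- Usage sketch (illustrative only; toy data, not from the original pipeline) ---
# Within-family LD for a single sire whose two phased haplotypes occupy rows 1:2;
# the sire is heterozygous at all four loci, so every entry is +-0.25 * (1 - 2 * theta).
if( FALSE ) {
  Hs  = matrix( c( 1, 0, 1, 0,
                   0, 1, 0, 1 ), nrow = 2, byrow = TRUE )
  pos = list( c( 0.00, 0.10, 0.20, 0.30 ) ) # one chromosome
  Ds  = LDsire( Hs, pos, family = 1:2, map_fun = "haldane" )
  as.matrix( Ds ) # diagonal entries equal 0.25
}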
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Covariance matrices
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# CALL: CovarMatrix( exp_freq_mat, LDDam, LDSire, Ns )
# ARGUMENTS: exp_freq_mat: [MATRIX] The EXPECTATION matrix, one row per sire family
#                          (split into a list of row vectors internally).
#            LDDam:  [MATRIX] The linkage disequilibrium matrix of the dams.
#            LDSire: [LIST] The linkage disequilibrium matrices of the sires;
#                    each element of the list corresponds to a family.
#            Ns:     [VECTOR] The family size for each sire s.
# REMARK: The internal suMM function works on lists!
CovarMatrix = function( exp_freq_mat, LDDam, LDSire, Ns ) {
  # Weighted sum over the elements of a list; weights are the relative family sizes
  suMM = function( InsertList ) Reduce( "+", Map( "*", InsertList, Ns/sum( Ns ) ) )
  # --- --- --- Process the EXPECTATION matrices --- --- ---
  e1 = list()
  for( i in 1:dim( exp_freq_mat )[1] ) { e1[[i]] = exp_freq_mat[ i, ] }
  e2 = parallel::mclapply( e1, tcrossprod ) # mclapply() comes from the base package "parallel"
  # --- --- --- Compute the COVARIANCE matrix --- --- ---
  sm = suMM( e1 )
  covK = LDDam + suMM( LDSire ) + suMM( e2 ) - tcrossprod( sm )
  return( covK )
}

#### PART 2: collection of functions ##################################

# variance of beta_k, threshold and degrees of freedom
ratio <- function(lambda, eigendec, n, weights = 1, alpha = 0.01){
  p <- length(eigendec$values)
  if (length(weights) < p) weights <- rep(weights[1], p)
  V <- eigendec$vectors
  df.mod <- sum(eigendec$values / (eigendec$values + lambda / n))
  df.res <- sum(eigendec$values * (eigendec$values + 2 * lambda / n) / (eigendec$values + lambda / n) ^ 2)
  threshold <- vb <- NA
  if(df.res >= n) {
    cat("WARNING: df.res", df.res, '(>= n) \n')
  } else {
    threshold <- qnorm(1 - alpha / 2)
    vb <- sapply(1:p, function(k) {1 / weights[k] *
      sum(V[k, ]^2 * eigendec$values / (eigendec$values + lambda / n) ^ 2)}) # up to a constant sigmaE2 / n
  }
  out <- list(df = df.res, var.beta = vb, threshold = threshold)
  return(out)
}

# expectation of beta_k
coeff.beta.k <- function(k, beta.true, lambda, eigendec, n, weights = 1){
  p <- length(eigendec$values)
  if (length(weights) < p) weights <- rep(weights[1], p)
  V <- eigendec$vectors
  d <- eigendec$values / (eigendec$values + lambda / n)
  z <- c()
  for(i in 1:length(d)) {
    z[i] <- sum(d * V[i, ] * V[k, ]) # coefficient for beta_i, then E(beta_k) = sum(coeff_(i,k) * beta_i)
  }
  return(1 / sqrt(weights[k]) * sum( z * sqrt(weights) * beta.true ))
}

# sample QTL positions
position.qtl <- function(p, nqtl, nchr = 1){
  p <- round(p / nchr)
  pos <- list(sample(1:round(quantile(1:p, 0.25)), nqtl),                          # beginning of 1st chromosome
              sample(round(quantile(1:p, 0.25)):round(quantile(1:p, 0.75)), nqtl), # middle
              sample(round(quantile(1:p, 0.75)):p, nqtl),                          # end
              sample(1:p, nqtl))                                                   # arbitrary position
  names(pos) <- c('beginning', 'middle', 'end', 'arbitrary')
  return(pos)
}

# power of the t-test in the single-locus approach
pwr.ttest1 <- function(n, x, alpha = 0.01){
  quant <- qt(1 - alpha / 2, n - 1)
  pt(quant, df = n - 1, ncp = x * sqrt(n), lower.tail = F) +
    pt(-quant, df = n - 1, ncp = x * sqrt(n))
}

# minimum n so that df.res < n
start.value <- function(lambda, R, nfam, weights = 1, alpha = 0.01){
  p <- nrow(R)
  if (length(weights) < p) weights <- rep(weights[1], p)
  if (sum(weights == 1) != p ){
    R <- t(apply(R, 1, function(x) {x / sqrt(weights)}))
    R <- apply(R, 2, function(x) {x / sqrt(weights)})
  } # -> leads to a "near-symmetric" matrix
  e <- eigen(R)
  repeat{
    res <- ratio(lambda, e, nfam, weights, alpha)
    if(!is.na(res$threshold)) break
    nfam <- nfam + 10
    if(nfam == 1000) {
      print('ERROR: no start value obtained')
      break
    }
  }
  return(nfam)
}

# adaptation of simpleM: considers the theoretical covariance instead of composite LD values
simpleM <- function(mat){
  e <- eigen(mat, only.values = T)$values
  s <- sum(e)
  p <- length(e)
  ss <- sapply(1:p, function(x) {sum(e[1:x]) / s})
  m <- min(which(ss >= 0.995)) # smallest number of eigenvalues explaining 99.5% of the variation
  return(m)
}
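# --- Usage sketch (illustrative only): an AR(1)-type correlation matrix R0 stands in
# for the marker correlation matrix that PART 4 computes from simulated data.
if (FALSE) {
  p0 <- 50
  R0 <- 0.3 ^ abs(outer(1:p0, 1:p0, "-"))
  e0 <- eigen(R0)
  ratio(lambda = 1, eigendec = e0, n = 100) # residual df, var(beta_k) up to sigmaE2/n, threshold
  simpleM(R0)                               # effective number of independent tests
}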
# at least one significant SNP in a window around a QTL = 1 TP (<= nqtl)
fpr <- function(sign.snp, snp.pos, qtl.pos, win = 1e-2){ # window around true QTL in map distance (e.g. cM)
  len <- length(snp.pos)
  pos.a <- qtl.pos - win
  pos.e <- qtl.pos + win
  qtl.map <- data.frame(qtl.pos, pos.a, pos.e)
  # SNPs lying next to causative variants
  qtlsnp <- found.qtl <- c()
  for (l in 1:length(qtl.pos)) {
    interval <- which((snp.pos >= qtl.map$pos.a[l]) & (snp.pos <= qtl.map$pos.e[l]))
    qtlsnp <- c(qtlsnp, interval)
    found.qtl[l] <- any(sign.snp %in% interval)
  }
  tp <- sum(found.qtl)
  fp <- sum(!(sign.snp %in% qtlsnp))
  fn <- sum(!found.qtl)
  tn <- sum(setdiff(1:len, sign.snp) %in% setdiff(1:len, qtlsnp))
  return(list(sens = tp / (tp + fn), spec = tn / (fp + tn),
              tp = tp, fp = fp, tn = tn, fn = fn, found.qtl = found.qtl))
}

# loci at which the allele coded 1 has frequency < 0.5; these are swapped by recode.anything()
recode.loci <- function(pop){
  X <- pullSegSiteGeno(pop)
  freqs <- colSums(X) / (2 * nrow(X))
  rec <- which(freqs < 0.5)
  return(rec)
}

# swap the allele coding (0 <-> 1 for haplotypes, 0 <-> 2 for genotypes) at the given loci
recode.anything <- function(mat, loci, geno = F){
  if(!geno){ # haplotypes
    for(l in loci){
      eins <- mat[, l] == 1 # "eins" = one
      mat[mat[, l] == 0, l] <- 1
      mat[eins, l] <- 0
    }
  } else { # genotypes
    for(l in loci){
      zwei <- mat[, l] == 2 # "zwei" = two
      mat[mat[, l] == 0, l] <- 2
      mat[zwei, l] <- 0
    }
  }
  return(mat)
}

#### PART 3: sample size calculation ##################################
# determine the optimal sample size at given power

# power of the multi-locus (ridge) test at sample size n
pwr.normtest <- function(R, n, betaSE, lambda, pos, weights = 1, alpha = 0.01){
  p <- nrow(R)
  if (length(weights) < p) weights <- rep(weights[1], p)
  if (sum(weights == 1) != p ){
    R <- t(apply(R, 1, function(x) {x / sqrt(weights)}))
    R <- apply(R, 2, function(x) {x / sqrt(weights)})
  } # -> leads to a "near-symmetric" matrix
  e <- eigen(R)
  # e <- eigen(diag(1 / sqrt(weights)) %*% R %*% diag(1 / sqrt(weights)))
  beta.true <- rep(0, p); beta.true[pos] <- betaSE
  x <- t(beta.true) %*% R %*% beta.true
  h2.ld <- x / (1 + x)
  h2.le <- sum(beta.true^2) / (1 + sum(beta.true^2))
  out <- ratio(lambda, e, n, weights, alpha)
  eb <- sapply(1:p, function(k) {coeff.beta.k(k, beta.true, lambda, e, n, weights)})
  term <- eb / sqrt(out$var.beta) * sqrt(n)
  pwr <- pnorm(out$threshold - term, lower.tail = F) + pnorm(-out$threshold - term)
  result <- mean(pwr[pos]) # OR: min(pwr[pos])
  return(c(result, h2.le, h2.ld))
}
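# --- Usage sketch (illustrative only): power for three QTL on the toy R0 matrix from
# above; betaSE is the true effect size on the standardised scale.
if (FALSE) {
  p0 <- 50
  R0 <- 0.3 ^ abs(outer(1:p0, 1:p0, "-"))
  pwr.normtest(R0, n = 500, betaSE = 0.3, lambda = 1, pos = c(10, 25, 40))
  # -> c(mean power at the QTL positions, h2 under linkage equilibrium, h2 under LD)
}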
# bisection search for the smallest n that achieves power 1 - typeII
search.best.n.bisection <- function(R, betaSE, lambda, pos, nstart, nmax, typeII, alpha = 0.01){
  y <- c(); k <- 0
  n1 <- nstart
  n3 <- nmax
  n2 <- ceiling(mean(c(n1, n3)))
  y[n1] <- pwr.normtest(R, n1, betaSE, lambda, pos, alpha = alpha)[1]
  y[n2] <- pwr.normtest(R, n2, betaSE, lambda, pos, alpha = alpha)[1]
  y[n3] <- pwr.normtest(R, n3, betaSE, lambda, pos, alpha = alpha)[1]
  repeat{
    if(y[n3] < 1 - typeII) {
      cat('WARNING: power', 1 - typeII, 'cannot be achieved. Increase nmax. \n')
      return(n3)
    }
    if(all(y[c(n1, n2, n3)] >= 1 - typeII)){
      return(n1)
    } else if(all(y[c(n2, n3)] >= 1 - typeII)){ # solution lies in the lower half
      n3 <- n2
      n2 <- ceiling(mean(c(n1, n3)))
      y[n2] <- pwr.normtest(R, n2, betaSE, lambda, pos, alpha = alpha)[1]
    } else if(y[n3] >= 1 - typeII){ # solution lies in the upper half
      n1 <- n2
      n2 <- ceiling(mean(c(n1, n3)))
      y[n2] <- pwr.normtest(R, n2, betaSE, lambda, pos, alpha = alpha)[1]
    }
    k <- k + 1
    if(k > log2(nmax - nstart)) return(n2) # maximum number of bisection steps reached
  }
}
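# --- Usage sketch (illustrative only): smallest n reaching power 0.8 (typeII = 0.2)
# on the toy R0 matrix; start.value() supplies a feasible lower bound for n.
if (FALSE) {
  p0 <- 50
  R0 <- 0.3 ^ abs(outer(1:p0, 1:p0, "-"))
  n0 <- start.value(lambda = 1, R = R0, nfam = 50)
  search.best.n.bisection(R0, betaSE = 0.3, lambda = 1, pos = c(10, 25, 40),
                          nstart = n0, nmax = 5000, typeII = 0.2)
}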
#### PART 4: data simulation ##################################

# simulate a recent population and determine the correlation matrices between markers;
# requires the AlphaSimR and foreach packages and relies on the globals
# nmothers.max, nfathers.max, nfam, p and h2 being set by the calling script
sim_data <- function(wdh = '', dir = '', recode = F){
  founder.file <- file.path(dir, paste0('popE_', wdh, '.RData'))
  ### simulate founder population ########################################
  u <- 0
  repeat{
    founderPop <- runMacs2(nInd = 2 * nmothers.max, nChr = 1, segSites = p, Ne = 100,
                           bp = 1e+08, genLen = 0.01, mutRate = 2.5e-08)
    assign("SP", SimParam$new(founderPop), envir = globalenv()) # SP <- SimParam$new(founderPop)
    SP$setGender("yes_sys")
    SP$addTraitA(nQtlPerChr = 1)
    SP$addTraitA(nQtlPerChr = 2)
    SP$addTraitA(nQtlPerChr = 3)
    SP$addTraitA(nQtlPerChr = 4)
    SP$addTraitA(nQtlPerChr = 5)
    SP$setVarE(h2 = rep(h2, SP$nTraits))
    for(l in 1:SP$nTraits){
      trait <- SP$traits[[l]]
      trait@addEff <- rep(1, trait@nLoci) # Change additive effects to ones
      trait@intercept <- 0                # Change intercept to zero
      SP$switchTrait(trait, l)            # Replace trait at position l
    }
    SP$rescaleTraits(mean = rep(0, SP$nTraits), var = rep(1, SP$nTraits),
                     varEnv = rep(0, SP$nTraits), varGxE = rep(1e-06, SP$nTraits))
    pop <- newPop(founderPop)
    if(all(is.finite(varA(pop)))) break
    u <- u + 1
    if (u == 10){
      cat('ERROR: no meaningful simulation in repetition', wdh, '\n')
      stop()
    }
  }
  map.snp <- founderPop@genMap[[1]]
  rec <- recode.loci(pop)
  save(list = c('SP', 'founderPop', 'pop', 'rec'), file = founder.file)
  my_pop <- selectCross(pop = pop, nFemale = nmothers.max, nMale = nfathers.max,
                        use = "pheno", nCrosses = nmothers.max, trait = 1)
  # maternal haplotypes of the progeny
  H.mat <- pullSegSiteHaplo(my_pop, haplo = 1)
  H.mat <- recode.anything(H.mat, rec)
  # sire haplotypes
  sire <- pop@id[match(my_pop@father, pop@id)]
  rown <- paste(rep(unique(sire), each = 2), c(1, 2), sep = '_')
  H.sire <- pullSegSiteHaplo(pop)[rown, ]
  H.sire <- recode.anything(H.sire, rec)
  ### SET UP COVARIANCE MATRIX ########################################
  for(N in c(1, 5, 10)){
    ### prepare data for covariance matrix ###
    haploSire <- H.sire[1:(2 * N), ]
    haploMo <- H.mat
    XSire <- list(matrix(sapply(1:ncol(haploSire), function(k){
      haploSire[seq(1, 2 * N, 2), k] + haploSire[seq(2, 2 * N, 2), k]
    }), nrow = N, ncol = p))
    pos_chr <- list(map.snp)
    ### calculate covariance matrix ###
    expectationMat <- matrix(parallel::mclapply( XSire, ExpectMat )[[1]], nrow = N, ncol = p)
    linkDam <- LDdam( inMat = haploMo, pos_chr )
    fam <- vector( mode = "list", length = length( 1:N ) )
    for( l in 1:N ) fam[[l]] <- ( 2*l - 1 ):( 2*l )
    # Output is a list of lists: first level -> experiments, second level -> matrices for each family
    linkSire <- foreach( indexFam = 1:N ) %do%
      LDsire( inMat = haploSire, pos_chr, family = fam[[indexFam]], map_fun = "haldane" )
    Ns <- rep( nfam, N )
    CovMat <- CovarMatrix( expectationMat, linkDam, linkSire, Ns )
    ### correlation matrix ###
    s <- diag(CovMat)
    id <- s > 1e-6 # drop loci with (near-)zero variance
    CovMat <- CovMat[id, id]; s <- s[id]
    R <- t(apply(CovMat, 1, function(x) {x / sqrt(s)}))
    R <- apply(R, 2, function(x) {x / sqrt(s)})
    saveRDS(R, file = file.path(dir, paste0('Rmatrix_window_N', N, '_', wdh, '.rds')))
  }
}

#### PART 5: data analysis ##################################

# estimate marker effects in a sample of the recent population based on
# (1) single-locus regression (GWAS) or (2) multiple-locus regression (ridge/SNP-BLUP);
# requires AlphaSimR and the commercial asreml package (asreml-R 3 interface)
estimate <- function(pop, nfam, nfathers, nmothers.max, p, coltrait, rec){
  n <- nfam * nfathers
  ## determine recent population
  my_pop <- selectCross(pop = pop, nFemale = nmothers.max, nMale = nfathers,
                        use = "pheno", nCrosses = n, trait = coltrait)
  # family-wise centred genotype and phenotype matrix
  X <- pullSegSiteGeno(my_pop)
  X <- recode.anything(X, rec, geno = T)
  M <- apply(X, 2, function(y){tapply(y, my_pop@father, mean)}) # family means per locus
  if(nfathers == 1) {
    M <- matrix(M, nrow = nfathers, ncol = p)
    rownames(M) <- my_pop@father[1]
  }
  y <- pheno(my_pop)[, coltrait]
  Xc <- matrix(ncol = p, nrow = n)
  for(s in unique(my_pop@father)){
    id <- which(my_pop@father == s)
    Xc[id, ] <- sapply(1:ncol(X), function(i){(X[id, i] - M[s, i])})
    y[id] <- y[id] - mean(y[id])
  }
  Xs <- apply(Xc, 2, function(x){if(sd(x) > 1e-6) x / sd(x) else x})
  y <- y / sd(y)
  ## GWAS
  Meff <- simpleM(t(Xs) %*% Xs)
  ### GWAS accounts for confounding by pedigree
  ANIMAL <- factor(1:n + nfathers)
  dadid <- factor(rep(1:nfathers, each = nfam))
  momid <- rep(NA, length(ANIMAL))
  pedigree <- data.frame(ANIMAL, dadid, momid)
  ainv <- asreml.Ainverse(pedigree)$ginv
  pval.gwas <- apply(Xs, 2, function(z){
    data <- data.frame(y, z)
    data$id <- ANIMAL
    mod <- asreml(fixed = y ~ z, random = ~ ped(id, var = T, init = 1), data = data,
                  ginverse = list(id = ainv), na.method.X = "omit", na.method.Y = "omit",
                  control = asreml.control(trace = FALSE))
    beta.SE <- sqrt(mod$vcoef$fixed[1] * mod$sigma2)
    if(beta.SE > 1e-6){
      pval <- 2 * pnorm(abs(mod$coefficients$fixed[1]) / beta.SE, lower.tail = F)
    } else pval <- 1
    return(pval)
  })
  ## SNP-BLUP approach
  data <- data.frame(y, Xs)
  ridge <- asreml(fixed = y ~ 1, random = ~ grp(geno), group = list(geno = 2:ncol(data)),
                  data = data, control = asreml.control(trace = F),
                  na.method.X = "omit", na.method.Y = "omit", workspace = 1024e+06)
  ridge$Vu <- summary(ridge)$varcomp$component[1]
  ridge$u.var <- ridge$vcoef$random * ridge$sigma2 # var for u - uHat
  u.var <- ridge$Vu - ridge$u.var                  # var for uHat, Searle p. 272
  id <- u.var > 1e-6
  pval.ridge <- rep(1, p)
  pval.ridge[id] <- 2 * pnorm(abs(ridge$coefficients$random[id]) / sqrt(u.var[id]), lower.tail = F)
  list(Meff = Meff, pval.gwas = pval.gwas, pval.ridge = pval.ridge)
}
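# --- Usage sketch (illustrative only): sim_data() and estimate() expect several
# globals set by a driver script; the names below mirror how the functions reference
# them, but the values are placeholders, not the original study settings.
if (FALSE) {
  library(AlphaSimR)
  library(foreach)
  p            <- 1000 # number of segregating sites per chromosome
  h2           <- 0.3  # heritability per trait
  nfam         <- 50   # progeny per sire family
  nfathers.max <- 10
  nmothers.max <- 500
  sim_data(wdh = 1, dir = tempdir())
}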