# Article title ----
# High-quality, genome-wide SNP genotypic data for pedigreed germplasm of the
# diploid outbreeding species apple, peach, and sweet cherry through a common
# workflow
#
# Authors ----
# Stijn Vanderzande, Nicholas P Howard, Lichun Cai, Cassia Da Silva Linge,
# Laima Antanaviciute, Marco CAM Bink, Johannes W Kruisselbrink, Nahla Bassil,
# Ksenija Gasic, Amy Iezzoni, Eric Van de Weg, Cameron Peace
#
# S3 File ----
# R-scripts used to confirm and deduce P(P)C relationships.
#
# Data layout used by the code below: a data frame with columns named Ind,
# Parent1 and Parent2, and marker genotypes (strings) from column 4 onwards.
# NOTE(review): the code assumes Ind/Parent1/Parent2 are the first three
# columns -- confirm against the input file. An unknown parent is coded "-".
#
# The functions read three global vectors that are defined in the
# "Define Alleles and Missing Values" section further down:
# AlleleList, MissGT and MissAllele. They must exist before any function
# is called.

library(stringr)
library(svMisc)

## Functions ----

# Indices of the marker columns: every column after the third one.
# Returns integer(0) when the data frame has three columns or fewer.
MarkerCols <- function(GTData) {
  seq_len(ncol(GTData))[-(1:3)]
}

# Occurrences of every allele in AlleleList within one genotype string.
# Returns an integer vector named by allele.
CountAlleles <- function(Genotype) {
  vapply(
    AlleleList,
    function(Allele) str_count(Genotype, Allele),
    integer(1)
  )
}

# TRUE when ErrCount is a single value that converts to a number and that
# number is <= Threshold. Non-numeric placeholders such as "-" give FALSE
# instead of an NA that would break an `if`.
WithinThreshold <- function(ErrCount, Threshold) {
  if (length(ErrCount) != 1) {
    return(FALSE)
  }
  ErrCount <- suppressWarnings(as.numeric(ErrCount))
  !is.na(ErrCount) && ErrCount <= Threshold
}

# Recode missing values in the marker columns.
#
# Every pattern in MissGT is replaced by "--" and afterwards every pattern in
# MissAllele is replaced by "-". Patterns are passed to gsub() as regular
# expressions. Columns 1-3 are left untouched.
#
# GTData: genotype data frame (see layout at the top of the file).
# Returns GTData with recoded marker columns.
AdjustMV <- function(GTData) {
  Cols <- MarkerCols(GTData)
  if (length(Cols) == 0) {
    return(GTData)
  }
  # List-style indexing keeps a data frame even with a single marker column.
  for (Code in MissGT) {
    GTData[Cols] <- lapply(
      GTData[Cols], gsub,
      pattern = Code, replacement = "--"
    )
  }
  for (Code in MissAllele) {
    GTData[Cols] <- lapply(
      GTData[Cols], gsub,
      pattern = Code, replacement = "-"
    )
  }
  GTData
}

# Parent-child check for one marker.
#
# GenotypeIndPar: character vector c(individual, parent) for one marker.
# Returns 1 when both genotypes are free of "-" and the parent carries none of
# the alleles found in the individual, otherwise 0.
CheckParErr <- function(GenotypeIndPar) {
  Par1Err <- 0
  if (str_count(GenotypeIndPar[1], "-") == 0) {
    IndAlleles <- CountAlleles(GenotypeIndPar[1]) > 0
    Par1Count <- CountAlleles(GenotypeIndPar[2])
    Par1Called <- str_count(GenotypeIndPar[2], "-") == 0
    if (Par1Called && !any(Par1Count[IndAlleles] > 0)) {
      Par1Err <- 1
    }
  }
  Par1Err
}

# Parent-parent-child check for one marker.
#
# GenotypeIndParPar: character vector c(individual, parent1, parent2) for one
# marker.
# Returns c(Par1Err, Par2Err, ParParErr), each 0 or 1:
#   Par1Err / Par2Err - that parent is called and shares no allele with the
#                       individual;
#   ParParErr         - either single-parent error occurred, or both parents
#                       are called and at least one allele of the individual
#                       is absent from the two parents combined.
# All three stay 0 when the individual's genotype contains "-".
CheckParParErr <- function(GenotypeIndParPar) {
  Par1Err <- 0
  Par2Err <- 0
  ParParErr <- 0
  if (str_count(GenotypeIndParPar[1], "-") == 0) {
    IndAlleles <- CountAlleles(GenotypeIndParPar[1]) > 0
    Par1Count <- CountAlleles(GenotypeIndParPar[2])
    Par2Count <- CountAlleles(GenotypeIndParPar[3])
    Par1Called <- str_count(GenotypeIndParPar[2], "-") == 0
    Par2Called <- str_count(GenotypeIndParPar[3], "-") == 0

    if (Par1Called && !any(Par1Count[IndAlleles] > 0)) {
      Par1Err <- 1
      ParParErr <- 1
    }
    if (Par2Called && !any(Par2Count[IndAlleles] > 0)) {
      Par2Err <- 1
      ParParErr <- 1
    }
    CombCount <- Par1Count + Par2Count
    if (ParParErr == 0 && Par1Called && Par2Called &&
        !all(CombCount[IndAlleles] > 0)) {
      ParParErr <- 1
    }
  }
  c(Par1Err, Par2Err, ParParErr)
}

# Count inheritance errors over all markers for an individual and the parents
# stacked below it.
#
# GenotypesIndParPar: data frame whose first row is the individual and whose
# following rows are its parent(s), as produced by FindParGT().
# Always returns a vector of length 3, c(Par1Err, Par2Err, ParParErr):
#   1 row  - no parent genotype available: c("-", "-", "-");
#   2 rows - one parent: the error count is placed in the slot of the parent
#            that row 2 matches (slot 1 when it matches neither), "-" elsewhere;
#   3 rows - numeric error totals for parent 1, parent 2 and the pair;
#   other  - a message in slot 1 and "-" elsewhere. (The previous version
#            prepended all Ind names, which made the vector longer than three
#            and pushed the message out of the result table.)
CheckPar <- function(GenotypesIndParPar) {
  NumRows <- nrow(GenotypesIndParPar)
  Markers <- GenotypesIndParPar[MarkerCols(GenotypesIndParPar)]

  if (NumRows == 1) {
    return(c("-", "-", "-"))
  }
  if (NumRows == 2) {
    ParErrors <- sum(vapply(Markers, CheckParErr, numeric(1)))
    if (GenotypesIndParPar$Parent1[1] == GenotypesIndParPar$Ind[2]) {
      return(c(ParErrors, "-", "-"))
    } else if (GenotypesIndParPar$Parent2[1] == GenotypesIndParPar$Ind[2]) {
      return(c("-", ParErrors, "-"))
    }
    return(c(ParErrors, "-", "-"))
  }
  if (NumRows == 3) {
    # vapply gives a 3 x markers matrix; rows are Par1Err, Par2Err, ParParErr.
    return(rowSums(vapply(Markers, CheckParParErr, numeric(3))))
  }
  c("More than 1 genotype per parent found", "-", "-")
}

# Stack the genotype row(s) of an individual with those of its recorded
# parents (Parent1 first, then Parent2). Parents without a row in Genotypes
# contribute nothing.
FindParGT <- function(IndName, Genotypes) {
  IndRows <- Genotypes[Genotypes$Ind == IndName, ]
  rbind(
    IndRows,
    Genotypes[Genotypes$Ind == IndRows$Parent1, ],
    Genotypes[Genotypes$Ind == IndRows$Parent2, ]
  )
}

# Check the recorded parentage of every individual.
#
# IndToCheck: genotype data frame (see layout at the top of the file).
# ChrReport:  reserved for a per-chromosome report that was never implemented.
#             A non-NULL value used to fail with "object 'Results' not found";
#             it now raises a warning and the genome-wide check is run.
# Returns a character matrix with one row per individual and the columns
# Ind, Par1, Par2, Par1Err, Par2Err, ParParErr.
CheckParAll <- function(IndToCheck, ChrReport = NULL) {
  IndToCheck <- AdjustMV(IndToCheck)
  if (!is.null(ChrReport)) {
    warning(
      "ChrReport is not implemented; running the genome-wide check instead.",
      call. = FALSE
    )
  }

  NumInd <- nrow(IndToCheck)
  Results <- matrix(NA_character_, nrow = NumInd, ncol = 3)
  for (i in seq_len(NumInd)) {
    progress(i, max.value = NumInd)
    Results[i, ] <- CheckPar(FindParGT(IndToCheck$Ind[i], IndToCheck))
  }

  rownames(Results) <- IndToCheck$Ind
  Results <- cbind(
    IndToCheck$Ind, IndToCheck$Parent1, IndToCheck$Parent2, Results
  )
  colnames(Results) <- c(
    "Ind", "Par1", "Par2", "Par1Err", "Par2Err", "ParParErr"
  )
  Results
}

# Find every individual that could be a parent of IndName.
#
# Candidates are all rows of GTData except IndName itself, its recorded
# offspring and its recorded parents. A candidate is kept when its
# parent-child error count is <= treshold.
#
# IndName:  name of the individual to find parents for.
# GTData:   genotype data frame, already passed through AdjustMV().
# treshold: maximum number of parent-child errors allowed.
# Returns a data frame with columns Ind and PosPar (zero rows when nothing
# qualifies).
FindPosPar <- function(IndName, GTData, treshold) {
  IndGT <- subset(GTData, GTData$Ind == IndName)
  GTData <- subset(GTData, GTData$Parent1 != IndName)
  GTData <- subset(GTData, GTData$Parent2 != IndName)
  GTData <- subset(GTData, GTData$Ind != IndName)
  if (IndGT$Parent1 != "-") {
    GTData <- subset(GTData, GTData$Ind != IndGT$Parent1)
  }
  if (IndGT$Parent2 != "-") {
    GTData <- subset(GTData, GTData$Ind != IndGT$Parent2)
  }

  Keep <- logical(nrow(GTData))
  for (i in seq_len(nrow(GTData))) {
    ParErr <- CheckPar(rbind(IndGT, GTData[i, ]))
    Keep[i] <- WithinThreshold(ParErr[1], treshold)
  }

  Hits <- GTData$Ind[Keep]
  if (length(Hits) == 0) {
    return(data.frame(
      Ind = character(0), PosPar = character(0),
      stringsAsFactors = FALSE
    ))
  }
  data.frame(
    Ind = rep(IndName, length(Hits)), PosPar = Hits,
    stringsAsFactors = FALSE
  )
}

# Search possible parents and possible crosses for every individual with at
# least one unknown ("-") parent.
#
# Step 1: Parent1 unknown, Parent2 known - each candidate is tested together
#         with the known Parent2.
# Step 2: Parent2 unknown, Parent1 known - each candidate is tested together
#         with the known Parent1.
# Step 3: both parents unknown - every pair of candidates (including a
#         candidate paired with itself) is tested.
# A cross is reported when its trio error count is <= tresholdPPE. When the
# trio cannot be scored (for example the known parent has no genotype row, so
# CheckPar() returns "-"), the cross is skipped; previously this stopped the
# run with "missing value where TRUE/FALSE needed".
#
# GTData:      genotype data frame (see layout at the top of the file).
# tresholdPE:  maximum parent-child errors for a candidate parent.
# tresholdPPE: maximum parent-parent-child errors for a candidate cross.
# Returns list(PossibleParents = data frame Ind/PosPar,
#              PossibleCrosses = data frame Ind/PosCross).
FindPosParComb <- function(GTData, tresholdPE = 0, tresholdPPE = 0) {
  GTData <- AdjustMV(GTData)

  Par1Miss <- subset(GTData, GTData$Parent1 == "-" & GTData$Parent2 != "-")
  Par2Miss <- subset(GTData, GTData$Parent2 == "-" & GTData$Parent1 != "-")
  BothParMiss <- subset(GTData, GTData$Parent1 == "-" & GTData$Parent2 == "-")

  # Collected pieces; bound together once at the end.
  ParentRows <- list(data.frame(
    Ind = character(0), PosPar = character(0),
    stringsAsFactors = FALSE
  ))
  CrossRows <- list(data.frame(
    Ind = character(0), PosCross = character(0),
    stringsAsFactors = FALSE
  ))

  # Genotype row(s) of one named individual.
  RowsOf <- function(Name) {
    GTData[GTData$Ind == Name, ]
  }
  # TRUE when the trio (individual, second row, third row) is within
  # tresholdPPE.
  TrioPasses <- function(IndRow, SecondName, ThirdName) {
    Trio <- rbind(IndRow, RowsOf(SecondName), RowsOf(ThirdName))
    WithinThreshold(CheckPar(Trio)[3], tresholdPPE)
  }
  # One-row result: individual and "parent1 x parent2" label.
  CrossRow <- function(IndName, Par1Name, Par2Name) {
    data.frame(
      Ind = as.character(IndName),
      PosCross = paste0(Par1Name, " x ", Par2Name),
      stringsAsFactors = FALSE
    )
  }

  print("Starting step 1 of 3")
  for (i in seq_len(nrow(Par1Miss))) {
    progress(i, max.value = nrow(Par1Miss))
    Found <- FindPosPar(Par1Miss$Ind[i], GTData, tresholdPE)
    ParentRows[[length(ParentRows) + 1]] <- Found
    for (Cand in Found$PosPar) {
      if (TrioPasses(Par1Miss[i, ], Cand, Par1Miss$Parent2[i])) {
        CrossRows[[length(CrossRows) + 1]] <-
          CrossRow(Par1Miss$Ind[i], Cand, Par1Miss$Parent2[i])
      }
    }
  }

  print("Starting step 2 of 3")
  for (i in seq_len(nrow(Par2Miss))) {
    progress(i, max.value = nrow(Par2Miss))
    Found <- FindPosPar(Par2Miss$Ind[i], GTData, tresholdPE)
    ParentRows[[length(ParentRows) + 1]] <- Found
    for (Cand in Found$PosPar) {
      if (TrioPasses(Par2Miss[i, ], Cand, Par2Miss$Parent1[i])) {
        CrossRows[[length(CrossRows) + 1]] <-
          CrossRow(Par2Miss$Ind[i], Par2Miss$Parent1[i], Cand)
      }
    }
  }

  print("Starting step 3 of 3")
  for (i in seq_len(nrow(BothParMiss))) {
    progress(i, max.value = nrow(BothParMiss))
    Found <- FindPosPar(BothParMiss$Ind[i], GTData, tresholdPE)
    ParentRows[[length(ParentRows) + 1]] <- Found
    Cands <- Found$PosPar
    for (j in seq_along(Cands)) {
      # k starts at j so each unordered pair is tested once, selfing included.
      for (k in j:length(Cands)) {
        if (TrioPasses(BothParMiss[i, ], Cands[j], Cands[k])) {
          CrossRows[[length(CrossRows) + 1]] <-
            CrossRow(BothParMiss$Ind[i], Cands[j], Cands[k])
        }
      }
    }
  }

  list(
    PossibleParents = do.call(rbind, ParentRows),
    PossibleCrosses = do.call(rbind, CrossRows)
  )
}

## Load data files ----
GenotypeData <- read.csv(
  file = "6+9kv5_R.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE
)
### Mapdata <- read.csv(file = "SNPdata.csv", header = TRUE, sep = ",",
###                     stringsAsFactors = FALSE)

## Define Alleles and Missing Values ----
AlleleList <- c("A", "B", "C", "null", "G", "T")
MissGT <- c("NC", "00", "--")
MissAllele <- c("N", "0", "-")

## Check Parentages ----
ParentCheckResults <- CheckParAll(GenotypeData)
write.csv(
  ParentCheckResults,
  file = "Parent Check Results2.csv", row.names = FALSE
)

## FindPosParComb ----
resultsParentSearch <- FindPosParComb(GenotypeData, 25, 50)