#### Allele-capture simulation.
####
#### Input file: tab-delimited raw genotype data; the first row holds the
#### locus names, the first column holds the individual id, and every locus
#### occupies two adjacent columns (one per allele). Allele "0" is skipped
#### when allele frequencies are tabulated.
####
#### Three analyses are run; each writes a results table and draws a plot:
####   1. all alleles                    -> results_allloci_allelefrq.txt
####   2. individuals carrying alleles with frequency < 0.02 removed
####                                     -> results_nolowfreq_alleles.txt
####   3. as 2, but the individuals listed in sampwprivall.txt are kept
####                          -> results_nolowfreqnoprivate_alleles.txt

#### define number of simulated samples and iterations
no.sim.samples <- 15
n.its <- 1000

#########
# Helpers ----
#########

# Column index of the first allele column of every locus: 1, 3, 5, ...
locus_start_columns <- function(n_columns) {
  stopifnot(n_columns >= 2, n_columns %% 2 == 0)
  seq(1, n_columns - 1, by = 2)
}

# Tabulate the alleles of every locus in `genotypes` (a data frame with two
# columns per locus and no id column).
# Returns one row per locus/allele combination with the columns
#   Number    - index of the locus' first column in `genotypes`
#   Locus     - name of that column
#   allele    - allele label (factor)
#   count     - number of copies observed
#   frequency - count divided by the number of copies counted at that locus
allele_frequency_table <- function(genotypes) {
  locus_names <- colnames(genotypes)
  per_locus <- lapply(locus_start_columns(ncol(genotypes)), function(x) {
    alleles <- c(genotypes[, x], genotypes[, x + 1])
    counts <- as.data.frame(table(alleles))
    # Allele "0" is left out of both the counts and the frequencies.
    counts <- counts[as.character(counts[, 1]) != "0", ]
    data.frame(
      Number = x,
      Locus = locus_names[x],
      allele = counts[, 1],
      count = counts[, 2],
      frequency = counts[, 2] / sum(counts[, 2])
    )
  })
  do.call(rbind, per_locus)
}

# Remove every individual (row of `genotypes`) that carries one of the
# alleles listed in `rare` (rows of an allele frequency table) at the
# corresponding locus. Rows with a missing genotype at such a locus are
# removed as well, because `which()` drops NA comparisons.
drop_rare_allele_carriers <- function(genotypes, rare) {
  # seq_len() so that an empty `rare` table leaves `genotypes` untouched.
  for (i in seq_len(nrow(rare))) {
    first_col <- rare$Number[i]
    allele <- as.character(rare$allele[i])
    genotypes <- genotypes[which(genotypes[, first_col] != allele), ]
    genotypes <- genotypes[which(genotypes[, first_col + 1] != allele), ]
  }
  genotypes
}

# Simulate samples of 1..n_samples individuals by drawing two alleles per
# locus and individual from the frequencies in `freq_table`.
# Returns an n_iterations x n_samples matrix; cell [i, d] is the proportion
# of all alleles in `freq_table` that iteration i captured with d
# individuals. The sample size is printed as a progress indicator.
simulate_allele_capture <- function(freq_table, n_loci, n_samples,
                                    n_iterations) {
  total_alleles <- nrow(freq_table)
  # The allele pool of a locus never changes, so look it up once instead of
  # re-filtering the frequency table inside the innermost loop.
  pools <- lapply(seq_len(n_loci), function(w) {
    rows <- which(freq_table$Number == 2 * w - 1)
    list(
      alleles = as.character(freq_table$allele[rows]),
      prob = freq_table$frequency[rows]
    )
  })
  captured <- matrix(0, n_iterations, n_samples)
  for (d in seq_len(n_samples)) {
    print(d)
    for (i in seq_len(n_iterations)) {
      n_seen <- 0
      # Every locus is counted, however many loci the input file holds.
      for (pool in pools) {
        n_pool <- length(pool$alleles)
        # Two separate draws of d alleles (first, then second allele
        # column) so the order of random draws stays the same as before.
        first <- pool$alleles[
          sample.int(n_pool, d, replace = TRUE, prob = pool$prob)
        ]
        second <- pool$alleles[
          sample.int(n_pool, d, replace = TRUE, prob = pool$prob)
        ]
        n_seen <- n_seen + length(unique(c(first, second)))
      }
      captured[i, d] <- n_seen / total_alleles
    }
  }
  captured
}

# Run the simulation for `freq_table`, write the results table to `out_file`
# and plot, per sample size, the proportion of iterations that captured
# every allele. Returns a list with
#   prop    - row 1: sample size, row 2: proportion of complete captures
#   collect - per-iteration proportion of alleles captured
#   results - `prop` stacked on top of `collect` (what is written to file)
run_capture_analysis <- function(freq_table, n_loci, n_samples, n_iterations,
                                 out_file) {
  captured <- simulate_allele_capture(
    freq_table, n_loci, n_samples, n_iterations
  )
  collect <- data.frame(captured)
  names(collect) <- seq_len(n_samples)

  prop <- data.frame(matrix(0, 2, n_samples))
  prop[1, ] <- seq_len(n_samples)
  # A proportion is exactly 1 only when the captured count equals the total,
  # so the exact comparison is safe here.
  prop[2, ] <- colSums(captured == 1) / n_iterations
  names(prop) <- seq_len(n_samples)

  results <- rbind(prop, collect)
  write.table(results, out_file)
  plot(
    as.numeric(prop[2, ]) ~ as.numeric(prop[1, ]),
    xlab = "no. of individuals subsampled",
    ylab = "proportion of data sets capturing total no. of alleles"
  )
  abline(h = 0.05, lty = 2)
  list(prop = prop, collect = collect, results = results)
}

#########
# All alleles ----
#########
datasim <- read.table("eg_input.txt", header = TRUE, sep = "\t")
# One id column followed by two columns per locus.
stopifnot(ncol(datasim) >= 3, (ncol(datasim) - 1) %% 2 == 0)
reboot <- datasim
no.samples <- dim(datasim)[1]
noloci <- (dim(datasim)[2] - 1) / 2

population <- datasim[, -1]
OUT <- allele_frequency_table(population)
allalleles <- nrow(OUT)

analysis <- run_capture_analysis(
  OUT, noloci, no.sim.samples, n.its, "results_allloci_allelefrq.txt"
)
prop <- analysis$prop
collect <- analysis$collect
results_allloci <- analysis$results

###############
# remove freq <0.02 ----
##########
population <- datasim[, -1]
OUT <- allele_frequency_table(population)
OUT2 <- OUT[which(OUT$frequency < 0.02), ]
population <- drop_rare_allele_carriers(population, OUT2)
datasimbb <- population

# Frequencies are recomputed once on the remaining individuals.
OUT3 <- allele_frequency_table(datasimbb)
allalleles <- nrow(OUT3)

analysis <- run_capture_analysis(
  OUT3, noloci, no.sim.samples, n.its, "results_nolowfreq_alleles.txt"
)
prop <- analysis$prop
collect <- analysis$collect
results_allloci <- analysis$results

###################################
## leave samples with private alleles in ----
#################
datasim <- reboot
priv_all <- read.table("sampwprivall.txt", header = TRUE, sep = "\t")
priv_all <- as.data.frame(priv_all[, 1])

# Split the individuals into those listed in the first column of
# sampwprivall.txt (`excluded`: set aside, exempt from the rare-allele
# filter) and all others (`population`). %in% also copes with an empty list.
is_listed <- as.character(datasim[, 1]) %in% as.character(priv_all[, 1])
population <- datasim[!is_listed, ]
dim(population)
excluded <- datasim[is_listed, ]

########
# check for errors: the two groups must add up to the full data set (0)
########
dim(datasim)[1] - dim(population)[1] - dim(excluded)[1]

###############
# remove freq <0.02 (unlisted individuals only)
##########
population <- population[, -1]
OUT <- allele_frequency_table(population)
OUT2 <- OUT[which(OUT$frequency < 0.02), ]
population <- drop_rare_allele_carriers(population, OUT2)
dim(population)

# Put the listed individuals back before recomputing the frequencies.
population <- rbind(population, excluded[, -1])
datasimbb <- population
dim(population)

OUT3 <- allele_frequency_table(datasimbb)
allalleles <- nrow(OUT3)

analysis <- run_capture_analysis(
  OUT3, noloci, no.sim.samples, n.its,
  "results_nolowfreqnoprivate_alleles.txt"
)
prop <- analysis$prop
collect <- analysis$collect
results_allloci <- analysis$results