## R script that implements the leave-one-out procedure for ProvenancePredictor ## (c) L. GENTZBITTEL, C. BEN and T.V. TATARINOVA 2015 ## requested libraries : ## library(Cairo) library(rworldmap) library(rworldxtra) # Mapping global data, vector and raster # cleaning work space rm(list = ls()) graphics.off() ## declaration of the function. ## entry : name of outfile, N_best, file of unknow samples, folder with GEN and GEO files N_best <- 7 # working directory : Dir <- "/your/working/directory/" #################### Reported geo data for samples #################### GEOALL <- read.table(paste(Dir,"/data/your_data.csv", sep = ''), sep = ";", header = TRUE, row.names = 1) colnames(GEOALL) <- c("Pop", "Country", "Lat", "Lon", "Elev") str(GEOALL) dim(GEOALL) # keep accessions with known geo coordinates GEO <- GEOALL[!is.na(GEOALL$Lat), c('Lon', 'Lat')] dim(GEO) # How much ID are of unknow geo origin ? GEOALL[is.na(GEOALL$Lat), ] #################### Admixture components #################### # The data are from supervised admixture analysis (recommendend). The ADMIXTURE Q file is used. # ideally the know samples are listed before the unknown samples. ADMIXTURES0 <- as.matrix(read.table(paste(Dir,"/data/admixturePatterns/Your_File.Q",sep=""))) nk <- xxx # number of samples with reported geo location ADMIXTURES <- ADMIXTURES0[1:nk, ] # names of samples rownames(ADMIXTURES) <- rownames(GEOALL) ######### clean 'small proportions' PlayWithRows <- function(x){ # x is 'a row' of the object to which the function is applied y <- NULL # initialize an object for hodling the results y <- ifelse(x< 0.00001,0, ifelse(x> 0.99999,1, x)) # first component of Y is 3 * first component of x # y <- ifelse(x> 0.99999,1, x) return(y) } ADMIXTURES2 <- t(apply(ADMIXTURES, 1, PlayWithRows)) GEN <- ADMIXTURES2[!is.na(GEOALL$Lat),] dim(GEN) #################### Look for a linear relationship between GEO et GEN distances #################### y=dist(GEO) ## NOT Harversine. based lat/lon expressed in degrees. CorrespondS to A "naive Euclidean" distance may be computed between two points by # simply applying the Euclidean distance formula to the longitude-latitude # coordinates, and then multiplying by (Rpi/180) to convert to kilometers. R <- 6371 # Earth mean radius [km] ykm <- y*R*pi/180 x=dist(GEN) ## euclidian distance xOrig <- as.matrix(x) #################### Correlation between GEN & GEO matrices ? #################### library(ade4) (RoughCorrels <- mantel.rtest(x, y, nrepet = 9999)) # might be usefull to debug with nrepet = 30 to reduce computation time. LL <- length(y) ## number of two-by-two comparisons ## filtering data based on above graph seuilGeo <- 8.25 seuilGen <- 1.2 for (l in 1:LL) { if ( y[l] >= seuilGeo || x[l] >= seuilGen ) {y[l] = 0; x[l] = 0;} } #################### Correlation between dGEN & dGEO matrices, after filtering on geo and gen distances #################### (FilteredCorrels <- mantel.rtest(x, y, nrepet = 9999)) # might be usefull to debug with nrepet = 30 to reduce computation time. eq1<-lm(y~x); # regressing geo on gen summary(eq1) # will be used to predict geo pos from genome admixtrue pattern #################### ProvenancePredictor Leave-one-Out outfile_name <- paste(Dir,"/output/Your_OutputFile.csv", sep = '') write("ID\tObsLon\tObsLat\tCentroidLon\tCentroidLatLat\tPredLon\tPredLat\tdistance(km)\tidentical accessions\tclosest accessions\tK8.1\tK8.2\tK8.3\tK8.4\tK8.5\tK8.6\tK8.7\tK8.8", outfile_name, append=FALSE) # need to adjust depending on the number of admixture components used. N_best <- min(N_best, length(GEO[, 1])) # number of closest samples for an unknown accession ######################## A loop for each accession for ( retire in 1:length(rownames(GEO)) ){ GROUP <- rownames(GEO)[retire] num.ligne <- retire print(num.ligne) Y <- as.data.frame(t(GEN[ rownames(GEN) == GROUP, ])) ########### weird syntax ! K <- length(Y[, 1]) # How much individuals per pop for(a in 1: K) # for each sample { X <- Y[a, 1:8] # admixture vector for that sample E <- rep(0, length(GEO[, 1]) ) ; # a zero vector, on length 'pop number' for(g in 1: length(GEO[,1])){ ethnic <- attributes(GEO[g, ])$row.names; # reference population gene <- as.numeric(GEN[ethnic, 1:8]) # admixture proportions E[g] <- sqrt(sum((gene - X)^2)) # distacne from sample to ref. populations } # identical and closest lines (with ties) minEb <- NULL; minEb <- rank(E,ties.method="min") <= (sum(rank(E, ties.method="min") == 1) + N_best) minEb[num.ligne] <- FALSE # LEAVE ONE OUT ! print(paste("# closest :", sum(minEb))) # number of accessions at 0 AND 'N_best closest' (excluding the one) minEident <- rank(E,ties.method="min")==1 # TIES=0 minEident[num.ligne] <- FALSE print(paste("# identical :", sum(minEident))) # number of accessions at 0 (excluding the one) # distances from "leaved accession" to other selected minEg <- E[minEb] # sort(minEg) # to list # line numbers of 'closest selected' including 'identical', in increasing order of genetic distance minGb <- which(minEb == TRUE)[order(minEg)] # needed for centroid # line numbers of 'identical' accessions' minGident <- which(minEident == TRUE) # line numbers of 'closest BUT not identical' minGclose <- minGb[!minGb %in% minGident] #### centroid of ties or centroid of closets, including ties if (length(minGident)>0) { centroid <- colMeans(GEO[minGident,c('Lon','Lat')],na.rm=TRUE)} else {centroid <- colMeans(GEO[minGclose[rank(minEg[minGclose], ties.method="min") == 1], c('Lon','Lat')])} # distances, from "leaved accession" to other selected, in increasing order radius <- sort(minEg) best.ethnic<- attributes(GEO[minGb,])$row.names; #best_ethnic; # names of N_best closests best.ethnic identical.ethnic <- attributes(GEO[minEident,])$row.names; identical.ethnic radius_geo <- (eq1[[1]][2] * radius[1]) W <- ((radius[1] + 1e-9)/(radius + 1e-9))^(1/4) W=W/(sum(W)); delta_lon <- GEO[minGb, ][[1]] - centroid[1] # vectors of differences between BEST/centroid and N_best delta_lat <- GEO[minGb, ][[2]] - centroid[2] # vectors of differences between BEST/centroid and N_best new_lon <- sum(W*delta_lon) # weigthing new_lat <- sum(W*delta_lat) # weighting lo1<-new_lon*min(1,radius_geo/sqrt(new_lon^2+new_lat^2) ) la1<-new_lat*min(1,radius_geo/sqrt(new_lon^2+new_lat^2)) best <- NULL best[1] <- centroid[1] + lo1 best[2] <- centroid[2] + la1 best <- unlist(best) distancekm <- sqrt((best[1]- GEO[num.ligne,1])^2 + (best[2] - GEO[num.ligne,2])^2) * R * pi/180 write(paste(GROUP, GEO[num.ligne,1], GEO[num.ligne,2] , centroid[1], centroid[2], best[1], best[2], distancekm, paste(identical.ethnic,sep='',collapse="/"), paste(best.ethnic,sep='',collapse="/"), X[1], X[2], X[3], X[4], X[5], X[6], X[7], X[8], sep="\t"), outfile_name, append=TRUE) } } print("ProvenancePredictor leave-one-out is done!") #################### ecdf of predicted vs observed isualisations des GPS leave-one Out Dir <- "/your/working/directory/" Leave2 <- read.table(paste(Dir,"/output/Your_OutputFile.csv",sep=""),sep="\t", header=TRUE) colnames(Leave2) <- c("ID","LonOrig","LatOrig","LonCentroid","LatCentroid","LonPred","LatPred","distance","identical","closest","K1","K2","K3","K4","K5","K6","K7","K8") mean(Leave2$distance) median(Leave2$distance) toto <- ecdf(Leave2$distance) x11() plot(toto, xlab="distance (km)", ylab="proportion of samples", main="", verticals = TRUE, col.points = "blue", col.hor = "red", col.vert = "bisque") abline(h=0.5, v=214, col="red", lwd=1) ########################## Maps to compare observed and predicted locations. newmap <- getMap(resolution = "high") ######## Mediterranean basin x11() plot(newmap, ylim = c(28,45),xlim=c(-10,40),axes=TRUE, col="gray95") points(Leave2$LonOrig, Leave2$LatOrig, pch=21, bg='darkgreen' ) points(Leave2$LonPred, Leave2$LatPred, pch=21, bg='red' ) arrows(Leave2$LonOrig, Leave2$LatOrig, Leave2$LonPred, Leave2$LatPred, col = adjustcolor("darkblue",alpha.f = 0.7), code = 2, length = 0.1, angle = 20) ###################################################### detailed maps, illustrating computations ###################################################### CartographieDetaillee <- function(accession, sortie = "S", detail = 'N'){ print(paste("accession : ",accession)) miaou <- subset(Leave2, subset = ID %in% accession ) ## a list with closest samples, per accession decoupe <- function(x){ y <- as.character(miaou[miaou$ID == x,'closest']) copaing2 <- strsplit(y,"/") blurp <- list( copain2 = copaing2) return(blurp) } voisins <- lapply(miaou$ID,decoupe) names(voisins) <- miaou$ID #print(voisins[[1]][[1]][1]) #voisins[[2]][[1]][1] if (sortie == "S") {x11()} else { CairoPDF( file = paste(paste(Dir, "output/figsProvPredictor/Map-", sep = ''), accession,"_", round(miaou$distance,0), "km.pdf",sep=""), width = 25, height = 15, onefile = TRUE, family = "Helvetica", fonts = NULL, version = "1.1", pointsize=14,paper="A4") } plot(newmap,ylim=c(28,45),xlim=c(-8,37),axes=TRUE, col="gray95") # Mediterranean basin # reported location points(miaou$LonOrig, miaou$LatOrig, pch=21, bg='bisque', cex=2) text(miaou$LonOrig, miaou$LatOrig, miaou$ID, cex=1, pos=c(1,2,3,4)) # centroid : points(miaou$LonCentroid, miaou$LatCentroid, pch=22, bg='firebrick', cex=2) # predicted location points(miaou$LonPred, miaou$LatPred, pch=21, bg='red', cex=2.2 ) # from reported to predicted location arrows(miaou$LonOrig,miaou$LatOrig,miaou$LonPred,miaou$LatPred, col=adjustcolor("black",alpha.f=1 ),code=2,length=0.2,angle=20,lwd=2) # from centroid to predicted location arrows(miaou$LonCentroid, miaou$LatCentroid, miaou$LonPred, miaou$LatPred, col=adjustcolor("green",alpha.f=1 ), code=2, length=0.1, angle=10, lwd=2) ######## loop for samples, loop for closests traceVoisins <- function(x){ x2 <- unlist(voisins[[x]][[1]]); # weird ! lapply retuen a list # print(x2) traceAccessions <- function(z,background="yellow"){ # print(paste('Long:',Leave2[Leave2$ID==z,'LonOrig'],'Lat:',Leave2[Leave2$ID==z,'LatOrig'])) # 'closest' locations points(Leave2[Leave2$ID==z,'LonOrig'], Leave2[Leave2$ID==z,'LatOrig'], pch=21, bg=background) text(Leave2[Leave2$ID==z,'LonOrig'], Leave2[Leave2$ID==z,'LatOrig'],Leave2[Leave2$ID==z,'ID'],cex=0.8,pos=c(1,2,3,4),offset=0.8) # from 'closest' to centroid arrows(Leave2[Leave2$ID==z,'LonOrig'], Leave2[Leave2$ID==z,'LatOrig'], miaou[miaou$ID==names(voisins[x]),'LonCentroid'], miaou[miaou$ID==names(voisins[x]),'LatCentroid'], code=2, length=0.1, angle=20, lty=2) } if (detail=="Y") {sapply(x2,traceAccessions,background=rainbow(dim(miaou[1]))[x])} } lapply(seq_along(voisins), traceVoisins) # THIS is the loop on accessions as arguments if (sortie != "S" ) {dev.off()} } ############## How to use this function : lapply(Leave2$ID[1:2], CartographieDetaillee, detail = "Y") # uneone plot per accession CartographieDetaillee("HM191", detail = "Y") CartographieDetaillee(c("HM191", "HM064"), detail = "Y") CartographieDetaillee(Leave2$ID[1:4]) # by default : screen and NO details. 4 accessions on the same plot ############# BEWARE !! LENGTHY ! lapply(Leave2$ID, CartographieDetaillee, detail = "Y", sortie = "P") # one plot per accession for ALL accessions, output in pdf format