## R script that implents the ProvenancePredictor algorithm ## (c) L. GENTZBITTEL, C. Ben and T.V. TATARINOVA, 2015 ## requested data : ## A three columns file with sample name, Latitude and Longitude as columns (at least) ## the Q file of ADMIXTURE, with appropriate number of admixture components, in the same order as file with geo information library(Cairo) library(rworldmap) library(rworldxtra) # Mapping global data, vector and raster # clean working space rm(list = ls()) graphics.off() N_best <- 7 # number of 'closest' accessions to use, irrespective of the number of 'identical' accessions (that will be ties) # working directory : Dir <- "/your/working/directory" #################### Geographical data and other data for the full dataset (references and unknown) #################### # read data in csv format GEOALL <- read.table(paste(Dir, "/path_to_your/data.csv", sep = ""), sep = ";", header = TRUE, row.names = 1) colnames(GEOALL) <- c("Pop", "Country", "Lat", "Lon", "Elev") str(GEOALL) # keep accessions with known geo coordinates GEO <- GEOALL[!is.na(GEOALL$Lat), c('Lon', 'Lat')] dim(GEO) # How much ID are of unknow geo origin ? Inconnues <- GEOALL[is.na(GEOALL$Lat), ] #################### Ancestral genome proportions (admixture components) for data set #################### # The data are from supervised admixture analysis (recommendend). The ADMIXTURE Q file is used. # ideally the know samples are listed before the unknown samples. ADMIXTURES0 <- as.matrix(read.table(paste(Dir, "/data/yourQfile.Q", sep = ""))) # select the first nk lines that correspond to the admixture components of known samples nk <- xxx ADMIXTURES <- ADMIXTURES0[1:nk, ] # names of samples rownames(ADMIXTURES) <- rownames(GEOALL) # admixture components for unknown accessions Inconnues2 <- data.frame(ADMIXTURES[rownames(ADMIXTURES) %in% rownames(Inconnues), ], GROUP = Inconnues$Pop) # Which accessions are unknonw ? write.table(Inconnues2, paste(Dir, "/output/inconnues.csv", sep = ""), sep = "\t") GEN <- ADMIXTURES[!is.na(GEOALL$Lat), ] dim(GEN) #################### Look for a linear relationship between GEO et GEN distances y=dist(GEO) ## NOT Harversine. based lat/lon expressed in degrees. CorrespondS to A "naive Euclidean" distance may be computed between two points by # simply applying the Euclidean distance formula to the longitude-latitude # coordinates, and then multiplying by (Rpi/180) to convert to kilometers. R <- 6371 # Earth mean radius [km] ykm <- y*R*pi/180 x=dist(GEN) ## simple euclidean distance. #################### Correlation between GEN & GEO matrices ? #################### library(ade4) (RoughCorrels <- mantel.rtest(x, y, nrepet = 9999)) # might be usefull to debug with nrepet = 30 to reduce computation time. x11() par(mfrow = c(1,2),col = adjustcolor("blue", alpha.f = 0.2), pch = 19) plot(x ~ y, xlab = 'geo', ylab = 'gen') plot(x ~ ykm, xlab = 'geo km', ylab = 'gen') x11() par(col = adjustcolor("blue",alpha.f = 0.15),pch = 19) scatter.smooth( y, x, evaluation = 500, lpars = list(col = "red", lwd = 3, lty = 3), pch = 19, xlab = 'geographic distance (euclidean)', ylab = 'genetic distance') lines(x = c(8.25,8.25),y = c(0,1.4), lwd = 3, col = 'red') x11() par(col = adjustcolor("blue", alpha.f = 0.15), pch = 19) scatter.smooth( ykm, x, evaluation = 500, lpars = list(col = "red", lwd = 3, lty = 3), pch = 19, xlab = 'geographic distance (km)', ylab = 'genetic distance') lines(x = c(900,900), y = c(0,1.4), lwd = 3, col = 'red') LL <- length(y) ## number of two-by-two comparisons ## filtering data based on above graph seuilGeo <- 8.25 # in km : seuilGeo*R*pi/180 seuilGen <- 1.2 for (l in 1:LL) { if ( y[l] >= seuilGeo || x[l] >= seuilGen ) {y[l] = 0; x[l] = 0;} } #################### Correlation between dGEN & dGEO matrices, after filtering on geo and gen distances #################### (FilteredCorrels <- mantel.rtest(x, y, nrepet = 9999)) eq1 <- lm(y ~ x); # regressing geo on gen summary(eq1) # will be used to predict geo pos from genome admixtrue pattern ######################## declarations ## Unknown accessions to be ProvenancePredictor'ed UNKNOWN_DATA <- read.csv(paste(Dir, "/output/inconnues.csv", sep = ""), header = TRUE, row.names = 1, sep = "\t") # warning : GROUP required to be placed after admixture components GROUPS <- unique(UNKNOWN_DATA$GROUP) # GROUP_ID outfile_name <- paste(Dir, "/output/inconnuesPredites.csv", sep = '') write("Population\tSample_no\tSample_id\tCentroidLon\tCentroidLat\tPredLon\tPredLat\tidentical accessions\tclosest accessions", outfile_name, append=FALSE) N_best <-min(N_best, length(GEO[, 1])) # useful ? unlikely that less reference accessions than min number of required 'closest' accessions ######################## A loop over each unkown accession for(GROUP in GROUPS){ Y = UNKNOWN_DATA[UNKNOWN_DATA$GROUP == GROUP, ] K <- length(Y[,1]) # How much individuals per population ? for(a in 1: K) # for each individual/sample { X <- Y[a, 1:8] # vector of K=8 admixture components for the sample E <- rep(0, length(GEO[, 1]) ) ; # zero vector, of length 'number of reference accessions' for(g in 1: length(GEO[, 1])){ ethnic <- attributes(GEO[g,])$row.names; gene <- as.numeric(GEN[ethnic, 1:8]) E[g] <- sqrt(sum((gene - X)^2)) # E contains euclidean distances of sample to reference accessions } minEb <- NULL; minEb <- rank(E, ties.method = "min") <= (sum(rank(E, ties.method = "min") == 1) + N_best) # N_best AND TIES=0 # print(paste("# closest :",sum(minEb))) minEident <- rank(E,ties.method = "min") == 1 # print(paste("# identical :",sum(minEident))) # number of accessions at 0 TIES=0 # distances of unknow accession to other selected minEg <- E[minEb] # sort(minEg) # # line numbers of 'closest selected' including 'identical', in increasing order of genetic distance minGb <- which(minEb == TRUE)[order(minEg)] # line numbers of 'identical' accessions' minGident <- which(minEident == TRUE) # line numbers of 'closest BUT not identical' minGclose <- minGb[!minGb %in% minGident] #### centroid of identical accessions or centroid of 'closest' accessions if (length(minGident)>0) { centroid <- colMeans(GEO[minGident,c('Lon','Lat')],na.rm=TRUE)} else # {centroid <- colMeans(GEO[minGclose[rank(minEg[minGclose],ties.method="min")==1],c('Lon','Lat')])} ## use ties values if any radius <- sort(minEg) # why is it called radius ? # modif LG best_ethnic<- attributes(GEO[minEb,])$row.names; # names of N_best closests # modif LG best_ethnic identical.ethnic <- attributes(GEO[minEident,])$row.names; identical.ethnic radius_geo=(eq1[[1]][2]*radius[1]) W <- ((minEg[1]+1e-9)/(minEg+1e-9))^(1/4); # weights for closests accessions W=W/(sum(W)); # delta_lon <- GEO[minGb,][[1]]-centroid[1] # vectors for differences between BEST/centroid and N_best delta_lat<- GEO[minGb,][[2]]-centroid[2] # vectors for differences between BEST/centroid and N_best new_lon<-sum(W*delta_lon) # using weights new_lat<-sum(W*delta_lat) # using weights lo1<-new_lon*min(1,radius_geo/sqrt(new_lon^2+new_lat^2) ) # nice trick la1<-new_lat*min(1,radius_geo/sqrt(new_lon^2+new_lat^2)) # nice trick best <- NULL best[1] <- centroid[1]+lo1 best[2] <- centroid[2]+la1 write(paste(GROUP, a, row.names(Y[a,]), centroid[1], centroid[2], best[1], best[2], paste(identical.ethnic,sep='',collapse="/"),paste(best_ethnic,sep='',collapse="/"), sep="\t"),outfile_name,append=TRUE) } } # end of loop for GROUP print("ProvenancePredictor is done!"); ###################################################### additional code ###################################################### detailed map, illustrating computations ###################################################### newmap <- getMap(resolution = "high") Leave2 <- read.table(paste(Dir,"/output/inconnuesPredites.csv",sep=""),sep="\t", header=TRUE) colnames(Leave2) <- c("Population","SampleNo","ID","LonCentroid","LatCentroid","LonPred","LatPred","identical","closest") Connues <- read.table(paste(Dir,"/data/MTR_all.csv",sep=""),sep=";", header=TRUE) CartographieDetaillee <- function(accession, sortie="S", detail='N', LongW=-8, LongE=37, LatS=28, LatN=45){ # accession <- "HM101" # prototyping # print(paste("accession : ",accession)) miaou <- subset(Leave2, subset= ID %in% accession ) ## a list with closests, per accession decoupe <- function(x){ # z <- as.character(miaou[miaou$ID==x,'identical']) y <- as.character(miaou[miaou$ID==x,'closest']) # copaing1 <- strsplit(z,"/") copaing2 <- strsplit(y,"/") # blurp <- list(copain1=copaing1, copain2=copaing2) blurp <- list( copain2=copaing2) return(blurp) } voisins <- lapply(miaou$ID,decoupe) names(voisins) <- miaou$ID #print(voisins[[1]][[1]][1]) #voisins[[2]][[1]][1] # #if (sortie == "S") {x11(width=14, height=8)} else { # CairoPDF( # file = paste(paste(Dir,"output/figsGPS/Map-",sep=''), accession,".pdf",sep=""), # width = 14, height = 10, onefile = TRUE, family = "Helvetica", # fonts = NULL, version = "1.1", # pointsize=14,paper="A4") #} plot(newmap,ylim=c(LatS,LatN),xlim=c(LongW,LongE),axes=TRUE, col="gray95") # original localizations #points(miaou$LonOrig,miaou$LatOrig, pch=21,bg='bisque',cex=2) #text(miaou$LonOrig,miaou$LatOrig,miaou$ID,cex=1, pos=c(1,2,3,4)) # centroid : centroid of identical and/or closest entries including ties points(miaou$LonCentroid,miaou$LatCentroid, pch=22, bg='firebrick' , cex=2) # predicted location points(miaou$LonPred,miaou$LatPred, pch=21, bg='red',cex=2.2 ) # from known to predicted location #arrows(miaou$LonOrig,miaou$LatOrig,miaou$LonPred,miaou$LatPred, col=adjustcolor("black",alpha.f=1 ),code=2,length=0.2,angle=20,lwd=2) # from centroid to predicted location arrows(miaou$LonCentroid, miaou$LatCentroid, miaou$LonPred, miaou$LatPred, col=adjustcolor("green",alpha.f=1 ), code=2, length=0.1, angle=10, lwd=2) ######## loop over accessions, then loop over ' closest', traceVoisins <- function(x){ x2 <- unlist(voisins[[x]][[1]]); # obligé car lapply renvoie une liste # print(x2) traceAccessions <- function(z,background="yellow"){ # print(paste('Long:',Leave2[Leave2$ID==z,'LonOrig'],'Lat:',Leave2[Leave2$ID==z,'LatOrig'])) # Locations of closest entries points(Connues[Connues$ID == z,'Longitude'], Connues[Connues$ID == z,'Latitude'], pch=21, bg="cyan",cex=1.5) text(Connues[Connues$ID == z,'Longitude'], Connues[Connues$ID == z,'Latitude'],Connues[Connues$ID == z,'ID'], cex=0.8, pos=c(1,2,3,4), offset=1) # from 'closest' to centroid arrows(Connues[Connues$ID==z,'Longitude'], Connues[Connues$ID==z,'Latitude'], miaou[miaou$ID==names(voisins[x]),'LonCentroid'], miaou[miaou$ID==names(voisins[x]),'LatCentroid'], code=2, length=0.1, angle=20, lty=2) } if (detail=="Y") {sapply(x2,traceAccessions,background=rainbow(dim(miaou[1]))[x])} # loop over closest entries } lapply(seq_along(voisins),traceVoisins) # loop over the samples if (sortie != "S" ) {dev.off()} } ############## How to use this function : lapply(Leave2$ID[1:3], CartographieDetaillee, detail = "Y") # one plot per accession, with closests tracés CartographieDetaillee("HM101", detail = "Y") # Mediterranean Basin by default CartographieDetaillee("HM101", detail = "Y", LongW = -6, LongE = 5, LatS = 34, LatN = 43 ) # zoom CartographieDetaillee(c("HM111", "HM112"), detail = "Y") # two accessions on the same plot CartographieDetaillee(Leave2$ID[1:4]) # one plot for 4 accessions. By default : screen and NO details