## R script that implents the ProvenancePredictor algorithm
## (c) L. GENTZBITTEL, C. Ben and T.V. TATARINOVA, 2015


## requested data :
## A three columns file with sample name, Latitude and Longitude as columns (at least)
## the Q file of ADMIXTURE, with appropriate number of admixture components, in the same order as file with geo information


library(Cairo)
library(rworldmap)
library(rworldxtra) # Mapping global data, vector and raster

# clean working space
rm(list = ls())
graphics.off()


N_best <- 7  # number of 'closest' accessions to use, irrespective of the number of 'identical' accessions (that will be ties)  


# working directory :
Dir <- "/your/working/directory"


#################### Geographical data and other data for the full dataset (references and unknown)  ####################

# read data in csv format
GEOALL <- read.table(paste(Dir, "/path_to_your/data.csv", sep = ""), sep = ";", header = TRUE, row.names = 1)
colnames(GEOALL) <- c("Pop", "Country", "Lat", "Lon", "Elev")
str(GEOALL)


# keep accessions with known geo coordinates 
GEO <- GEOALL[!is.na(GEOALL$Lat), c('Lon', 'Lat')]
dim(GEO)


# How much ID are of unknow geo origin ?
Inconnues <- GEOALL[is.na(GEOALL$Lat), ]


#################### Ancestral genome proportions (admixture components) for data set ####################


# The data are from supervised admixture analysis (recommendend). The ADMIXTURE Q file is used.
# ideally the know samples are listed before the unknown samples.
ADMIXTURES0 <- as.matrix(read.table(paste(Dir, "/data/yourQfile.Q", sep = "")))

# select the first nk lines that correspond to the admixture components of known samples
nk <- xxx
ADMIXTURES <- ADMIXTURES0[1:nk, ]

# names of samples
rownames(ADMIXTURES) <- rownames(GEOALL)


# admixture components for unknown accessions
Inconnues2 <- data.frame(ADMIXTURES[rownames(ADMIXTURES) %in% rownames(Inconnues), ], GROUP = Inconnues$Pop)
# Which accessions are unknonw ?
write.table(Inconnues2, paste(Dir, "/output/inconnues.csv", sep = ""), sep = "\t")


GEN <- ADMIXTURES[!is.na(GEOALL$Lat), ]
dim(GEN)


####################   Look for a linear relationship between GEO et GEN distances 


    y=dist(GEO)  ## NOT Harversine.  based  lat/lon expressed in degrees.   CorrespondS to  A "naive Euclidean" distance may be computed between two points by
                                                                            # simply applying the Euclidean distance formula to the longitude-latitude
                                                                            # coordinates, and then multiplying by (Rpi/180) to convert to kilometers.
    R <- 6371 # Earth mean radius [km]
    ykm <- y*R*pi/180


    x=dist(GEN)  ## simple euclidean distance.


#################### Correlation between GEN & GEO matrices ?  ####################

library(ade4)


(RoughCorrels <- mantel.rtest(x, y, nrepet = 9999)) # might be usefull to debug with nrepet = 30 to reduce computation time.


x11()
par(mfrow = c(1,2),col = adjustcolor("blue", alpha.f = 0.2), pch = 19)
plot(x ~ y, xlab = 'geo', ylab = 'gen')
plot(x ~ ykm, xlab = 'geo km', ylab = 'gen')

x11()
par(col = adjustcolor("blue",alpha.f = 0.15),pch = 19)
scatter.smooth( y, x, evaluation = 500, lpars = list(col = "red", lwd = 3, lty = 3), pch = 19, xlab = 'geographic distance (euclidean)', ylab = 'genetic distance')
lines(x = c(8.25,8.25),y = c(0,1.4), lwd = 3, col = 'red')

x11()
par(col = adjustcolor("blue", alpha.f = 0.15), pch = 19)
scatter.smooth( ykm, x, evaluation = 500, lpars = list(col = "red", lwd = 3, lty = 3), pch = 19, xlab = 'geographic distance (km)', ylab = 'genetic distance')
lines(x = c(900,900), y = c(0,1.4), lwd = 3, col = 'red')


    LL <- length(y) ## number of two-by-two comparisons 

    ## filtering data based on above graph
    
    seuilGeo <- 8.25
    # in km : seuilGeo*R*pi/180
    seuilGen <- 1.2                                                        

    for (l in 1:LL) {
        if ( y[l] >= seuilGeo || x[l] >= seuilGen ) {y[l] = 0; x[l] = 0;}
    }


#################### Correlation between dGEN & dGEO matrices, after filtering on geo and gen distances  ####################


(FilteredCorrels <- mantel.rtest(x, y, nrepet = 9999))


eq1 <- lm(y ~ x);   # regressing geo on gen
summary(eq1)    # will be used to predict geo pos from genome admixtrue pattern


######################## declarations 

## Unknown accessions to be ProvenancePredictor'ed  

UNKNOWN_DATA <- read.csv(paste(Dir, "/output/inconnues.csv", sep = ""), header = TRUE, row.names = 1, sep = "\t")
# warning : GROUP required to be placed after admixture components

GROUPS <- unique(UNKNOWN_DATA$GROUP)  # GROUP_ID


outfile_name <- paste(Dir, "/output/inconnuesPredites.csv", sep = '')

    write("Population\tSample_no\tSample_id\tCentroidLon\tCentroidLat\tPredLon\tPredLat\tidentical accessions\tclosest accessions", outfile_name,  append=FALSE)

    N_best <-min(N_best, length(GEO[, 1]))  # useful ? unlikely that less reference accessions than min number of required 'closest' accessions


######################## A loop over each unkown accession

    for(GROUP in GROUPS){
        
        Y = UNKNOWN_DATA[UNKNOWN_DATA$GROUP == GROUP, ]

         K <- length(Y[,1])  # How much individuals per population ?

         for(a in 1: K)      # for each individual/sample
            {               
                
                X <- Y[a, 1:8]                                     # vector of K=8 admixture components for the sample 
                E <- rep(0, length(GEO[, 1]) ) ;                   # zero vector, of length 'number of reference accessions'
               
                for(g in 1: length(GEO[, 1])){
                    ethnic <- attributes(GEO[g,])$row.names;     
                    gene <- as.numeric(GEN[ethnic, 1:8])           
                    E[g] <- sqrt(sum((gene - X)^2))                # E contains euclidean distances of sample to reference accessions
                }
                
                              
                minEb <- NULL; minEb <- rank(E, ties.method = "min") <= (sum(rank(E, ties.method = "min") == 1) + N_best)         # N_best AND TIES=0
#                print(paste("# closest :",sum(minEb)))
                
                minEident <- rank(E,ties.method = "min") == 1
#                print(paste("# identical :",sum(minEident)))                                                              # number of accessions at 0 TIES=0

                # distances of unknow accession to other selected
                minEg <- E[minEb]                                                                                         # 
                sort(minEg)                                                                                               # 
                 

                # line numbers of 'closest selected' including 'identical', in increasing order of genetic distance
                minGb <- which(minEb == TRUE)[order(minEg)]                                         
                
                # line numbers of 'identical' accessions'
                minGident <- which(minEident == TRUE)

                # line numbers of 'closest BUT not identical'
                minGclose <- minGb[!minGb %in% minGident]

                
                #### centroid of identical accessions or centroid of 'closest' accessions
                if (length(minGident)>0) {
                    centroid <- colMeans(GEO[minGident,c('Lon','Lat')],na.rm=TRUE)} else                                     # 
                {centroid <- colMeans(GEO[minGclose[rank(minEg[minGclose],ties.method="min")==1],c('Lon','Lat')])}           ## use ties values if any 

  
                radius <- sort(minEg)                                                # why is it called radius ? # modif LG
                
                best_ethnic<- attributes(GEO[minEb,])$row.names;                     # names of N_best closests  # modif LG
                best_ethnic

                identical.ethnic <- attributes(GEO[minEident,])$row.names;
                identical.ethnic
                
                radius_geo=(eq1[[1]][2]*radius[1])                                  

                
                W <- ((minEg[1]+1e-9)/(minEg+1e-9))^(1/4);                           # weights for closests accessions

                 W=W/(sum(W));                                                       # 
                

                delta_lon <- GEO[minGb,][[1]]-centroid[1]                            # vectors for differences between BEST/centroid and N_best               
                delta_lat<- GEO[minGb,][[2]]-centroid[2]                             # vectors for differences between BEST/centroid and N_best         
                new_lon<-sum(W*delta_lon)                                           # using weights
                new_lat<-sum(W*delta_lat)                                           # using weights
                
                lo1<-new_lon*min(1,radius_geo/sqrt(new_lon^2+new_lat^2) )           # nice trick
                la1<-new_lat*min(1,radius_geo/sqrt(new_lon^2+new_lat^2))            # nice trick

                
                best <- NULL
                best[1] <- centroid[1]+lo1                                      
                best[2] <- centroid[2]+la1                                      

                
                 write(paste(GROUP, a, row.names(Y[a,]), centroid[1], centroid[2], best[1], best[2], paste(identical.ethnic,sep='',collapse="/"),paste(best_ethnic,sep='',collapse="/"), sep="\t"),outfile_name,append=TRUE)
            }

    }   # end of loop for GROUP


print("ProvenancePredictor is done!");


######################################################  additional code 
######################################################  detailed map, illustrating computations ######################################################


newmap <- getMap(resolution = "high")


Leave2 <- read.table(paste(Dir,"/output/inconnuesPredites.csv",sep=""),sep="\t", header=TRUE)
colnames(Leave2) <- c("Population","SampleNo","ID","LonCentroid","LatCentroid","LonPred","LatPred","identical","closest")

Connues <- read.table(paste(Dir,"/data/MTR_all.csv",sep=""),sep=";", header=TRUE)


CartographieDetaillee <- function(accession, sortie="S", detail='N', LongW=-8, LongE=37, LatS=28, LatN=45){


#    accession <- "HM101"   # prototyping

#    print(paste("accession : ",accession))
    miaou <- subset(Leave2, subset= ID %in% accession )
    
## a list with closests, per accession
decoupe <- function(x){

#    z <- as.character(miaou[miaou$ID==x,'identical'])
    y <- as.character(miaou[miaou$ID==x,'closest'])

#    copaing1 <- strsplit(z,"/")
    copaing2 <- strsplit(y,"/")
#    blurp <- list(copain1=copaing1, copain2=copaing2)
    blurp <- list( copain2=copaing2)
    return(blurp)
}

voisins <- lapply(miaou$ID,decoupe) 
names(voisins) <- miaou$ID

#print(voisins[[1]][[1]][1])
#voisins[[2]][[1]][1]


# 
#if (sortie == "S") {x11(width=14, height=8)} else {
#    CairoPDF(
#        file = paste(paste(Dir,"output/figsGPS/Map-",sep=''), accession,".pdf",sep=""),  
#         width = 14, height = 10, onefile = TRUE, family = "Helvetica",
#         fonts = NULL, version = "1.1",
#         pointsize=14,paper="A4")
#}

    
plot(newmap,ylim=c(LatS,LatN),xlim=c(LongW,LongE),axes=TRUE, col="gray95")

# original localizations
#points(miaou$LonOrig,miaou$LatOrig, pch=21,bg='bisque',cex=2)
#text(miaou$LonOrig,miaou$LatOrig,miaou$ID,cex=1, pos=c(1,2,3,4))

# centroid : centroid of identical  and/or closest entries including ties
points(miaou$LonCentroid,miaou$LatCentroid, pch=22, bg='firebrick' , cex=2)

# predicted location
points(miaou$LonPred,miaou$LatPred, pch=21, bg='red',cex=2.2 )

# from known to predicted location
#arrows(miaou$LonOrig,miaou$LatOrig,miaou$LonPred,miaou$LatPred, col=adjustcolor("black",alpha.f=1 ),code=2,length=0.2,angle=20,lwd=2)

# from centroid to predicted location
arrows(miaou$LonCentroid, miaou$LatCentroid, miaou$LonPred, miaou$LatPred, col=adjustcolor("green",alpha.f=1 ), code=2, length=0.1, angle=10, lwd=2)


######## loop over accessions, then loop over ' closest',  
traceVoisins <- function(x){

    x2 <- unlist(voisins[[x]][[1]]); # obligé car lapply renvoie une liste
#    print(x2)

    traceAccessions <- function(z,background="yellow"){
        
#        print(paste('Long:',Leave2[Leave2$ID==z,'LonOrig'],'Lat:',Leave2[Leave2$ID==z,'LatOrig']))

        # Locations of closest entries
        points(Connues[Connues$ID == z,'Longitude'], Connues[Connues$ID == z,'Latitude'], pch=21, bg="cyan",cex=1.5)
        text(Connues[Connues$ID == z,'Longitude'], Connues[Connues$ID == z,'Latitude'],Connues[Connues$ID == z,'ID'], cex=0.8, pos=c(1,2,3,4), offset=1)

        # from 'closest' to centroid 
        arrows(Connues[Connues$ID==z,'Longitude'], Connues[Connues$ID==z,'Latitude'], miaou[miaou$ID==names(voisins[x]),'LonCentroid'], miaou[miaou$ID==names(voisins[x]),'LatCentroid'], code=2, length=0.1, angle=20, lty=2)
    
    }
    
    if (detail=="Y") {sapply(x2,traceAccessions,background=rainbow(dim(miaou[1]))[x])}  # loop over closest entries
}


lapply(seq_along(voisins),traceVoisins)  #   loop over the samples

    if (sortie != "S" ) {dev.off()}
    
}


##############  How to use this function :

lapply(Leave2$ID[1:3], CartographieDetaillee, detail = "Y")  # one plot per accession, with closests tracés

CartographieDetaillee("HM101", detail = "Y") # Mediterranean Basin by default
CartographieDetaillee("HM101", detail = "Y", LongW = -6, LongE = 5, LatS = 34, LatN = 43 ) # zoom 

CartographieDetaillee(c("HM111", "HM112"), detail = "Y")  # two accessions on the same plot

CartographieDetaillee(Leave2$ID[1:4])          # one plot for 4 accessions. By default : screen and NO details