## R script that implements the leave-one-out procedure for ProvenancePredictor
## (c) L. GENTZBITTEL, C. BEN and T.V. TATARINOVA 2015


## requested libraries :
## 

library(Cairo)
library(rworldmap)
library(rworldxtra) # Mapping global data, vector and raster


# cleaning work space
rm(list = ls())
graphics.off()


## declaration of the function.
## entry : name of outfile, N_best, file of unknow samples, folder with GEN and GEO files 


N_best <- 7 


# working directory :
Dir <- "/your/working/directory/"


#################### Reported geo data for samples  ####################

GEOALL <- read.table(paste(Dir,"/data/your_data.csv", sep = ''), sep = ";", header = TRUE, row.names = 1)

colnames(GEOALL) <- c("Pop", "Country", "Lat", "Lon", "Elev")
str(GEOALL)
dim(GEOALL)


# keep accessions with known geo coordinates 
GEO <- GEOALL[!is.na(GEOALL$Lat), c('Lon', 'Lat')]
dim(GEO)

# How much ID are of unknow geo origin ?
GEOALL[is.na(GEOALL$Lat), ]


#################### Admixture components ####################

# The data are from supervised admixture analysis (recommendend). The ADMIXTURE Q file is used.
# ideally the know samples are listed before the unknown samples.

ADMIXTURES0 <- as.matrix(read.table(paste(Dir,"/data/admixturePatterns/Your_File.Q",sep="")))

nk <- xxx  # number of samples with reported geo location
ADMIXTURES <- ADMIXTURES0[1:nk, ]

# names of samples
rownames(ADMIXTURES) <- rownames(GEOALL)


######### clean 'small proportions'
PlayWithRows <- function(x){    # x is 'a row' of the object to which the function is applied
    y <- NULL                   # initialize an object for hodling the results
    y <- ifelse(x< 0.00001,0, ifelse(x> 0.99999,1, x))           # first component of Y is 3 * first component of x
#   y <- ifelse(x> 0.99999,1, x)
    return(y)
}


ADMIXTURES2 <- t(apply(ADMIXTURES, 1, PlayWithRows))

GEN <- ADMIXTURES2[!is.na(GEOALL$Lat),]
dim(GEN)


#################### Look for a linear relationship between GEO et GEN distances  ####################

    y=dist(GEO)   ## NOT Harversine.  based  lat/lon expressed in degrees.   CorrespondS to  A "naive Euclidean" distance may be computed between two points by                                                             # simply applying the Euclidean distance formula to the longitude-latitude
                                                 # coordinates, and then multiplying by (Rpi/180) to convert to kilometers.
    R <- 6371 # Earth mean radius [km]
    ykm <- y*R*pi/180


    x=dist(GEN)  ## euclidian distance
    xOrig <- as.matrix(x)


#################### Correlation between GEN & GEO matrices ?  ####################

library(ade4)

(RoughCorrels <- mantel.rtest(x, y, nrepet = 9999)) # might be usefull to debug with nrepet = 30 to reduce computation time.


    LL <- length(y) ## number of two-by-two comparisons 

    ## filtering data based on above graph
    seuilGeo <- 8.25
    seuilGen <- 1.2                                                        

    for (l in 1:LL) {
        if ( y[l] >= seuilGeo || x[l] >= seuilGen ) {y[l] = 0; x[l] = 0;}
    }


#################### Correlation between dGEN & dGEO matrices, after filtering on geo and gen distances  ####################

(FilteredCorrels <- mantel.rtest(x, y, nrepet = 9999))  # might be usefull to debug with nrepet = 30 to reduce computation time.


eq1<-lm(y~x);   # regressing geo on gen
summary(eq1)    # will be used to predict geo pos from genome admixtrue pattern


####################  ProvenancePredictor Leave-one-Out


outfile_name <- paste(Dir,"/output/Your_OutputFile.csv", sep = '')

    write("ID\tObsLon\tObsLat\tCentroidLon\tCentroidLatLat\tPredLon\tPredLat\tdistance(km)\tidentical accessions\tclosest accessions\tK8.1\tK8.2\tK8.3\tK8.4\tK8.5\tK8.6\tK8.7\tK8.8", outfile_name, append=FALSE)  # need to adjust depending on the number of admixture components used.

    N_best <- min(N_best, length(GEO[, 1]))  # number of closest samples for an unknown accession


######################## A loop for each accession

    for ( retire in 1:length(rownames(GEO)) ){
    
        GROUP <- rownames(GEO)[retire]
        num.ligne <- retire
        print(num.ligne)
        
        Y <- as.data.frame(t(GEN[ rownames(GEN) == GROUP, ]))  ########### weird syntax !

         K <- length(Y[, 1])  # How much individuals per pop
        
         for(a in 1: K)      # for each sample 
            {                              
                X <- Y[a, 1:8]                                     # admixture vector for that sample 
                E <- rep(0, length(GEO[, 1]) ) ;                   # a zero vector, on length 'pop number'
               
                for(g in 1: length(GEO[,1])){
                    ethnic <- attributes(GEO[g, ])$row.names;      # reference population 
                    gene <- as.numeric(GEN[ethnic, 1:8])           # admixture proportions 
                    E[g] <- sqrt(sum((gene - X)^2))                # distacne from sample to ref. populations
                }
                

                # identical and closest lines (with ties) 
                minEb <- NULL; minEb <- rank(E,ties.method="min") <= (sum(rank(E, ties.method="min") == 1) + N_best)   
                minEb[num.ligne] <- FALSE                                                                              # LEAVE ONE OUT !
                print(paste("# closest :", sum(minEb)))                                                                # number of accessions at 0 AND 'N_best closest' (excluding the one)
                
                minEident <- rank(E,ties.method="min")==1                                                              # TIES=0
                minEident[num.ligne] <- FALSE          
                print(paste("# identical :", sum(minEident)))                                                           # number of accessions at 0 (excluding the one)

                # distances from "leaved accession" to other selected
                minEg <- E[minEb]                                                                                      #
                sort(minEg)                                                                                            # to list

                # line numbers of 'closest selected' including 'identical', in increasing order of genetic distance
                minGb <- which(minEb == TRUE)[order(minEg)]                                                              # needed for  centroid
                # line numbers of 'identical' accessions' 
                minGident <- which(minEident == TRUE)
                # line numbers of 'closest BUT not identical'
                minGclose <- minGb[!minGb %in% minGident]

                
                #### centroid of ties or centroid of closets, including ties 
                if (length(minGident)>0) {
                    centroid <- colMeans(GEO[minGident,c('Lon','Lat')],na.rm=TRUE)} else                                     
                {centroid <- colMeans(GEO[minGclose[rank(minEg[minGclose], ties.method="min") == 1], c('Lon','Lat')])}         
              
                
                # distances, from "leaved accession" to other selected, in increasing order
                radius <- sort(minEg)                                                                                       
                
                best.ethnic<- attributes(GEO[minGb,])$row.names; #best_ethnic;                                              # names of N_best closests
                best.ethnic

                identical.ethnic <- attributes(GEO[minEident,])$row.names;
                identical.ethnic
                
                radius_geo <- (eq1[[1]][2] * radius[1])                                 

                W <- ((radius[1] + 1e-9)/(radius + 1e-9))^(1/4)
                W=W/(sum(W));                                                       
                
                delta_lon <- GEO[minGb, ][[1]] - centroid[1]                          # vectors of differences between BEST/centroid and N_best           
                delta_lat <- GEO[minGb, ][[2]] - centroid[2]                          # vectors of differences between BEST/centroid and N_best                    
                new_lon <- sum(W*delta_lon)                                           # weigthing
                new_lat <- sum(W*delta_lat)                                           # weighting
                
                lo1<-new_lon*min(1,radius_geo/sqrt(new_lon^2+new_lat^2) )           
                la1<-new_lat*min(1,radius_geo/sqrt(new_lon^2+new_lat^2))            

                
                best <- NULL
                best[1] <- centroid[1] + lo1                                       
                best[2] <- centroid[2] + la1                                       
                best <- unlist(best)

                distancekm <- sqrt((best[1]- GEO[num.ligne,1])^2 + (best[2] - GEO[num.ligne,2])^2) * R * pi/180
                
                
                 write(paste(GROUP, GEO[num.ligne,1], GEO[num.ligne,2] , centroid[1], centroid[2], best[1], best[2], distancekm, paste(identical.ethnic,sep='',collapse="/"), paste(best.ethnic,sep='',collapse="/"), X[1], X[2], X[3], X[4], X[5], X[6], X[7], X[8], sep="\t"), outfile_name, append=TRUE)
                
            }

    }  

print("ProvenancePredictor leave-one-out is done!")


#################### ecdf of predicted vs observed isualisations des GPS leave-one Out

Dir <- "/your/working/directory/"
Leave2 <- read.table(paste(Dir,"/output/Your_OutputFile.csv",sep=""),sep="\t", header=TRUE)

colnames(Leave2) <- c("ID","LonOrig","LatOrig","LonCentroid","LatCentroid","LonPred","LatPred","distance","identical","closest","K1","K2","K3","K4","K5","K6","K7","K8")


mean(Leave2$distance)
median(Leave2$distance)

toto <- ecdf(Leave2$distance)

x11()
plot(toto, xlab="distance (km)", ylab="proportion of samples", main="",
     verticals = TRUE, col.points = "blue",
     col.hor = "red", col.vert = "bisque")
abline(h=0.5, v=214, col="red", lwd=1)


##########################  Maps to compare observed and predicted locations.

newmap <- getMap(resolution = "high")

######## Mediterranean basin
x11()
plot(newmap, ylim = c(28,45),xlim=c(-10,40),axes=TRUE, col="gray95")
points(Leave2$LonOrig, Leave2$LatOrig, pch=21, bg='darkgreen' )
points(Leave2$LonPred, Leave2$LatPred, pch=21, bg='red' )

arrows(Leave2$LonOrig, Leave2$LatOrig, Leave2$LonPred, Leave2$LatPred, col = adjustcolor("darkblue",alpha.f = 0.7), code = 2, length = 0.1, angle = 20)


######################################################  detailed maps, illustrating computations ######################################################


CartographieDetaillee <- function(accession, sortie = "S", detail = 'N'){

    print(paste("accession : ",accession))
    miaou <- subset(Leave2, subset = ID %in% accession )
    
## a list with closest samples, per accession
decoupe <- function(x){

    y <- as.character(miaou[miaou$ID == x,'closest'])
    copaing2 <- strsplit(y,"/")
    blurp <- list( copain2 = copaing2)
    return(blurp)
}

voisins <- lapply(miaou$ID,decoupe)
names(voisins) <- miaou$ID

#print(voisins[[1]][[1]][1])
#voisins[[2]][[1]][1]

if (sortie == "S") {x11()} else {
    CairoPDF(
        file = paste(paste(Dir, "output/figsProvPredictor/Map-", sep = ''), accession,"_", round(miaou$distance,0), "km.pdf",sep=""),  
         width = 25, height = 15, onefile = TRUE, family = "Helvetica",
         fonts = NULL, version = "1.1",
         pointsize=14,paper="A4")
}
    
plot(newmap,ylim=c(28,45),xlim=c(-8,37),axes=TRUE, col="gray95")  # Mediterranean basin

# reported location
points(miaou$LonOrig, miaou$LatOrig, pch=21, bg='bisque', cex=2)
text(miaou$LonOrig, miaou$LatOrig, miaou$ID, cex=1, pos=c(1,2,3,4))

# centroid : 
points(miaou$LonCentroid, miaou$LatCentroid, pch=22, bg='firebrick', cex=2)

# predicted location
points(miaou$LonPred, miaou$LatPred, pch=21, bg='red', cex=2.2 )

# from reported to predicted location
arrows(miaou$LonOrig,miaou$LatOrig,miaou$LonPred,miaou$LatPred, col=adjustcolor("black",alpha.f=1 ),code=2,length=0.2,angle=20,lwd=2)

# from centroid to predicted location
arrows(miaou$LonCentroid, miaou$LatCentroid, miaou$LonPred, miaou$LatPred, col=adjustcolor("green",alpha.f=1 ), code=2, length=0.1, angle=10, lwd=2)


######## loop for samples, loop for closests  
traceVoisins <- function(x){

    x2 <- unlist(voisins[[x]][[1]]); # weird ! lapply retuen a list
#    print(x2)

    traceAccessions <- function(z,background="yellow"){
#        print(paste('Long:',Leave2[Leave2$ID==z,'LonOrig'],'Lat:',Leave2[Leave2$ID==z,'LatOrig']))

        # 'closest' locations
        points(Leave2[Leave2$ID==z,'LonOrig'], Leave2[Leave2$ID==z,'LatOrig'], pch=21, bg=background)
        text(Leave2[Leave2$ID==z,'LonOrig'], Leave2[Leave2$ID==z,'LatOrig'],Leave2[Leave2$ID==z,'ID'],cex=0.8,pos=c(1,2,3,4),offset=0.8)

        # from 'closest' to centroid 
        arrows(Leave2[Leave2$ID==z,'LonOrig'], Leave2[Leave2$ID==z,'LatOrig'], miaou[miaou$ID==names(voisins[x]),'LonCentroid'], miaou[miaou$ID==names(voisins[x]),'LatCentroid'], code=2, length=0.1, angle=20, lty=2)
    
    }
    
    if (detail=="Y") {sapply(x2,traceAccessions,background=rainbow(dim(miaou[1]))[x])} 
}


lapply(seq_along(voisins), traceVoisins)  #   THIS is the loop on accessions as arguments

    if (sortie != "S" ) {dev.off()}
    
}


############## How to use this function :

lapply(Leave2$ID[1:2], CartographieDetaillee, detail = "Y")  # uneone plot per accession

CartographieDetaillee("HM191", detail = "Y") 

CartographieDetaillee(c("HM191", "HM064"), detail = "Y") 

CartographieDetaillee(Leave2$ID[1:4])          # by default : screen and NO details. 4 accessions on the same plot


############# BEWARE !! LENGTHY !
lapply(Leave2$ID, CartographieDetaillee, detail = "Y", sortie = "P")  # one plot per accession for ALL accessions, output in pdf format