################################################################################ # # # This set of R functions can be used to perform a k-means clustering in which # # initial seeds are defined by a hierarchical clustering. It is based on the # # kmeans function in the package . # # # # It has two parts: # # Part I - Definition of the functions # # Part II - Application on the dataset given in Appendix S4 # # # # List of functions: # # 1) Standardization: To standardize values of a vector or a data frame. # # 2) euclidean.dist: To calculate Euclidean distance between two points in # # a multidimensional space (>= 2). # # 3) K.means: To perform a k-means clustering in which initial seeds are # # defined by a hierarchical clustering (hclust function). # # This function also allows to select the optimum number of # # clusters based on the inflexion point of the Rsq profile (see # # Appendix S2 for further details on the method . # # User has to define several options: # # DIST - The distance metric to compute on data before # # performing the hclust algorithm, # # METHOD - The agglomerative criterion to produce the # # hierarchical clustering # # STD - (logical) To standardize data before performing # # clustering # # NITER - Number of iterations for the k-means algorithm # # (see ?kmeans for further details) # # 4) NN.kmeans: To select one object by cluster. Based on the minimum # # distance between cluster center and objects of this cluster # # User has to define several options: # # KMEANS: Result of the function K.means # # LABELS: Vector with the name of each object # # STD - If TRUE, data are standardized # # # # This R script is linked to the following paper: # # # # Casajus, N., Perie, C., Lambert, M.-C., de Blois, S. & Berteaux, D. # # An objective approach to select climate scenarios when projecting # # species distribution under climate change. Submitted to PlosONE 2015. # # # ################################################################################ # I) FUNCTIONS DEFINITION #----------------------------------- Standardization <- function(data){ if (is.null(dim(data))){ data <- (data-mean(data))/sd(data) }else{ for (i in 1 : ncol(data)) data[,i] <- (data[,i]-mean(data[,i]))/sd(data[,i]) } return(data) } euclidean.dist <- function(x1,x2){ d <- 0 for (i in 1 : length(x1)) d <- d + (x1[i] - x2[i])^2 return(sqrt(d)) } K.means <- function(data,dist = "euclidean",method = "ward",niter = 999,Std = T){ if (Std) data <- Standardization(data) # Distance matrix tsid <- dist(data,method = dist) # Hierarchical clustering to define initial seeds for k-means tsul <- hclust(tsid,method = method) # K-means from 2 to n groups GRP <- list() for (i in 2 : (nrow(data)-1)){ centres <- cutree(tsul,i) Kcoor <- as.data.frame(matrix(nrow = i,ncol = ncol(data))) colnames(Kcoor) <- colnames(data) for (j in 1 : i){ pos <- which(centres == j) Kcoor[j,] <- apply(data[pos,],2,mean) } GRP[[i]] <- kmeans(data,centers = Kcoor,iter.max = niter) } # Rsq calculation for each clustering rsq <- 0 for (i in 2 : length(GRP)){ ssw <- sum(GRP[[i]]$"withinss") xmean <- apply(data,2,mean) centers <- rbind(GRP[[i]]$centers,xmean) bss <- as.matrix(dist(centers)^2) ssb <- sum(as.vector(bss[nrow(bss),])*c(GRP[[i]]$size,0)) rsq[i] <- 1-(ssw/(ssw+ssb)) } rsq <- c(rsq,1) # Select optimal number of groups x <- as.data.frame(matrix(c(1,0,round(length(rsq)/2),0.5,27,1),byrow = T,nrow = 3,ncol= 2)) colnames(x) <- c("Grp","Rsq") mod <- lm(Rsq ~ Grp,data = x) benef <- NULL for (i in 1 : length(rsq)) benef <- c(benef,mod$coef[[1]] + mod$coef[[2]]*i) plot(rsq,type = "b",pch = 19,axes = F,cex.lab = 0.75,font.lab = 2, xlab = "Number of groups",ylab = "RSQ",main = "") axis(2,seq(0,1,0.1),seq(0,1,0.1),cex.axis = 0.75,las = 2,pos = 1) axis(1,seq(1,length(rsq),2),seq(1,length(rsq),2),cex.axis = 0.75,las = 1,pos = 0) points(c(1 : length(rsq)),benef,type = "b",pch = 19) points(c(1,length(rsq)),c(0,1),type = "l",lty = 1,lwd = 1) points(c(1 : length(rsq)),rsq-benef,type = "b",pch = 19) points(x = c(which.max(rsq-benef),which.max(rsq-benef)), y = c(0,rsq[which.max(rsq-benef)]),lty = 2,type = "l",col = "red") points(x = c(1,which.max(rsq-benef)), y = c(rsq[which.max(rsq-benef)],rsq[which.max(rsq-benef)]),lty = 2,type = "l",col = "red") points(x = which.max(rsq-benef),y = rsq[which.max(rsq-benef)],type = "b",pch = 19,col = "red") points(x = which.max(rsq-benef),y = (rsq-benef)[which.max(rsq-benef)],type = "b",pch = 19,col = "red") text(x = which.max(rsq-benef)+0.5,y = 0.025,labels = which.max(rsq-benef),col = "red",font = 2) text(x = 2.1,y = rsq[which.max(rsq-benef)]+0.025,labels = round(rsq[which.max(rsq-benef)],2),col = "red",font = 2) return(GRP[[which.max(rsq-benef)]]) } NN.kmeans <- function(data,kmean,labels,Std = T){ if (Std) data <- Standardization(data) Res <- as.data.frame(matrix(nrow = nrow(kmean$"centers"),ncol = 3)) colnames(Res) <- c("Object","Cluster","n") for (i in 1 : nrow(kmean$"centers")){ pos <- which(kmean$"cluster" == i) z <- apply(data[pos,],1,euclidean.dist,x2 = kmean$"centers"[i,]) Res[i,"Object"] <- labels[pos[which.min(z)]] Res[i,"Cluster"] <- i Res[i,"n"] <- length(pos) } return(Res) } # II) APPLICATION #----------------------------------- # Importing data (tab <- read.delim("Casajus_etal_AppendixS4.txt")) # Performing k-means (km <- K.means(data = tab[,-1],dist = "euclidean",method = "ward",niter = 999,Std = T)) # Selecting statistical individu the nearest of the cluster center (sel <- NN.kmeans(data = tab[,-1],kmean = km,labels = as.character(tab[,1]),Std = T))