###################################################################################################################################
# Correlation analysis: R script for the paper "Machine learning approaches identify male body size as the most accurate predictor of species richness"
# Species richness vs. minimum male body size (both variables are numeric) 
###################################################################################################################################
library(ggpubr)

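# Set your working directory to the folder that contains the exported CSV file.
# Export the "RF" sheet from Additional file 2 as a CSV file; the read.csv()
# call below expects a semicolon (";") as the field separator.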

# Load the exported sheet; fields are separated by semicolons and character
# columns are converted to factors.
corrALL <- read.csv(
  "Additional file 2 - RF.csv",
  sep = ";",
  stringsAsFactors = TRUE
)
# Use the first column as row names and drop it from the data columns.
corrALL <- data.frame(corrALL[, -1], row.names = corrALL[, 1])

# Select the two analysed variables by name instead of by position
# (originally columns 9 and 27). The rest of the script refers to them by
# name, so a changed column layout now fails with "undefined columns selected"
# instead of silently picking different columns.
correlation.data <- corrALL[, c("Species.richness.numeric", "Body.Male.MIN")]

# Results and plots are printed explicitly: top-level auto-printing does not
# happen when the script is run through source() with default settings, so
# without print() the output below would be silently discarded.

# Shapiro-Wilk normality test for each of the two variables.
print(shapiro.test(correlation.data$Species.richness.numeric))
print(shapiro.test(correlation.data$Body.Male.MIN))

# Q-Q plots for a visual check of the same two variables.
print(
  ggqqplot(
    correlation.data$Species.richness.numeric,
    ylab = "Number of species in a genus"
  )
)
print(
  ggqqplot(
    correlation.data$Body.Male.MIN,
    ylab = "Minimal male body size [mm]"
  )
)

# Spearman rank correlation between species richness and minimum male body
# size.
print(
  cor.test(
    correlation.data$Species.richness.numeric,
    correlation.data$Body.Male.MIN,
    method = "spearman"
  )
)

# Scatter plot of species richness (x) against minimum male body size (y) with
# a regression line, its confidence interval, and the Spearman correlation
# coefficient annotated on the plot.
# NOTE(review): the annotation position (280, 55) and the y limit (0, 58) are
# hard-coded; presumably tuned to this data set -- adjust if the data change.
p <- ggscatter(
  correlation.data,
  x = "Species.richness.numeric",
  y = "Body.Male.MIN",
  add = "reg.line",
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "spearman",
  cor.coef.coord = c(280, 55),
  xlab = "Number of species within a spider genus",
  ylab = "Minimal male body size in a spider genus",
  ylim = c(0, 58)
)

# Remove the default padding of the y axis and draw the plot. print() is
# explicit so the figure is also produced when the script is run via source(),
# where top-level ggplot objects are not auto-printed.
print(p + scale_y_continuous(expand = c(0, 0)))