#1)Settings
#list of R packages used
library(chron)
library(readr)  
library(stringr)
library(plyr)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(doBy)
library(gdata)
library(tableone)
library(ggpmisc)
library(splines)
library(lme4)
library(lmerTest)
library(effects)
library(gamm4)
library(lattice)
library(lubridate)
library(mgcv)
library(itsadug)
library(stargazer)

#2)Data import and data manipulation

#2.1-importing Boston Marathon clean dataset (S1 dataset)

#dataset already used in "Knechtle B, Di Gangi S, Rüst CA, Nikolaidis PT
#The performance differences between the sexes in the Boston Marathon from 1972 to 2017
#June 2018, Journal of Strength and Conditioning Research"
#List of variables:
#######
#YEAR= year of competition
#Name_group=Name and surname of the runner
#HOME_C=Nationality of the runner
#GENDER=Sex of the runner
#time=race time (hour:minutes:seconds)
#natio=country area of the runner
#Name_class= id variable with name, surname, nationality, sex, and period of competition
#(to uniquely identify runners)
#Num_name=numeric code for unique Name_class

library(readxl)
# Workbook that holds the supplementary data sheets
myfile <- "C:/Users/Stefania/Desktop/marathon_2_paper/fit/fit 2/revision/S1_dataset/S1_dataset.xlsx"
# Race results: the first and last columns are read as numeric, the six
# columns in between as text
db_FM_W <- read_excel(
  myfile,
  sheet = "S1_dataset",
  col_types = c("numeric", rep("text", 6), "numeric")
)
# Race time arrives as a string; store it as a chron time (h:min:sec)
db_FM_W$time <- chron(times = db_FM_W$time)
# The literal text "NA" in natio stands for a missing country area
db_FM_W$natio[which(db_FM_W$natio == "NA")] <- NA
#Selection groups
# Order all finishers by race time once and group them by year and sex, so
# that the positions given to slice() are ranks within each year/sex group.
ranked_finishers <- db_FM_W %>%
  arrange(time) %>%
  group_by(YEAR, GENDER)

#near elite groups (ranks 101 to 200)
db_top101 <- as.data.frame(slice(ranked_finishers, 101:200))
#near elite groups (ranks 21 to 100)
db_top21 <- as.data.frame(slice(ranked_finishers, 21:100))
#annual top ten
db_topten_FM <- as.data.frame(slice(ranked_finishers, 1:10))
#annual winners (fastest finisher of each year/sex group)
db_top_FM <- as.data.frame(slice(ranked_finishers, 1L))


#2.2-importing weather dataset (S2 dataset)
# NOTE(review): the "S2_dataset" sheet is read from the same workbook path
# (myfile) as the S1 data - confirm that this workbook contains that sheet.
# Columns 4 and 7 are read as text, all other columns as numeric.
weather <- read_excel(
  myfile,
  sheet = "S2_dataset",
  col_types = c(
    rep("numeric", 3), "text",
    rep("numeric", 2), "text",
    rep("numeric", 3)
  )
)

#Yearly weather summaries over three race-time windows.
#The same summary is computed for every window, so it lives in one helper
#instead of three copies of the same pipeline.

# Most frequent non-missing value of x, returned as a character string.
# Ties go to the first entry in table() order. NA_character_ is returned when
# x has no non-missing value, so that summarise() always gets exactly one
# value per year.
most_frequent_value <- function(x) {
  counts <- table(x)
  if (length(counts) == 0L) {
    return(NA_character_)
  }
  names(counts)[which.max(counts)]
}

# One row per YEAR summarising the hourly records in `hourly`:
#   count   = number of hourly records
#   temp_c  = average temperature
#   wbgt    = average WBGT
#   wind_m  = most frequent wind direction
#   prec    = sum of the hourly amounts of precipitation
#   press   = average pressure
#   wind_sp = average wind speed
# Averages ignore missing values; the precipitation sum does not, so a single
# missing hourly amount makes prec NA for that year.
summarise_weather <- function(hourly) {
  yearly <- hourly %>%
    group_by(YEAR) %>%
    summarise(
      count = n(),
      temp_c = mean(Temperature_C, na.rm = TRUE),
      wbgt = mean(WBGT_C, na.rm = TRUE),
      wind_m = most_frequent_value(Wind),
      prec = sum(`Precipitation (mm)`),
      press = mean(`Pressure (hPa)`, na.rm = TRUE),
      wind_sp = mean(wind_speed_km, na.rm = TRUE)
    )
  as.data.frame(yearly)
}

# Rows of `hourly` whose `hour` column, converted to a fraction of a day,
# lies in [from, to]. Rows whose hour cannot be compared (NA) are dropped,
# as subset() would do.
weather_between <- function(hourly, from, to) {
  day_fraction <- as.numeric(chron(times = hourly$hour))
  in_window <- day_fraction >= from & day_fraction <= to
  hourly[in_window & !is.na(in_window), , drop = FALSE]
}

#All finishers weather conditions: dataset with the weather conditions "averaged" over the hours of the race (9-16)
# NOTE(review): no hour filter is applied here, every row of `weather` is
# used - presumably the sheet only holds the 9-16 records; confirm.
weather_avg <- summarise_weather(weather)

#Near elite groups and annual top ten weather conditions: dataset with the weather conditions "averaged" over the hours of the race (9-13)
# 0.3708333 of a day is 08:54 and 0.5416667 is 13:00.
# NOTE(review): the 08:54 lower bound presumably matches the timestamp of the
# observation nearest to 09:00 - confirm against the hour column.
weather_avg2 <- summarise_weather(weather_between(weather, 0.3708333, 0.5416667))

#Annual winners weather conditions: dataset with the weather conditions "averaged" over the hours of the race (9-12)
# Same lower bound; 0.5 of a day is 12:00.
weather_avg3 <- summarise_weather(weather_between(weather, 0.3708333, 0.5))

#2.3-create complete dataset (weather and marathon performance together)

# Recode a wind direction by its first letter:
#   "W" -> "tail wind", "E" -> "head wind", "S" or "N" -> "side wind".
# Any other first letter (and NA) is returned unchanged.
recode_wind_direction <- function(wind) {
  first_letter <- str_sub(wind, 1, 1)
  labels <- c(
    W = "tail wind",
    E = "head wind",
    S = "side wind",
    N = "side wind"
  )
  known <- first_letter %in% names(labels)
  first_letter[known] <- unname(labels[first_letter[known]])
  first_letter
}

# Add the yearly weather summary to a runners data set and derive wind_dir.
# The helper column `count` of the summary is not carried over.
# The join key is stated explicitly: YEAR is the only column the two tables
# share, so the result matches the implicit join while no longer depending
# on which column names happen to coincide.
add_weather <- function(runners, weather_summary) {
  keep <- setdiff(names(weather_summary), "count")
  weather_cols <- weather_summary[, keep, drop = FALSE]
  merged <- plyr::join(runners, weather_cols, by = "YEAR")
  merged$wind_dir <- recode_wind_direction(merged$wind_m)
  merged
}

#All finishers (weather summarised over all available hours)
db_FM_W <- add_weather(db_FM_W, weather_avg)

#Top 101:200 (weather summarised over the second window)
db_top101 <- add_weather(db_top101, weather_avg2)

#Top 21:100 (weather summarised over the second window)
db_top21 <- add_weather(db_top21, weather_avg2)

#Top 10 (weather summarised over the second window)
db_topten_FM <- add_weather(db_topten_FM, weather_avg2)

#Annual winners (weather summarised over the third window)
db_top_FM <- add_weather(db_top_FM, weather_avg3)

#3) Statistical analysis
#Categorical predictors are turned into factors, one data set at a time.
#Reference categories: "KEN-ETH" for country area, "tail wind" for wind direction.

#All finishers
db_FM_W[["GENDER"]] <- factor(db_FM_W[["GENDER"]])
db_FM_W[["natio"]] <- relevel(factor(db_FM_W[["natio"]]), ref = "KEN-ETH")
db_FM_W[["wind_dir"]] <- relevel(factor(db_FM_W[["wind_dir"]]), ref = "tail wind")

#Top 10
db_topten_FM[["GENDER"]] <- factor(db_topten_FM[["GENDER"]])
db_topten_FM[["natio"]] <- relevel(factor(db_topten_FM[["natio"]]), ref = "KEN-ETH")
db_topten_FM[["wind_dir"]] <- relevel(factor(db_topten_FM[["wind_dir"]]), ref = "tail wind")

#Top 21:100
db_top21[["GENDER"]] <- factor(db_top21[["GENDER"]])
db_top21[["natio"]] <- relevel(factor(db_top21[["natio"]]), ref = "KEN-ETH")
db_top21[["wind_dir"]] <- relevel(factor(db_top21[["wind_dir"]]), ref = "tail wind")

#Top 101:200 (country area is left as it is for this group)
db_top101[["GENDER"]] <- factor(db_top101[["GENDER"]])
db_top101[["wind_dir"]] <- relevel(factor(db_top101[["wind_dir"]]), ref = "tail wind")

#Annual winners
db_top_FM[["GENDER"]] <- factor(db_top_FM[["GENDER"]])
db_top_FM[["wind_dir"]] <- relevel(factor(db_top_FM[["wind_dir"]]), ref = "tail wind")
#country groups for annual winners: Asia, Canada and Oceania are pooled
#into "Other" in a separate column (natio2); natio itself is not modified
winner_area <- as.vector(db_top_FM[["natio"]])
winner_area[winner_area %in% c("Asia", "Canada", "Oceania")] <- "Other"
db_top_FM[["natio2"]] <- relevel(factor(winner_area), ref = "KEN-ETH")

#GAMM models
#Every model has race time as response, a smooth of YEAR fitted separately
#for each level of GENDER, weather covariates, and a random intercept for
#each runner code (Num_name).

#All finishers: country area interacts with sex
fit_g1 <- gamm4(
  time ~ s(YEAR, by = GENDER) + natio * GENDER + temp_c + wbgt + prec +
    press + wind_sp + wind_dir,
  random = ~ (1 | Num_name),
  data = db_FM_W,
  family = gaussian
)

#Annual winners: pooled country groups (natio2) interact with sex
# NOTE(review): unlike the other models, wbgt is not among the predictors
# here - confirm this is intentional.
fit_g2 <- gamm4(
  time ~ s(YEAR, by = GENDER) + natio2 * GENDER + temp_c + prec +
    press + wind_sp + wind_dir,
  random = ~ (1 | Num_name),
  data = db_top_FM,
  family = gaussian
)

#Top 101:200: no country term
fit_g3 <- gamm4(
  time ~ s(YEAR, by = GENDER) + GENDER + temp_c + wbgt + prec +
    press + wind_sp + wind_dir,
  random = ~ (1 | Num_name),
  data = db_top101,
  family = gaussian
)

#Top 21:100: country area and sex as main effects only
fit_g4 <- gamm4(
  time ~ s(YEAR, by = GENDER) + natio + GENDER + temp_c + wbgt + prec +
    press + wind_sp + wind_dir,
  random = ~ (1 | Num_name),
  data = db_top21,
  family = gaussian
)

#Top 10: same terms as the top 21:100 model
fit_g5 <- gamm4(
  time ~ s(YEAR, by = GENDER) + natio + GENDER + temp_c + wbgt + prec +
    press + wind_sp + wind_dir,
  random = ~ (1 | Num_name),
  data = db_topten_FM,
  family = gaussian
)