R and SQL code used to query the penguin population database (www.penguinmap.com) and convert these tables into R objects used in the Adélie population model (ver. 1.2 in www.penguinmap.com).
# Query MAPPPD for Adelie model data
# Arrange Adelie data for input into jags
library("RPostgreSQL")
library("dplyr")
#############################################################################################################
# create a connection to the MAPPPD database http://www.r-bloggers.com/getting-started-with-postgresql-in-r/
rm(list = ls())
drv <- dbDriver("PostgreSQL")
mapppd.dbname <- "mapppd"
mapppd.dbhost <- "mapppd.cbmvud2go9ni.us-east-1.rds.amazonaws.com"
mapppd.dbuser <- "mapppd_public"
mapppd.dbpwrd <- "p3nguins2017"
con <- dbConnect(drv, dbname = mapppd.dbname, host = mapppd.dbhost, port = 5432, user = mapppd.dbuser, password = mapppd.dbpwrd)
#############################################################################################################
# Query MAPPPD for data for model
# Keep all positive adelie counts from 1982-2015 (ASSUMPTION 1) except one Avian Island count which has a very large discrepancy from the other counts at that site during similar time period
# (ASSUMPTION 2) Keep maximum count for each site, season, (adult & nest)/chick combination
# Should be 1224 rows
sql <- "
select distinct maxcount.site_id, maxcount.season, maxcount.chick, adelielist.count_type, maxcount.count, adelielist.accuracy from
(select site_id, season, case when count_type = 'nests' or count_type = 'adults' then 0 else 1 end as chick, max(count) as count from preprocessed
where season >= 1982 and season <= 2015 and common_name='ad{\\''e}lie penguin' and count > 0 and not preprocessed_id = 192
group by site_id, season, chick) maxcount
inner join
(select site_id, season, case when count_type = 'nests' or count_type = 'adults' then 0 else 1 end as chick, count_type, accuracy, count from preprocessed
where season >= 1982 and season <= 2015 and common_name='ad{\\''e}lie penguin' and count > 0 and not preprocessed_id = 192) adelielist
on maxcount.site_id = adelielist.site_id and maxcount.season = adelielist.season and maxcount.chick = adelielist.chick and maxcount.count = adelielist.count
order by site_id, season, chick;"
abundance <- dbGetQuery(con, sql)
dim(abundance)
# Keep all presence/absence data from 1982-2015 (ASSUMPTION 3) except for absence data made via vhr as this could be unreliable
# Keep only sites that also have at least one adelie abundance count between 1982-2015
# Should be 1166 rows
sql <- "
select adelielist.site_id, adelielist.season, adelielist.presence from
(select distinct site_id from preprocessed
where season >= 1982 and season <= 2015 and count > 0 and common_name='ad{\\''e}lie penguin' and not preprocessed_id = 192) abundancelist
inner join
(select site_id, season, max(presence) as presence from preprocessed
where season >= 1982 and season <= 2015 and common_name='ad{\\''e}lie penguin' and not (vantage = 'vhr' and presence = 0) and not preprocessed_id = 192
group by site_id, season) adelielist
on abundancelist.site_id = adelielist.site_id
order by site_id, season;"
occupancy <- dbGetQuery(con, sql)
dim(occupancy)
# Keep only sites that have at least one adelie abundance count between 1982-2015
# Should be 267 sites
sql <- "
select sitelist.site_id, sitelist.site_name, sitelist.region, sitelist.ccamlr_id from
(select distinct site_id from preprocessed
where season >= 1982 and season <= 2015 and count > 0 and common_name='ad{\\''e}lie penguin') abundancelist
inner join
(select site_id, site_name, region, ccamlr_id from site) sitelist
on abundancelist.site_id = sitelist.site_id
order by site_id;"
SiteList <- dbGetQuery(con, sql)
dim(SiteList)
# Keep only covariates for sites that have at least one adelie abundance count between 1982-2015
# Should be 18156 rows, 267 sites x 34 years x 2 covariates
sql <- "
select covlist.site_id, covlist.season, covlist.month_start, covlist.month_stop, covlist.lag_start, covlist.lag_stop, covlist.statistic, covlist.radius, covlist.value from
(select distinct site_id from preprocessed
where season >= 1982 and season <= 2015 and count > 0 and common_name='ad{\\''e}lie penguin') abundancelist
inner join
(select site_id, season, radius, statistic, month_start, month_stop, lag_start, lag_stop, value from sitecovariates
where season >= 1982 and season <= 2015 and statistic = 'max' and ((radius = 500 and month_start = 6 and month_stop = 9 and lag_start = 0 and lag_stop = 4)
or (radius = 500 and month_start = 11 and month_stop = 1 and lag_start = 4 and lag_stop = 4))) covlist
on abundancelist.site_id = covlist.site_id
order by site_id, season;"
sitecov <- dbGetQuery(con, sql)
dim(sitecov)
###################################################################################################################
# Merge pre-determined observations to drop for data validation purposes
# perform no data validation this run
dim(abundance)
DataValidationRows <- read.csv("DataValidationRows.csv", header = TRUE)
abundance <- merge(abundance, DataValidationRows, by = c("site_id", "season", "count", "count_type", "accuracy"), all.x = TRUE)
dim(abundance)
i <- which(is.na(abundance$validation==TRUE)); abundance$validation[i] <- 0
abundance$validation <- 0
###################################################################################################################
# Some more assumptions
# (ASSUMPTION 4) We convert adults counts to nest counts 1:1, while increasing the error by + 3 with a max error of 5 for such counts
adults <- which(abundance$count_type == "adults")
for (i in 1:length(adults)) {
abundance$accuracy[adults[i]] <- min(abundance$accuracy[adults[i]] + 3, 5)
}
# (ASSUMPTION 5) convert accuracy to the following errors/precisions, see supplement 1 for details regarding this
abundance$precision <- NA
i <- which(abundance$accuracy == 1); abundance$precision[i] <- 1/0.02490061^2
i <- which(abundance$accuracy == 2); abundance$precision[i] <- 1/0.04955838^2
i <- which(abundance$accuracy == 3); abundance$precision[i] <- 1/0.1201131^2
i <- which(abundance$accuracy == 4); abundance$precision[i] <- 1/0.2212992^2
i <- which(abundance$accuracy == 5); abundance$precision[i] <- 1/0.4472728^2
###################################################################################################################
# make sea ice sitecovariates wide not long
sitecov2 <- subset(sitecov, month_start==11 & month_stop==1 & lag_start==4 & lag_stop==4, select = c(site_id, season, value))
names(sitecov2) <- c("site_id", "season", "sic_11_1_max_l4_500")
sitecov3 <- subset(sitecov, month_start==6 & month_stop==9 & lag_start==0 & lag_stop==4, select = c(site_id, season, value))
names(sitecov3) <- c("site_id", "season", "sic_6_9_max_l0_l4_500")
sitecov4 <- merge(sitecov2, sitecov3, by = c("site_id", "season"))
sitecov <- subset(sitecov4, select = c("site_id", "season", "sic_11_1_max_l4_500", "sic_6_9_max_l0_l4_500"))
###################################################################################################################
# Indexing
# Construct site index
abundance$site <- as.numeric(as.factor(abundance$site_id))
occupancy$site <- as.numeric(as.factor(occupancy$site_id))
SiteList$site <- as.numeric(as.factor(SiteList$site_id))
SiteList$ccamlr <- as.numeric(as.factor(SiteList$ccamlr_id))
sitecov$site <- as.numeric(as.factor(sitecov$site_id))
# Construct relative season index
min_season <- 1982
abundance$season_relative <- abundance$season - min_season + 1
occupancy$season_relative <- occupancy$season - min_season + 1
sitecov$season_relative <- sitecov$season - min_season + 1
# Compute visit index
abundance$visit <- 1
for (i in 2:dim(abundance)[1]) {
if (abundance$site[i] == abundance$site[i - 1] & abundance$season[i] == abundance$season[i - 1]) abundance$visit[i] <- abundance$visit[i - 1] + 1
}
# Compute the number of sites
(n_sites <- length(unique(abundance$site)))
# Compute the number of seasons
(n_seasons <- (max(abundance$season) - min_season) + 2)
# Compute the number of counts
abundance_observed <- subset(abundance, validation==0)
abundance_predict <- subset(abundance, validation==1)
(n <- dim(abundance_observed)[1])
# (n_p <- dim(abundance_predict)[1])
# These vectors allow the model to select only the non-NA non-validation 1982 - 2015 tuples from the data cubes created below for estimation
(site <- as.vector(abundance_observed$site))
(season <- as.vector(abundance_observed$season_relative))
(visit <- as.vector(abundance_observed$visit))
# These vectors allow the model to select only the non-NA validation tuples from the data cubes created below for validation
# (site_p <- as.vector(abundance_predict$site))
# (season_p <- as.vector(abundance_predict$season_relative))
# (visit_p <- as.vector(abundance_predict$visit))
# Compute the initial and final seasons that we have count data for each site
abundance2 <- tbl_df(abundance) %>%
group_by(site) %>%
summarise('initial' = min(season_relative), 'final' = max(season_relative))
# This vector allows the model to find the initial year of data for each site
(initial <- as.vector(abundance2$initial))
dim(abundance); abundance <- merge(abundance, abundance2, by = "site"); dim(abundance)
###################################################################################################################
# Some more assumptions
# (ASSUMPTION 6) If the initial year count is a chick count we convert chicks to nests using the model hyperpriors
i <- which(abundance$initial==abundance$season_relative & abundance$count_type=="chicks" & abundance$visit==1)
length(i)
abundance$count_type[i] <- "nests"
abundance$chick[i] <- 0
abundance$count[i] <- round(abundance$count[i] / .90)
###################################################################################################################
# Extend SIC covariates as five year average in order to make predictions in 2015, or t + 1
# This step won't be necessary once we can query MAPPPD for 2015 SIC
# Scale covariates
sitecov2 <- tbl_df(sitecov) %>%
filter(season >= 2011, season <= 2015) %>%
group_by(site_id, site) %>%
summarise('sic_6_9_max_l0_l4_500' = mean(sic_6_9_max_l0_l4_500), 'sic_11_1_max_l4_500' = mean(sic_11_1_max_l4_500)) %>%
mutate(season = 2016, season_relative = n_seasons)
sitecov3 <- tbl_df(sitecov) %>%
filter(season < 2016) %>%
select(site_id, season, site, season_relative, sic_6_9_max_l0_l4_500, sic_11_1_max_l4_500)
sitecov4 <- as.data.frame(rbind(as.data.frame(sitecov2), as.data.frame(sitecov3)))
sitecov5 <- sitecov4[order(sitecov4$site_id, sitecov4$season), ]
wsicMean <- mean(sitecov5$sic_6_9_max_l0_l4_500)
wsicSD <- sd(sitecov5$sic_6_9_max_l0_l4_500)
ssicMean <- mean(sitecov5$sic_11_1_max_l4_500)
ssicSD <- sd(sitecov5$sic_11_1_max_l4_500)
sitecov5$sic_6_9_max_l0_l4_500 <- as.numeric(scale(sitecov5$sic_6_9_max_l0_l4_500))
sitecov5$sic_11_1_max_l4_500 <- as.numeric(scale(sitecov5$sic_11_1_max_l4_500))
###################################################################################################################
# Create the data list for the JAGS model
# Make data cubes y, precision, and chick with dimensions [n_sites, n_seasons, 2] and matrices w and check with dimensions [n_sites, n_seasons]
chick <- y <- precision <- array(NA, dim = c(n_sites, n_seasons, 2))
check <- array(0, dim = c(n_sites, n_seasons, 2))
w <- array(NA, dim = c(n_sites, n_seasons))
for (i in 1:n_sites) {
for (j in 1:n_seasons) {
x1 <- subset(occupancy, site == i & season_relative == j)
if (dim(x1)[1] > 0) {
w[i, j] <- x1$presence
}
for (k in 1:2) {
x2 <- subset(abundance, site == i & season_relative == j & visit == k)
if (dim(x2)[1] > 0) {
y[i, j, k] <- x2$count
precision[i, j, k] <- x2$precision
chick[i, j, k] <- x2$chick
check[i, j, k] <- dim(x2)[1]
}
}
}
}
# (ASSSUMPTION 7) We assume no colonization for any site from 1982 - 2014 and that all sites were colonized in 1982
# This allows us to fill in NAs between presence counts and between the sart year and the first presence count with 1's
for (i in 1:n_sites) {
for (j in n_seasons:2) {
if (is.na(w[i, j]) == FALSE) {
if (w[i, j] == 1)
w[i, (j - 1)] <- 1
}
}
}
# Make matrices for covariates with dimensions [n_sites, n_seasons]
sic_6_9_max_l0_l4_500 <- sic_11_1_max_l4_500 <- array(NA, dim = c(n_sites, n_seasons))
for (i in 1:n_sites) {
for (j in 1:n_seasons) {
x3 <- subset(sitecov5, site == i & season_relative == j)
sic_6_9_max_l0_l4_500[i, j] <- x3$sic_6_9_max_l0_l4_500
sic_11_1_max_l4_500 [i, j] <- x3$sic_11_1_max_l4_500
}
}
# The max check should be 1, meaning the subsetting only found one row per site, season, and visit
max(check)
DataQuery <- list(
y = y,
w = w,
n_sites = n_sites,
n_seasons = n_seasons,
initial = initial,
n = n,
site = site,
season = season,
visit = visit,
precision = precision,
chick = chick,
wsic = sic_6_9_max_l0_l4_500,
ssic = sic_11_1_max_l4_500)
# n_p = n_p,
# site_p = site_p,
# season_p = season_p,
# visit_p = visit_p)
# These frames are needed for generating the supplement
abundance5 <- tbl_df(abundance) %>%
filter(season <= 2016) %>%
group_by(site) %>%
summarise('initial' = min(season_relative), 'final' = max(season_relative))
SiteList <- merge(SiteList, abundance5, by = "site")
SiteList$seasons <- SiteList$final - SiteList$initial + 1
yframe <- abundance
StandardizationValues <- list(
wsicMean = wsicMean,
wsicSD = wsicSD,
ssicMean =ssicMean,
ssicSD <- ssicSD)
# Determine number of observed colonizaton and extinction events
dat <- data.frame(w)
dat.new <- do.call(rbind, lapply(1:nrow(dat), function(x) t(matrix(dat[x, order(is.na(dat[x,]))]))))
colnames(dat.new) <- colnames(dat)
dat.new <- data.frame(dat.new)
dat.new[is.na(dat.new)] <- 5
colonizations <- extinctions <- rep(0, dim(dat.new)[1])
for(i in 1:dim(dat.new)[1]){
for(j in 2:dim(dat.new)[2]){
if (dat.new[i, (j-1)]==1 & dat.new[i, j]==0) extinctions[i] <- extinctions[i] + 1
if (dat.new[i, (j-1)]==0 & dat.new[i, j]==1) colonizations[i] <- colonizations[i] + 1
}
}
table(colonizations)
table(extinctions)
keep <- c("DataQuery", "SiteList", "yframe", "StandardizationValues")
ls()[!(ls() %in% keep)]
rm(list = ls()[!(ls() %in% keep)])
save(DataQuery, file = "DataQuery.rda")
save(SiteList, file = "SiteList.rda")
save(yframe, file = "yframe.rda")
save(StandardizationValues, file = "StandardizationValues.rda")