# Functions for data provided by NOAA Ecosystem Sciences Division
# Stationary point count survey data collected for the NOAA Pacific Reef
# Assessment and Monitoring Program
# 2010-2016 only
# anonymous diver and cleaned data to be shared publicly
## To cite data:
## Heenan, A., Williams, I., Acoba, T., DesRochers, A., Kosaki, R.,
## Kanemura, T., Nadon, M., Brainard, R. Figshare
## https://doi.org/10.6084/m9.figshare.c.3808039 (2017)
## To cite paper:
## Heenan, A., Williams, I., Acoba, T., DesRochers, A., Kosaki, R.,
## Kanemura, T., Nadon, M., Brainard, R. (2017) Long term monitoring of coral
## reef fish assemblages in the Western Central Pacific. Scientific Data

### -------------------------------------------------------------
### GENERATE SITE SURVEY METADATA FROM REP LEVEL DATA
# To use: Aggregate_InputTable(datafile, list of fields to be included)
# Returns dataframe of site level survey data
### -------------------------------------------------------------
Aggregate_InputTable <- function(x, field_list) {
  # function assumes that x is a data frame looking like our standard input
  # (it must have a COUNT column plus every column named in field_list)
  # field_list is the list of fields to include (could be everything relating
  # to each survey, or everything relating to a fish species)
  # function returns a data frame with one row per distinct combination of
  # the field_list values

  # aggregate sums total count of all fishes per record, using field_list.
  # drop = FALSE keeps 'by' a data frame (i.e. a list) even when field_list
  # names a single column, which aggregate() requires.
  y <- aggregate(x$COUNT, by = x[, field_list, drop = FALSE], sum)

  # drop the count - was just using that to generate a summary table
  y <- y[, field_list, drop = FALSE]
  return(y)
} # End Aggregate_InputTables

### -------------------------------------------------------------
### FILL MISSING DATA FIELDS (NAs) TO AVERAGE OF NON-NA values
# When a diver failed to record benthic or survey data we set it to the mean
# value from the other diver in the REP
# Support function for Calc_Site_nSurveysArea functions
# Will not be called directly by user
#
# x                  data frame with a REPLICATEID column, the rep_fields and
#                    the survey_data_fields
# rep_fields         names of the columns that together identify a replicate
# survey_data_fields names of the numeric columns whose NAs are to be filled
# Returns x with each NA in survey_data_fields replaced by the mean of the
# non-NA values of that field within the same replicate (NaN when the whole
# replicate is NA for that field).
### -------------------------------------------------------------
fill.NA.count <- function(x, rep_fields, survey_data_fields) {
  # nothing to fill, and aggregate() refuses empty input
  if (nrow(x) == 0) {
    return(x)
  }

  # one row per distinct combination of rep_fields. Only the leading key
  # columns are used; the length() summary itself is ignored.
  all.reps <- aggregate(x[, 1, drop = FALSE],
                        by = x[, rep_fields, drop = FALSE], length)
  rep_keys <- all.reps[, seq_along(rep_fields), drop = FALSE]
  names(rep_keys) <- rep_fields

  # go through rep by rep, setting NAs to average of non-NA values
  for (i in seq_len(nrow(rep_keys))) {
    # rows of x belonging to replicate i (rows with NA keys never match)
    in_rep <- rep(TRUE, nrow(x))
    for (fld in rep_fields) {
      in_rep <- in_rep & (x[[fld]] == rep_keys[i, fld])
    }
    yy <- x[which(in_rep), , drop = FALSE]

    # determine the means of values with data
    good.data <- colMeans(yy[, survey_data_fields, drop = FALSE], na.rm = TRUE)

    for (k in seq_along(survey_data_fields)) {
      fld <- survey_data_fields[k]
      yy[is.na(yy[[fld]]), fld] <- good.data[k]
    }

    # now write yy back into x, matching rows on REPLICATEID
    for (m in seq_len(nrow(yy))) {
      x[x$REPLICATEID == yy$REPLICATEID[m], survey_data_fields] <-
        yy[m, survey_data_fields]
    }
  }

  return(x)
}

### -------------------------------------------------------------
### GENERATE REP LEVEL COMPLEXITY FROM HEIGHT BIN ESTIMATES
## To use: CalcMeanSHMeanSHDiff(datafile)
## returns a list with 3 elements MEAN_SH, MEAN_SH_DIFF, SD_SH_DIFF
## MEAN_SH is the weighted mean of the substrate height categories ..
## SUBSTRATE_HEIGHTS_xx used to weight the mean of each height class (lowest
## class: 0->20cm mean:10cm; next: 20-50cm mean:35cm; for largest class
## (>150cm), mean is average of 150cm and MAX_HEIGHT)
## MEAN_SH_DIFF is a measure of variability in height. In this case it is a
## weighted mean of the difference between the height class averages and the
## MEAN_SH weighted in the same way as above thus if mean height is 0.43m,
## then substrate in the 0-20cm category (mean height of this class is 10cm)
## is 0.33m below the mean, substrate in the 20-50cm height class is 0.08m
## below the mean (mean is 0.35m), etc..
## MEAN_SH_DIFF is the weighted means of the absolute values of the
## differences in average substrate heights, again weighted by the amount of
## substrate in each substrate height class
## SD_SH_DIFF is the square root of the weighted mean of the squared
## differences, using the same weights
## Records with an NA in any SUBSTRATE_HEIGHT_xx field get NaN for all three
### -------------------------------------------------------------
CalcMeanSHMeanSHDiff <- function(xx) {
  sh_cols <- c("SUBSTRATE_HEIGHT_0", "SUBSTRATE_HEIGHT_20",
               "SUBSTRATE_HEIGHT_50", "SUBSTRATE_HEIGHT_100",
               "SUBSTRATE_HEIGHT_150")

  # temp function to convert a string to a numeric and return 0 where it was NA
  clean_vals <- function(vals) {
    vals <- as.double(vals)
    vals[is.na(vals)] <- 0
    return(vals)
  } # end clean_vals

  # first determine whether there is substrate height info: a record counts
  # as having data only when none of its five height classes is NA.
  # Converting to numeric before summing lets string columns through.
  raw_sh <- xx[, sh_cols]
  raw_sh[] <- lapply(raw_sh, as.double)
  sh_data_exists <- !is.na(rowSums(raw_sh))

  max_height <- clean_vals(xx$MAX_HEIGHT)

  # create simpler substrate_height df to work with (NAs set to 0)
  sh <- raw_sh
  sh[] <- lapply(sh, clean_vals)

  # SUBSTRATE_HEIGHTS_XX should total to either 0 or 100 - next lines check
  # that, counting records (rows), not columns
  sh.tot <- rowSums(sh)
  num.bad <- sum(!sh.tot %in% c(0, 100))
  if (num.bad > 0) {
    print(paste("Records with substrate not adding to 0 or 100 is", num.bad))
  }
  n_records <- length(sh.tot)

  # create temp array to save weighting values for the substrate height
  # totals: the mid-point (in m) of each height class. rep() keeps this
  # valid for a zero-row input.
  mh <- sh
  mh$SUBSTRATE_HEIGHT_0 <- rep(0.1, n_records)
  mh$SUBSTRATE_HEIGHT_20 <- rep(0.35, n_records)
  mh$SUBSTRATE_HEIGHT_50 <- rep(0.75, n_records)
  mh$SUBSTRATE_HEIGHT_100 <- rep(1.25, n_records)
  # top class: average of 1.5m and MAX_HEIGHT (a missing MAX_HEIGHT was set
  # to 0 by clean_vals above)
  mh$SUBSTRATE_HEIGHT_150 <- (1.5 + max_height) / 2

  # calculate the mean by multiplying the weighting (sh) by the height
  # mid-points
  MEAN_SH <- rowSums(sh * mh) / 100

  # NOW CALCULATE THE MEAN DIFF FROM THE MEAN
  sh_diff <- abs(mh - MEAN_SH)
  MEAN_SH_DIFF <- rowSums(sh_diff * sh) / 100
  SD_SH_DIFF <- sqrt(rowSums((sh_diff^2) * (sh / 100)))

  MEAN_SH[!sh_data_exists] <- NaN
  MEAN_SH_DIFF[!sh_data_exists] <- NaN
  SD_SH_DIFF[!sh_data_exists] <- NaN

  list1 <- list(MEAN_SH, MEAN_SH_DIFF, SD_SH_DIFF)
  names(list1) <- c("MEAN_SH", "MEAN_SH_DIFF", "SD_SH_DIFF")
  return(list1)
} # End CalcMeanSHMeanSHDiff

### -------------------------------------------------------------
### GENERATE SITE DEPTH AND BENTHIC INFO FROM REP LEVEL DATA
# To use: Calc_Site_nSurveysArea(datafile, survey_id_fields, rep_fields,
#                                count_fields, survey_data_fields)
# survey_data_fields generally will be depth, coral cover, complexity etc...)
# returns dataframe with SITEVISITID, METHOD and SITE survey data fields
# nCounts = number of individual cylinder surveys per site
# nReps = number paired cylinders per site
### -------------------------------------------------------------
Calc_Site_nSurveysArea <- function(x, survey_id_fields, rep_fields,
                                   count_fields, survey_data_fields) {
  # function assumes that x is a data frame with at least the fields mentioned
  # in parameters survey_id_fields to survey_data_fields
  # survey_id_fields are fields that identify unique site-survey (now ..
  #   SiteVisitId and Method)
  # rep_fields are fields that identify the replicate BLT transect or nSPC
  #   pair (generally survey_id_fields plus "A","B","C")
  # count_fields are fields needed to sum/mean by the specific count (e.g.
  #   nSPC count as part of a SPC-pair, or one side of a single transect);
  #   fill.NA.count() matches rows on REPLICATEID, so it must be among them
  # survey_data_fields are the fields that we want to get mean per survey
  #   eventually (after pooling up in this function)
  # function returns a data frame with survey_id_fields, means of all the
  # data columns, and nCounts / nReps
  RETURN_FIELDS <- c("nCounts", "nReps")

  # first average survey_data_fields for all replicate-counts
  # (drop = FALSE keeps single-field selections as data frames)
  y <- aggregate(x[, survey_data_fields, drop = FALSE],
                 by = x[, count_fields, drop = FALSE], mean)

  # idw if a rep has NAs in the data fields, then try to set them to the
  # average of other counts in that rep
  y <- fill.NA.count(y, rep_fields, survey_data_fields)
  y$nCounts <- 1
  y$nReps <- 1
  names(y) <- c(count_fields, survey_data_fields, RETURN_FIELDS)

  # pool by Rep ("A","B","C" generally), then by survey (i.e. by SiteVisitID
  # and Method). Sums give the counts, means give the data values.
  idx_first_data_field <- length(count_fields) + 1
  data_cols <- idx_first_data_field:ncol(y)
  z <- aggregate(y[, data_cols], by = y[, rep_fields, drop = FALSE], sum)
  y <- aggregate(y[, data_cols], by = y[, rep_fields, drop = FALSE], mean)
  y$nCounts <- z$nCounts

  idx_first_data_field <- length(rep_fields) + 1
  data_cols <- idx_first_data_field:ncol(y)
  z <- aggregate(y[, data_cols], by = y[, survey_id_fields, drop = FALSE], sum)
  y <- aggregate(y[, data_cols], by = y[, survey_id_fields, drop = FALSE],
                 mean)
  y$nReps <- z$nReps
  y$nCounts <- z$nCounts

  return(y)
} # End Calc_Site_nSurveysArea

### -------------------------------------------------------------
### GENERATE SITE FISH BIOMASS
# To use: Calc_Site_Bio(datafile, grouping_field)
# grouping_field could be CONSUMER_GROUP, SPECIES, COMMON_FAMILY must be in
# datafile
# returns dataframe with SITEVISITID, METHOD and MEAN SITE biomass (g per
# meter2 per grouping_field)
# NOTE(review): cast() is not defined in this file - presumably
# reshape::cast, which the caller must have attached; confirm.
### -------------------------------------------------------------
Calc_Site_Bio <- function(x, grouping_field) {
  # add a biomass (g per m2) field to x
  x$Bio_gm2 <- Calc_Biomassgm2(x)
  x$GROUP <- x[, grouping_field]

  # Replicate ID is the base unit .. so pool up biomass at ReplicateID level,
  # for the field of interest
  # minimum set of fields to build up from
  base_cols <- c("SITEVISITID", "METHOD", "REP", "REPLICATEID")
  # minimum set, plus the one we are interested in
  pool_cols <- c(base_cols, "GROUP")

  # first calculate total biomass per rep for all values of this field
  y <- aggregate(x$Bio_gm2, by = x[, pool_cols], sum)
  names(y) <- c(pool_cols, "Bio_gm2")

  # now format this more or less as a crosstab, with field of interest as
  # column variable
  y <- cast(y, SITEVISITID + METHOD + REP + REPLICATEID ~ GROUP,
            fun.aggregate = sum, value = "Bio_gm2", fill = 0)

  # pool by Rep ("A","B","C" generally), then by site-survey (i.e. by
  # SiteVisitID and Method)
  num_row_cols <- length(base_cols)
  pool_cols <- c("SITEVISITID", "METHOD", "REP")
  y <- aggregate(y[, (num_row_cols + 1):dim(y)[2]], by = y[, pool_cols], mean)

  num_row_cols <- length(pool_cols) # working data now has fewer columns
  pool_cols <- c("SITEVISITID", "METHOD")
  y <- aggregate(y[, (num_row_cols + 1):dim(y)[2]], by = y[, pool_cols], mean)

  return(y)
} # End Calc_Site_Bio

### -------------------------------------------------------------
### CALCULATE BIOMASS PER M2 PER RECORD
# Support function for Calc_Site_Bio functions
# Will not be called directly by user
# Reads LW_A, LW_B, LENGTH_CONVERSION_FACTOR, COUNT, METHOD and the size
# column of x; returns one biomass-per-area value per row of x.
### -------------------------------------------------------------
Calc_Biomassgm2 <- function(x) {
  # IDW return y .. do not modify x inside the function .. just pass out
  # biomassgm2
  # do this elsewhere - keep this function doing one thing - calculating
  # biomassgm2: y$Srvy.Yr<-as.factor(y$Srvy.Yr)

  # NOTE(review): x$SIZE_ only finds a column called SIZE_TL_CM through `$`
  # partial matching on plain data frames - confirm the intended column name.
  Biomassperfish <- x$LW_A * ((x$SIZE_ * x$LENGTH_CONVERSION_FACTOR)^x$LW_B)
  Biomassperrecord <- Biomassperfish * x$COUNT

  # survey area: nSPC methods use a circle of radius 7.5; any other method
  # uses 50 for fish smaller than 20 and 100 otherwise
  Area <- ifelse(x$METHOD %in% c("nSPC", "nSPC-CCR"),
                 pi * (7.5^2),
                 ifelse(x$SIZE_ < 20, 50, 100))

  return(Biomassperrecord / Area)
} # End Calc_Biomassgm2

### -------------------------------------------------------------
### CALCULATE SITE FISH ABUNDANCE
# To use: Calc_Site_Abund(datafile, grouping_field)
# grouping_field could be CONSUMER_GROUP, SPECIES, COMMON_FAMILY must be in
# datafile
# Returns dataframe with SITEVISITID, METHOD and MEAN SITE abundance (number
# per meter2 per grouping_field)
# NOTE(review): cast() is not defined in this file - presumably
# reshape::cast, which the caller must have attached; confirm.
### -------------------------------------------------------------
Calc_Site_Abund <- function(x, grouping_field) {
  # add an Abundance m2 field to x
  x$Abund_m2 <- Calc_Abundm2(x)

  # create pooling field to use in cast functions below
  x$GROUP <- x[, grouping_field]

  # Replicate ID is the base unit .. so pool up abundance at ReplicateID
  # level, for the grouping field passed in
  # minimum set of fields to build up from
  base_cols <- c("SITEVISITID", "METHOD", "REP", "REPLICATEID")
  # minimum set, plus the one we are interested in
  pool_cols <- c(base_cols, "GROUP")

  # first calculate total abundance per rep for all values of this field
  y <- aggregate(x$Abund_m2, by = x[, pool_cols], sum)
  names(y) <- c(pool_cols, "Abund_m2")

  # now format this more or less as a crosstab, with field of interest as
  # column variable
  y <- cast(y, SITEVISITID + METHOD + REP + REPLICATEID ~ GROUP,
            fun.aggregate = sum, value = "Abund_m2", fill = 0)

  # pool by Rep ("A","B","C" generally), then by site-survey (i.e. by
  # SiteVisitID and Method)
  num_row_cols <- length(base_cols)
  pool_cols <- c("SITEVISITID", "METHOD", "REP")
  y <- aggregate(y[, (num_row_cols + 1):dim(y)[2]], by = y[, pool_cols], mean)

  num_row_cols <- length(pool_cols) # working data now has fewer columns
  pool_cols <- c("SITEVISITID", "METHOD")
  y <- aggregate(y[, (num_row_cols + 1):dim(y)[2]], by = y[, pool_cols], mean)

  return(y)
} # End Calc_Site_Abund

### -------------------------------------------------------------
### CALCULATE ABUNDANCE PER M2 PER RECORD
# Support function for Calc_Site_Abund functions
# Will not be called directly by user
# Reads COUNT, METHOD and the size column of x; returns one
# count-per-area value per row of x.
### -------------------------------------------------------------
Calc_Abundm2 <- function(x) {
  # IDW return y .. do not modify x inside the function .. just pass out
  # abundance per m2
  # do this elsewhere - keep this function doing one thing - calculating
  # abundance per m2: y$Srvy.Yr<-as.factor(y$Srvy.Yr)

  # NOTE(review): x$SIZE_ only finds a column called SIZE_TL_CM through `$`
  # partial matching on plain data frames - confirm the intended column name.
  # Same survey areas as Calc_Biomassgm2.
  Area <- ifelse(x$METHOD %in% c("nSPC", "nSPC-CCR"),
                 pi * (7.5^2),
                 ifelse(x$SIZE_ < 20, 50, 100))

  return(x$COUNT / Area)
} # End Calc_Abundm2

### -------------------------------------------------------------
### CALCULATE BIOMASS PER M2 PER SIZE CLASS
# To use: Calc_Site_Bio_By_SizeClass(datafile, size_classes)
# size_classes is a vector of number cut offs
# returns dataframe with SITEVISITID mean estimates
# NOTE(review): cast() is not defined in this file - presumably
# reshape::cast, which the caller must have attached; confirm.
### -------------------------------------------------------------
Calc_Site_Bio_By_SizeClass <- function(x,
                                       size_classes = c(0, 10, 20, 30, 40, 50,
                                                        Inf)) {
  # add a Biomassgm2 field to x
  x$Bio_gm2 <- Calc_Biomassgm2(x)

  ## add a size class field to x
  x$sizeclass <- cut(x$SIZE_TL_CM, breaks = size_classes,
                     include.lowest = TRUE)

  # Replicate ID is the base unit .. so pool up biomass at ReplicateID level,
  # for the field of interest
  field_of_interest <- c("sizeclass") # this can later be a function parameter
  # minimum set of fields to build up from
  base_cols <- c("SITEVISITID", "METHOD", "REP", "REPLICATEID")
  # minimum set, plus the one we are interested in
  pool_cols <- c(base_cols, field_of_interest)

  # first calculate total biomass per rep for all values of this field
  y <- aggregate(x$Bio_gm2, by = x[, pool_cols], sum)
  names(y) <- c(pool_cols, "Bio_gm2")

  # now format this more or less as a crosstab, with field of interest as
  # column variable
  y <- cast(y, SITEVISITID + METHOD + REP + REPLICATEID ~ sizeclass,
            fun.aggregate = sum, value = "Bio_gm2", fill = 0)

  # pool by Rep ("A","B","C" generally), then by site-survey (i.e. by
  # SiteVisitID and Method)
  num_row_cols <- length(base_cols)
  pool_cols <- c("SITEVISITID", "METHOD", "REP")
  y <- aggregate(y[, (num_row_cols + 1):dim(y)[2]], by = y[, pool_cols], mean)

  num_row_cols <- length(pool_cols) # working data now has fewer columns
  pool_cols <- c("SITEVISITID", "METHOD")
  y <- aggregate(y[, (num_row_cols + 1):dim(y)[2]], by = y[, pool_cols], mean)

  return(y)
} # End Calc_Site_Bio_By_SizeClass

### -------------------------------------------------------------
### CALCULATE ABUNDANCE PER M2 PER SIZE CLASS
# To use: Calc_Site_Abund_By_SizeClass(datafile, size_classes)
# datafile contains base_cols ("SITEVISITID","METHOD","REP", "REPLICATEID")
# and pool cols
# size_classes is a vector to create size classes
# returns dataframe with SITEVISITID mean estimates
# NOTE(review): cast() is not defined in this file - presumably
# reshape::cast, which the caller must have attached; confirm.
### -------------------------------------------------------------
Calc_Site_Abund_By_SizeClass <- function(x,
                                         size_classes = c(0, 10, 20, 30, 40,
                                                          50, Inf)) {
  # add an abundance field to x
  x$Abund_m2 <- Calc_Abundm2(x)

  ## add a size class field to x
  x$sizeclass <- cut(x$SIZE_TL_CM, breaks = size_classes,
                     include.lowest = TRUE)

  # Replicate ID is the base unit .. so pool up abundance at ReplicateID
  # level, for the field of interest
  field_of_interest <- c("sizeclass") # this can later be a function parameter
  # minimum set of fields to build up from
  base_cols <- c("SITEVISITID", "METHOD", "REP", "REPLICATEID")
  # minimum set, plus the one we are interested in
  pool_cols <- c(base_cols, field_of_interest)

  # first calculate total abundance per rep for all values of this field
  y <- aggregate(x$Abund_m2, by = x[, pool_cols], sum)
  names(y) <- c(pool_cols, "Abund_m2")

  # now format this more or less as a crosstab, with field of interest as
  # column variable
  y <- cast(y, SITEVISITID + METHOD + REP + REPLICATEID ~ sizeclass,
            fun.aggregate = sum, value = "Abund_m2", fill = 0)

  # pool by Rep ("A","B","C" generally), then by site-survey (i.e. by
  # SiteVisitID and Method)
  num_row_cols <- length(base_cols)
  pool_cols <- c("SITEVISITID", "METHOD", "REP")
  y <- aggregate(y[, (num_row_cols + 1):dim(y)[2]], by = y[, pool_cols], mean)

  num_row_cols <- length(pool_cols) # working data now has fewer columns
  pool_cols <- c("SITEVISITID", "METHOD")
  y <- aggregate(y[, (num_row_cols + 1):dim(y)[2]], by = y[, pool_cols], mean)

  return(y)
} # End Calc_Site_Abund_By_SizeClass

### -------------------------------------------------------------
### CALCULATE MEAN LENGTH OF FISH
# To use: Calc_Site_MeanLength(datafile, minimum size)
# datafile contains base_cols (SITEVISITID, METHOD) and SIZE_TL_CM, COUNT
# fish smaller than minimum size are not included i.e removes small recruits
# returns dataframe with SITEVISITID, METHOD and MEAN_SIZE
### -------------------------------------------------------------
Calc_Site_MeanLength <- function(x, min_size = 1) {
  # function assumes that x is a data frame with at least the columns
  # listed in base_cols, plus SIZE_TL_CM and COUNT
  # function returns a data frame with SITEVISITID, METHOD, and the
  # count-weighted mean size of fish (MEAN_SIZE, from SIZE_TL_CM); MEAN_SIZE
  # is NaN for a survey whose remaining count is zero

  # Base unit will be the entire survey
  base_cols <- c("SITEVISITID", "METHOD")
  pool_cols <- c(base_cols, "SIZE_TL_CM")

  # set count to zero for all sizes smaller than min size. Indexing the
  # column (rather than x[rows, ]$COUNT) is safe when no row qualifies.
  too_small <- which(x$SIZE_TL_CM < min_size)
  x$COUNT[too_small] <- 0

  # sum total number of fishes per size
  y <- aggregate(x$COUNT, by = x[, pool_cols], sum)
  names(y) <- c(pool_cols, "COUNT")
  y$CS <- y$COUNT * y$SIZE_TL_CM

  # now total the counts and the count*size products per survey
  y <- aggregate(y[, c("COUNT", "CS")], by = y[, base_cols], sum)
  y$MEAN_SIZE <- y$CS / y$COUNT

  return(y[, c(base_cols, "MEAN_SIZE")])
} # End Calc_Site_MeanLength

### -------------------------------------------------------------
### CALCULATE FISH SPECIES RICHNESS
# To use: Calc_Site_Species_Rich(datafile)
# Datafile contains cols "SITEVISITID", "METHOD", "REP", "SPECIES", "COUNT"
# Species here means the number of unique species
# Returns dataframe with SITEVISITID, METHOD and SPECIESRICHNESS (the mean
# over reps of the number of species with a non-zero count)
### -------------------------------------------------------------
Calc_Site_Species_Rich <- function(x) {
  # Modification of standard Calc_Site_Species_Richness to not count species
  # with zero counts (as they can be left in data file to ensure that the
  # site has data records at all)

  # convert to count per species per rep
  y <- aggregate(x$COUNT,
                 by = x[, c("SITEVISITID", "METHOD", "REP", "SPECIES")], sum)

  # convert any non-zero count to 1, so we can sum those to get total number
  # of species with count>0. Indexing the column is safe when no count
  # exceeds 1.
  y$x[which(y$x > 1)] <- 1

  # count number of species with non-zero counts this REP
  z <- aggregate(y$x, by = y[, c("SITEVISITID", "METHOD", "REP")], sum)

  # average the per-rep species counts over the reps of each site-survey
  xx <- aggregate(z$x, by = z[, c("SITEVISITID", "METHOD")], mean)
  names(xx) <- c("SITEVISITID", "METHOD", "SPECIESRICHNESS")

  return(xx)
} # end Calc_Site_Species_Rich