#File S1 ############## Wild-ID matching- this is for the Wdump scores output ################### ################### Nathan F. Bendik, City of Austin, 10-8-2012 ########################## ####################### Special thanks to Josh O'Brien ################################## # NOTES # The purpose of this R script is to generate a capture-history file (e.g. for input into MARK) based # on matching photo scores alone. Thus, only if Wild-ID scores are exceptionally predictive of a correct match, # will this be useful. However, one can also examine marginal scores and parse out a subset of matches to check: # for exmaple if it is known that scores >0.2 are always a correct match, but socres from 0.1 to 0.2 are only # sometimes a match, those marginal scores can be output and those photos manually compared from a photo database. # This can save time compared to the default process in Wild-ID in cases where the user has an exceptionally large dataset # (e.g. thousands of images), and/or with large groups of known non-matches (e.g. males and females, discrete capture # sessions where duplicates are known and/or avoided). # INPUT: a text file output by Java program Wdump written by Bennett Vance at Dartmouth. # Wdump is run following the form java -jar Wdump-552.jar WILD-ID-DB-FOLDER CROPPED-PHOTOS-FOLDER where the FOLDERS represent # the relative or absolute path (note- these must be in quotes using the command prompt in Windows). # Wdump outputes the "scores.tsv" file. Wdump can be found in the Wild-ID .zip download folder, here: # http://www.dartmouth.edu/~envs/faculty/bolger.html # PHOTO NAMES: the other necessary component for this script to work properly is that the photos analysed by Wild-ID # must be numerical names with the first 8 characters being the date in the YYYYMMDD format, and the remaining characters # of importance are the unique photo number (in this case, as assigned by the camera). I used a free batch-renaming software # program to accomplish this, but can also be done directly in R. The code below strips out additional characters such as # DSC or .jpg that commonly show up in file names, although this can also be removed using any free batch renaming software # as well. The user may choose to use a different dating and naming scheme, but the R script will have to be adjusted accordingly. # Additionally, this also requires that photos were scored by Wild-ID in chronological order. # DUPLICATES: Only a single photo per individual per sampling period is allowed. If multiple photos from the same individual # on the same day exist when Wild-ID computes scores, this code will not work correctly, although it could generate output # that may look ok, it is not! Thus, ### Scores file is dumped into the specified databased folder by Wdump library(Matrix) library(igraph) library(stringr) rm(list=ls(all=TRUE)) setwd('C:/whatevers') #your working directory here scores=read.table("scores.tsv", blank.lines.skip = TRUE, fill= TRUE, header= FALSE) ### Add headers names(scores)[1] <- "photo" names(scores)[2] <- "compared" names(scores)[3] <- "score" length(scores$photo) #how big is our dataset? ### Extract photo numbers photo_no <- gsub("[a-zA-Z.-._]","",scores$photo) compared_no <- gsub("[a-zA-Z.-._]","",scores$compared) photo_no <- substr(photo_no,9,13) compared_no <- substr(compared_no,9,13) photo_no <- as.numeric(photo_no) compared_no <- as.numeric(compared_no) scores$photo_no <- photo_no scores$compared_no <- compared_no datematch <- as.Date((substr(scores$compared,0,8)),"%Y%m%d") #datematch is date from compared photo datematch2 <- as.Date((substr(scores$photo,0,8)),"%Y%m%d") #datematch2 is date from focal photo datediff <- datematch2-datematch scores$datediffs <- datediff length(scores$score) ### Get rid of same-day matches (e.g. day 1 to day 1 comparisons) matches1 <- scores[which(scores$datediffs!=0),] matches1$day <- as.Date((substr(matches1$compared,0,8)),"%Y%m%d") ### Then sort by highest match and examine rank-1 matches only x <- matches1$photo y <- matches1$compared z <- matches1$score DF <- data.frame(x,y,z) o <- order(x,-z) # determine order of values by photo name, then descending by score o2 <- data.frame(x[o],y[o],z[o]) # apply that order to the data; kk <- match(o2$x.o.,x) # get position of the highest value for each group matches <- unique(o2[kk,]) # list of match for highest score for each photo pair, i.e. rank 1 photos matches$rownum <- 1:nrow(matches) names(matches)[1] <- "photo" #rename the values back to normal names(matches)[2] <- "compared" names(matches)[3] <- "score" hist(matches$score, breaks=50) ### Since high score does not necessarily translate to a true match, we examine the data and filter # by a threshold value based on real-world experience with photos of known matches (known from double marks). # Alternatively, use this to examine potential gaps in the histogram that may # indicate where manual matching effort should be focused. cutoff <- 0.1 # Example: score cutoff of 0.1, test top 100 matches <0.1 by printing 1:100 x <- matches[which(matches$score.04),] # Set the threshold value AND if necessary add or remove photos from dataset: # matched <- matched[-INDEX,] # place photo number to remove from -INDEX, for example a false acceptance above score cutoff # matched <- rbind(matched,matches[INDEX,]) # Add a line for each exception (a true match below the score cutoff) ### Generate list of unique photos combine <- c(levels(scores$photo),levels(scores$compared)) # must combine b/c photo list is missing first and compared list is missing last uniquepics <- as.data.frame(unique(combine)) uniquepics <- data.frame(lapply(uniquepics, as.character), stringsAsFactors=FALSE) # get rid of factors pics<-data.frame(uniquepics$unique) names(pics)[1] <- "photo" DF <- subset(merge(matched,pics,by="photo",all=TRUE),!is.na(photo)) # merge and include all uniques, not just matches dates <- substr(DF$photo,0,8) # substring first 8 digits of filename for date (this working depends on original photo name) uniquedates <- unique(dates) # unique dates across all samples day <- seq(1:length(uniquedates)) # sequence of unique dates (this won't work if out of order so should check to make sure always in order) temp <-data.frame(uniquedates,day)# temporary dataframe to hold the unique dates and the associated "days" DF$dates <- dates DF <- merge(DF,temp,by.x="dates",by.y="uniquedates",all=TRUE) #marry the days to DF DF$dates<-NULL # drop the date variable DF$compared <- gsub("[a-zA-Z.-._]","",DF$compared) # some cleanup: keep date_photo#, drop other characters DF$photo <- gsub("[a-zA-Z.-._]","",DF$photo) photo <- as.character(DF$photo) compared <- as.character(DF$compared) photos <- as.character(unique(DF$photo)) n <- length(photos) pairs <- subset(DF, !is.na(compared), # logical expression indicating elements or rows to keep: missing values are taken as false select = c("photo", "compared")) # expression, indicating columns to select from a data frame. pairs[] <- lapply(pairs, FUN=function(X) match(X, photos)) M <- 1 * with(pairs, sparseMatrix (i = c(seq_len(n), photo), j = c(seq_len(n), compared))) ### Extract vectors of photos of the same individual (clust <- clusters(graph.adjacency(adjmatrix=M))) ### Process results of clustering to construct output data.frame DF2 <- cbind(individual = clust$membership, subset(DF, !duplicated(photo), select=c("photo", "day"))) grps <- tapply(DF2$photo, DF2$individual, paste, collapse=",") days <- tapply(DF2$day, DF2$individual, FUN=function(X) paste((1 * unique(DF$day) %in% X), collapse="")) out <- data.frame(individual = as.numeric(names(grps)), photos = grps, days=days) write.table(out, file="output.txt", sep='\t',append=F) ### this code will split the list of photos in the photos column into several separate columns and then rejoin ### to the output dataset n=length(uniquedates) split_photos <- as.data.frame((str_split_fixed(out$photos, ",", n))) for (i in 1:n){ names(split_photos)[i] <- paste("day",i) } bunch <- data.frame(out,split_photos) write.table(bunch, file="scores_capture_history_detail.txt", sep='\t',append=F) ########## GENERATES MARK .inp FILE ############ out$semi=as.character(";") #add a semicolon (MARK requires this) out$count=(1) #add the number one (MARK requires this) representing one history record per individual attach(out) output <- paste(days,count,semi) detach(out) write(output,file="Wdump_scores_capture_history.inp") #################################################