#File S1
##############   Wild-ID matching- this is for the Wdump scores output ###################
################### Nathan F. Bendik, City of Austin, 10-8-2012 ##########################
####################### Special thanks to  Josh O'Brien ##################################

# NOTES
# The purpose of this R script is to generate a capture-history file (e.g. for input into MARK) based 
# on matching photo scores alone. Thus, only if Wild-ID scores are exceptionally predictive of a correct match, 
# will this be useful. However, one can also examine marginal scores and parse out a subset of matches to check: 
# for exmaple if it is known that scores >0.2 are always a correct match, but socres from 0.1 to 0.2 are only 
# sometimes a match, those marginal scores can be output and those photos manually compared from a photo database. 
# This can save time compared to the default process in Wild-ID in cases where the user has an exceptionally large dataset 
# (e.g. thousands of images), and/or with large groups of known non-matches (e.g. males and females, discrete capture 
# sessions where duplicates are known and/or avoided). 

# INPUT: a text file output by Java program Wdump written by Bennett Vance at Dartmouth.  
# Wdump is run following the form java -jar Wdump-552.jar WILD-ID-DB-FOLDER CROPPED-PHOTOS-FOLDER where the FOLDERS represent 
# the relative or absolute path (note- these must be in quotes using the command prompt in Windows).  
# Wdump outputes the "scores.tsv" file. Wdump can be found in the Wild-ID .zip download folder, here:
# http://www.dartmouth.edu/~envs/faculty/bolger.html

# PHOTO NAMES: the other necessary component for this script to work properly is that the photos analysed by Wild-ID 
# must be numerical names with the first 8 characters being the date in the YYYYMMDD format, and the remaining characters 
# of importance are the unique photo number (in this case, as assigned by the camera). I used a free batch-renaming software 
# program to accomplish this, but can also be done directly in R.  The code below strips out additional characters such as 
# DSC or .jpg that commonly show up in file names, although this can also be removed using any free batch renaming software 
# as well. The user may choose to use a different dating and naming scheme, but the R script will have to be adjusted accordingly. 
# Additionally, this also requires that photos were scored by Wild-ID in chronological order. 

# DUPLICATES: Only a single photo per individual per sampling period is allowed. If multiple photos from the same individual 
# on the same day exist when Wild-ID computes scores, this code will not work correctly, although it could generate output 
# that may look ok, it is not! Thus, 

### Scores file is dumped into the specified databased folder by Wdump
library(Matrix)
library(igraph)
library(stringr)
rm(list=ls(all=TRUE))
setwd('C:/whatevers') #your working directory here
scores=read.table("scores.tsv", blank.lines.skip = TRUE, fill= TRUE, header= FALSE) 

### Add headers
names(scores)[1] <- "photo"
names(scores)[2] <- "compared"
names(scores)[3] <- "score"
length(scores$photo) #how big is our dataset?

### Extract photo numbers
photo_no <- gsub("[a-zA-Z.-._]","",scores$photo)
compared_no <- gsub("[a-zA-Z.-._]","",scores$compared)
photo_no <-  substr(photo_no,9,13)
compared_no <-  substr(compared_no,9,13)
photo_no <- as.numeric(photo_no)
compared_no <- as.numeric(compared_no)
scores$photo_no <- photo_no
scores$compared_no <- compared_no

datematch <- as.Date((substr(scores$compared,0,8)),"%Y%m%d") #datematch is date from compared photo
datematch2 <- as.Date((substr(scores$photo,0,8)),"%Y%m%d")   #datematch2 is date from focal photo
datediff <- datematch2-datematch
scores$datediffs <- datediff

length(scores$score)

### Get rid of same-day matches (e.g. day 1 to day 1 comparisons)
matches1 <- scores[which(scores$datediffs!=0),]
matches1$day <- as.Date((substr(matches1$compared,0,8)),"%Y%m%d")

### Then sort by highest match and examine rank-1 matches only
x <- matches1$photo
y <- matches1$compared
z <- matches1$score
DF <- data.frame(x,y,z)
o <- order(x,-z)                  # determine order of values by photo name, then descending by score
o2 <- data.frame(x[o],y[o],z[o])  # apply that order to the data; 
kk <- match(o2$x.o.,x)            # get position of the highest value for each group
matches <- unique(o2[kk,])        # list of match for highest score for each photo pair, i.e. rank 1 photos 
matches$rownum <- 1:nrow(matches)
names(matches)[1] <- "photo"       #rename the values back to normal
names(matches)[2] <- "compared"
names(matches)[3] <- "score"
hist(matches$score, breaks=50)

### Since high score does not necessarily translate to a true match, we examine the data and filter 
# by a threshold value based on real-world experience with photos of known matches (known from double marks).  
# Alternatively, use this to examine potential gaps in the histogram that may  
# indicate where manual matching effort should be focused.

cutoff <- 0.1                              # Example: score cutoff of 0.1, test top 100 matches <0.1 by printing 1:100
x <- matches[which(matches$score<cutoff),] # Extract all rows below score threshold
ox <- order(-x$score)                      # sort matches...
matches_to_check <- x[ox,]                 # sort matches by score...
print(matches_to_check[1:100,])            
# Display and evaluate marginal scores manually in Adobe Lightroom (or other image analysis tool); 
# The resuts can be copy-pasted directly into search field

### Set matches we wish to use to construct capture histories
matched <- matches[which(matches$score>.04),] # Set the threshold value AND if necessary add or remove photos from dataset:
# matched <- matched[-INDEX,]                 # place photo number to remove from -INDEX, for example a false acceptance above score cutoff
# matched <- rbind(matched,matches[INDEX,])   # Add a line for each exception (a true match below the score cutoff)

### Generate list of unique photos
combine <- c(levels(scores$photo),levels(scores$compared))  # must combine b/c photo list is missing first and compared list is missing last 
uniquepics <- as.data.frame(unique(combine))
uniquepics <- data.frame(lapply(uniquepics, as.character), stringsAsFactors=FALSE) # get rid of factors

pics<-data.frame(uniquepics$unique)
names(pics)[1] <- "photo"
DF <- subset(merge(matched,pics,by="photo",all=TRUE),!is.na(photo))  # merge and include all uniques, not just matches

dates <- substr(DF$photo,0,8)     # substring first 8 digits of filename for date (this working depends on original photo name)
uniquedates <- unique(dates)      # unique dates across all samples
day <- seq(1:length(uniquedates)) # sequence of unique dates (this won't work if out of order so should check to make sure always in order)
temp <-data.frame(uniquedates,day)# temporary dataframe to hold the unique dates and the associated "days" 
DF$dates <- dates
DF <- merge(DF,temp,by.x="dates",by.y="uniquedates",all=TRUE) #marry the days to DF
DF$dates<-NULL                    # drop the date variable

DF$compared <- gsub("[a-zA-Z.-._]","",DF$compared) # some cleanup: keep date_photo#, drop other characters 
DF$photo <- gsub("[a-zA-Z.-._]","",DF$photo)
photo <- as.character(DF$photo) 
compared <- as.character(DF$compared) 

photos <- as.character(unique(DF$photo)) 
n <- length(photos) 
pairs <- subset(DF, !is.na(compared),     # logical expression indicating elements or rows to keep: missing values are taken as false
         select = c("photo", "compared")) # expression, indicating columns to select from a data frame.
pairs[] <- lapply(pairs, FUN=function(X) match(X, photos)) 
M <- 1 * with(pairs, sparseMatrix (i = c(seq_len(n), photo), 
                                   j = c(seq_len(n), compared)))
                                  
### Extract vectors of photos of the same individual 
(clust <- clusters(graph.adjacency(adjmatrix=M)))

### Process results of clustering to construct output data.frame
DF2 <- cbind(individual = clust$membership, 
             subset(DF, !duplicated(photo), select=c("photo", "day"))) 
			 grps <- tapply(DF2$photo, DF2$individual, paste, collapse=",") 
			 days <- tapply(DF2$day, DF2$individual, FUN=function(X) paste((1 * unique(DF$day) %in% X), collapse="")) 
		  
out <- data.frame(individual = as.numeric(names(grps)), photos = grps, days=days)
write.table(out, file="output.txt", sep='\t',append=F)

### this code will split the list of photos in the photos column into several separate columns and then rejoin 
### to the output dataset

n=length(uniquedates)
split_photos <- as.data.frame((str_split_fixed(out$photos, ",", n)))
for (i in 1:n){ names(split_photos)[i] <- paste("day",i) }
bunch <- data.frame(out,split_photos)
write.table(bunch, file="scores_capture_history_detail.txt", sep='\t',append=F) 

########## GENERATES MARK .inp FILE  ############
out$semi=as.character(";") #add a semicolon (MARK requires this)
out$count=(1) #add the number one (MARK requires this) representing one history record per individual
attach(out)
output <- paste(days,count,semi)
detach(out)
write(output,file="Wdump_scores_capture_history.inp") 
#################################################