####### Supplemental script 1: Extracting data from Genpept/Genbank files
####### Repeated losses of PRDM9-directed recombination despite the conservation of PRDM9 across vertebrates
####### Authors: Zachary Baker, Molly Schumer, Yuki Haba, Lisa Bashkirova, Chris Holland, Gil G. Rosenthal, and M. Przeworski
####### To whom correspondence should be addressed: ztb2002@columbia.edu and mp3284@columbia.edu

library(msa)
library(seqinr)
## Region names whose coordinates are looked up in each GenPept record
## (default `domains` argument of convertGenPept).
PRDM9.domains = c("KRAB", "KRAB_A-box", "SSXRD", "SET")
## Root folder for all input/output files. File names are appended to it
## without a separator, so it must end with "/".
directory = "/Users/NAME/"
####################################################################################
##### Functions
####################################################################################
## convertGenPept
#  Parses ONE GenPept (mode = "pept") or GenBank (any other mode) flat-file record.
#  rawfile: character vector, one element per line of the record (as read by readLines)
#  domains: region names looked up via their /region_name="..." qualifier (GenPept only);
#           each name is pasted into a grep pattern, so it is interpreted as a regex
#  mode:    "pept" for GenPept records; any other value is handled as GenBank
#  returns: 1-row character matrix (row name "result") with columns
#           GenPept: NP_number, Gene, aa_length, definition, version, NM_number, organism,
#                    cds.start, cds.end, <domain>-start/<domain>-end for each domain, aa_seq
#           GenBank: NM_number, Gene, dna_length, definition, version, organism,
#                    cds.start, cds.end, dnaa_seq
#  Fields that cannot be located are reported as "error" (length as 0, absent domains as 0/0).
convertGenPept = function(rawfile, domains=PRDM9.domains, mode="pept") {
  
  ## helper: split a line on single spaces and drop the empty pieces
  tokens = function(line) {
    pieces = strsplit(line," ")[[1]]
    pieces[pieces != ""]
  }
  
  ## helper: turn a location such as "23..86" into c(start, end).
  ## Always returns exactly two elements so that the column layout of the result
  ## never shifts: a single position "45" becomes c("45","45"); anything that does
  ## not split into one or two pieces (e.g. "join(...)") becomes "error"/"error".
  parseRange = function(location) {
    pieces = strsplit(location,"\\.\\.")[[1]]
    if (length(pieces) == 2) {
      pieces
    } else if (length(pieces) == 1 && !is.na(pieces)) {
      c(pieces, pieces)
    } else {
      c("error","error")
    }
  }
  
  ## pull out LOCUS (2nd token = accession, 3rd token = sequence length)
  LOCUS.pos = grep("LOCUS       ",rawfile)
  if (length(LOCUS.pos) == 1) {
    LOCUS.tokens = tokens(rawfile[LOCUS.pos])
    NP_number = LOCUS.tokens[2]
    aa_length = LOCUS.tokens[3]
  } else {
    NP_number = "error"
    aa_length = 0
  }
  
  ## pull out DEFINITION (may wrap over several lines; it ends where ACCESSION starts)
  DEFINITION.pos = grep("DEFINITION  ",rawfile)
  ACCESSION.pos = grep("ACCESSION   ",rawfile)
  if (length(DEFINITION.pos) == 1 && length(ACCESSION.pos) == 1) {
    DEFINITION.tokens = tokens(paste(rawfile[DEFINITION.pos:(ACCESSION.pos-1)],collapse=""))
    ## [-1] drops the "DEFINITION" keyword and is safe when nothing follows it
    definition = paste(DEFINITION.tokens[-1],collapse=" ")
  } else {
    definition = "error"
  }
  
  ## pull out VERSION
  VERSION.pos = grep("VERSION     ",rawfile)
  if (length(VERSION.pos) == 1) {
    version = tokens(rawfile[VERSION.pos])[2]
  } else {
    version = "error"
  }
  
  ## pull out ORGANISM (everything after the keyword)
  ORGANISM.pos = grep("  ORGANISM  ",rawfile)
  if (length(ORGANISM.pos) == 1) {
    organism = paste(tokens(rawfile[ORGANISM.pos])[-1],collapse=" ")
  } else {
    organism = "error"
  }
  
  ## pull out CDS
  CDS.pos = grep("     CDS             ",rawfile)
  if (length(CDS.pos) == 1) {
    CDS.coords = parseRange(tokens(rawfile[CDS.pos])[2])
  } else {
    CDS.coords = c("error","error")
  }
  
  ## pull out sequence: lines between ORIGIN and "//", minus spaces and position numbers
  ORIGIN.pos = grep("ORIGIN      ",rawfile)+1
  END.pos = which(rawfile == "//")-1
  if ((length(ORIGIN.pos) == 1) && (length(END.pos) == 1)) {
    protein.seq = toupper(gsub("[ 0-9]","",paste(rawfile[ORIGIN.pos:END.pos],collapse="")))
  } else {
    protein.seq = "error"
  }
  
  ## pull out Gene-name (first /gene qualifier, otherwise first /locus_tag qualifier).
  ## The substring offsets skip the 21-space indent plus the qualifier text and
  ## drop the closing quote.
  Gene.pos = grep("                     /gene=\"",rawfile)
  if (length(Gene.pos) > 0) {
    Gene.pos = Gene.pos[1]
    gene.name = substring(rawfile[Gene.pos],29,(nchar(rawfile[Gene.pos])-1))
  } else {
    Gene.pos = grep("                     /locus_tag=\"",rawfile)
    if (length(Gene.pos) > 0) {
      Gene.pos = Gene.pos[1]
      gene.name = substring(rawfile[Gene.pos],34,(nchar(rawfile[Gene.pos])-1))
    } else {
      gene.name = "error"
    }
  }
  
  ## GenPept only
  if (mode == "pept") {
    ## pull out DBSOURCE (4th token of the DBSOURCE line, e.g. the RefSeq NM accession)
    DBSOURCE.pos = grep("DBSOURCE    ",rawfile)
    if (length(DBSOURCE.pos) == 1) {
      NM_number = tokens(rawfile[DBSOURCE.pos])[4]
    } else {
      NM_number = "error"
    }
    
    ## pull out region_name coordinates: the location is read from the line just
    ## above the first matching /region_name qualifier; absent domains are 0/0
    DOMAIN.coords = NULL
    for (domain in domains) {
      region.pos = grep(paste("/region_name=\"",domain,"\"",sep=""),rawfile)
      if (length(region.pos) >= 1 && region.pos[1] > 1) {
        temp.coords = parseRange(tokens(rawfile[region.pos[1]-1])[2])
      } else {
        temp.coords = c("0","0")
      }
      names(temp.coords) = paste(domain,c("start","end"),sep="-")
      DOMAIN.coords = c(DOMAIN.coords, temp.coords)
    }
    
    ## result
    result = c(NP_number, gene.name, aa_length, definition, version, NM_number, organism, CDS.coords, DOMAIN.coords, protein.seq)
    names(result) = c("NP_number","Gene","aa_length","definition","version","NM_number","organism","cds.start","cds.end",names(DOMAIN.coords),"aa_seq")
    result = t(as.data.frame(result))
  } else {
    ## GenBank only
    result = c(NP_number, gene.name, aa_length, definition, version, organism, CDS.coords, protein.seq)
    names(result) = c("NM_number","Gene","dna_length","definition","version","organism","cds.start","cds.end","dnaa_seq")
    result = t(as.data.frame(result))
  }
  
  result
}
## readGenPept
#  Reads a text file holding one or more GenPept/GenBank records separated by
#  blank lines and parses each record with convertGenPept.
#  fl.name: path of the file to read
#  type:    forwarded to convertGenPept as `mode` ("pept" = GenPept, otherwise GenBank)
#  returns: data.frame of character columns with default names (V1, V2, ...),
#           one row per record; an empty data.frame when the file has no records
#  Prints a start message and the index of each record as a progress indicator.
readGenPept = function(fl.name, type = "pept") {
  print("Reading GenPept File")
  ## Reading in genpept file
  all.gp = readLines(fl.name)
  
  ## Separate into list of unique genpept files.
  ## Every blank line starts a new group and only non-blank lines are kept, so
  ## repeated/leading blank lines create no empty records and the last record is
  ## kept even when the file does not end with a blank line.
  is.blank = all.gp == ""
  record.id = cumsum(is.blank)
  gp.files = unname(split(all.gp[!is.blank], record.id[!is.blank]))
  if (length(gp.files) == 0) {
    return(data.frame())
  }
  
  rows = lapply(seq_along(gp.files), function(x) {
    print(x)
    convertGenPept(gp.files[[x]], mode = type)
  })
  
  ## Stack the 1-row matrices; rbind stops with a clear message if records
  ## produced different numbers of fields. Dimnames are dropped so the columns
  ## keep their default V1, V2, ... names.
  result = do.call(rbind, rows)
  dimnames(result) = NULL
  
  result = as.data.frame(result, stringsAsFactors = FALSE)
  result
}
####################################################################################

####################################################################################
# Step 1: Blastp Hsap Nterm PRDM9 sequence against vertebrate RefSeq proteins
####################################################################################
### The query sequence is saved in directory/query.fa
### Threshold: E <= 1e-5
### Download the GenPept file of hits, save as directory/RefSeq-genes/Homo-sapien-prdm9-query-blastp-e-val-cutoff-1e-5.gp.txt
####################################################################################

####################################################################################
# Step 2: Retrieve information from GenPept files
####################################################################################
## Parse every GenPept record of the blastp hits, then label the columns.
## The domain columns follow the order of PRDM9.domains (start/end per domain).
GP.table = readGenPept(
  paste0(directory, "RefSeq-genes/Homo-sapien-prdm9-query-blastp-e-val-cutoff-1e-5.gp.txt")
)
names(GP.table) = c(
  "LOCUS", "Gene", "aa_length", "definition", "VERSION", "DBSOURCE", "ORGANISM",
  "CDS.start", "CDS.end",
  "KRAB.start", "KRAB.end",
  "KRABa.start", "KRABa.end",
  "SSXRD.start", "SSXRD.end",
  "SET.start", "SET.end",
  "SEQ"
)

## combine coordinates from KRAB and KRAB_A-box domains into one region
## The coordinate columns hold text, so they are compared through numeric copies
## (as text, "8" < "23" is FALSE). Non-digit characters are stripped first so a
## value carrying a marker such as "<" still compares by its position; values
## with no digits become NA and are left untouched because which() skips NA.
coord.value = function(x) as.numeric(gsub("[^0-9]", "", x))
KRAB.start.num = coord.value(GP.table$KRAB.start)
KRAB.end.num = coord.value(GP.table$KRAB.end)
KRABa.start.num = coord.value(GP.table$KRABa.start)
KRABa.end.num = coord.value(GP.table$KRABa.end)

## Take the A-box start when the A-box exists (start != 0) and either no KRAB
## region was found (start == 0) or the A-box begins earlier.
extend.start = which(KRABa.start.num != 0 &
                       (KRAB.start.num == 0 | KRABa.start.num < KRAB.start.num))
## Take the A-box end when it lies further along the protein (an absent A-box
## has end 0 and therefore never wins).
extend.end = which(KRABa.end.num > KRAB.end.num)

GP.table$KRAB.start[extend.start] = GP.table$KRABa.start[extend.start]
GP.table$KRAB.end[extend.end] = GP.table$KRABa.end[extend.end]
rm(coord.value, KRAB.start.num, KRAB.end.num, KRABa.start.num, KRABa.end.num,
   extend.start, extend.end)

## Assign structure
## Build a tag string per row by appending, in this order, one tag for every
## domain whose end coordinate is not 0.
GP.table$structure = ""
domain.tags = c(KRAB.end = "KRAB-", SSXRD.end = "SSXRD-", SET.end = "SET")
for (end.col in names(domain.tags)) {
  present = which(GP.table[[end.col]] != 0)
  GP.table$structure[present] = paste0(GP.table$structure[present], domain.tags[[end.col]])
}
rm(domain.tags, end.col, present)

## PRDM9 presumably needs the SET domain to function in recombination, so remove genes without it.
GP.table = GP.table[grepl("SET", GP.table$structure), ]

## Keep the longest transcript for each locus
## A locus is identified by organism + gene name, with spaces replaced by dots.
GP.table$unique.ID = gsub(" ", ".", paste(GP.table$ORGANISM,GP.table$Gene,sep="."), fixed = TRUE)

## aa_length is stored as text; lengths must be compared as numbers
## (as text, "99" sorts above "1000" and the shorter protein would be kept).
aa.length = as.numeric(GP.table$aa_length)

genes.w.duplicates = unique(GP.table$unique.ID[duplicated(GP.table$unique.ID)])
genes.to.remove = NULL
for (dup.ID in genes.w.duplicates) {
  genes = which(GP.table$unique.ID == dup.ID)
  ## chose biggest protein seq (first one on ties)
  chosen1 = genes[which.max(aa.length[genes])]
  ## if no length could be read as a number, fall back to the first row
  if (length(chosen1) == 0) {
    chosen1 = genes[1]
  }
  genes.to.remove = c(genes.to.remove, genes[genes != chosen1])
}
# removing the rows
GP.table = GP.table[!(seq_len(nrow(GP.table)) %in% genes.to.remove),]
rm(aa.length)
####################################################################################

####################################################################################
# Step 3: Retrieve GenBank files
####################################################################################
## One DBSOURCE accession per line: no header, no row names, no quoting.
write.table(
  x = GP.table$DBSOURCE,
  file = paste0(directory, "RefSeq-genes/accessions.temp1.txt"),
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)
### Submit the accessions through Batch Entrez and download the GenBank files of the hits
### Manually combine the GenBank files into 1 txt file, save as directory/RefSeq-genes/Homo-sapien-prdm9-query-blastp-e-val-cutoff-1e-5.gb.txt
####################################################################################

####################################################################################
# Step 4: Retrieve information from GenBank files
####################################################################################
## Parse the combined GenBank records (type = "bank") and label the columns.
GB.table = setNames(
  readGenPept(
    paste0(directory, "RefSeq-genes/Homo-sapien-prdm9-query-blastp-e-val-cutoff-1e-5.gb.txt"),
    type = "bank"
  ),
  c("LOCUS", "Gene", "dna_length", "definition", "VERSION", "ORGANISM",
    "CDS.start", "CDS.end", "SEQ")
)
####################################################################################

####################################################################################
# Step 5: Combine GenPept and GenBank tables
####################################################################################
### put GB.table in the same order as GP.table
### Rows are paired by key: the DBSOURCE accession of a GenPept row is looked up
### in the VERSION column of GB.table. Pairing by key instead of by sorted
### position means a missing or extra GenBank record cannot shift other rows.
GP.table = GP.table[order(GP.table$DBSOURCE),]
GB.rows = match(GP.table$DBSOURCE, GB.table$VERSION)
if (any(is.na(GB.rows))) {
  warning(sum(is.na(GB.rows)),
          " GenPept record(s) have no GenBank record with a matching VERSION; their dna columns are NA")
}
### add info to GP.table
GB.table = GB.table[GB.rows,c("LOCUS","dna_length","definition","CDS.start","CDS.end","SEQ")]
colnames(GB.table) = paste("dna",colnames(GB.table),sep=".")
colnames(GB.table)[2] = "dna_length"
GP.table = cbind(GP.table, GB.table)
### remove GB.table, save GP.table
rm(GB.table, GB.rows)
### sep="" is required: paste() joins with a space by default, which would put
### a space between the directory and the file name
write.table(GP.table, paste(directory,"GP.table-all.hits.txt",sep=""))
####################################################################################