####### Supplemental script 1: Extracting data from Genpept/Genbank files
####### Repeated losses of PRDM9-directed recombination despite the conservation of PRDM9 across vertebrates
####### Authors: Zachary Baker, Molly Schumer, Yuki Haba, Lisa Bashkirova, Chris Holland, Gil G. Rosenthal, and M. Przeworski
####### To whom correspondence should be addressed: ztb2002@columbia.edu and mp3284@columbia.edu

library(msa)
library(seqinr)

directory <- "/Users/NAME/"
PRDM9.domains <- c("KRAB", "KRAB_A-box", "SSXRD", "SET")

####################################################################################
##### Functions
####################################################################################

## splitFields
# splits one flat-file line on single spaces and drops the empty tokens
splitFields <- function(line) {
  tokens <- strsplit(line, " ")[[1]]
  tokens[which(tokens != "")]
}

## rangeEnds
# splits a feature location such as "12..345" on ".." and always returns
# exactly two values (first and last token), so that every converted record
# has the same number of fields even for locations like "join(1..4,6..10)"
rangeEnds <- function(location) {
  parts <- strsplit(location, "..", fixed = TRUE)[[1]]
  c(parts[1], parts[length(parts)])
}

## qualifierValue
# returns the text between the quotes of a qualifier line, e.g.
# qualifierValue('   /gene="PRDM9"', "gene") gives "PRDM9"
qualifierValue <- function(line, key) {
  value <- sub(paste0("^.*/", key, "=\""), "", line)
  sub("\".*$", "", value)
}

## convertGenPept
# takes in a character vector holding the lines of a single GenPept (or
#   GenBank) record, as produced by readLines
# rawfile: lines of one record, including the terminating "//" line
# domains: region names whose coordinates are extracted (GenPept only)
# mode:    "pept" for GenPept records, anything else is treated as GenBank
# returns a 1-row character matrix (row name "result") with named columns:
#   LOCUS accession, gene/locus_tag, length, DEFINITION, VERSION, ORGANISM,
#   CDS coordinates and sequence; for GenPept also DBSOURCE accession and
#   start/end of every requested domain ("0" when the domain is absent)
# a field that cannot be located unambiguously is reported as "error"
convertGenPept <- function(rawfile, domains = PRDM9.domains, mode = "pept") {
  ## pull out LOCUS (2nd field = accession, 3rd field = length)
  LOCUS.pos <- grep("LOCUS ", rawfile)
  if (length(LOCUS.pos) == 1) {
    locus.fields <- splitFields(rawfile[LOCUS.pos])
    NP_number <- locus.fields[2]
    aa_length <- locus.fields[3]
  } else {
    NP_number <- "error"
    aa_length <- 0
  }

  ## pull out DEFINITION (may wrap over several lines, ends before ACCESSION)
  DEFINITION.pos <- grep("DEFINITION ", rawfile)
  ACCESSION.pos <- grep("ACCESSION ", rawfile)
  if (length(DEFINITION.pos) == 1 && length(ACCESSION.pos) == 1) {
    def.lines <- rawfile[DEFINITION.pos:(ACCESSION.pos - 1)]
    def.fields <- splitFields(paste(def.lines, collapse = ""))
    # drop the leading "DEFINITION" keyword
    definition <- paste(def.fields[-1], collapse = " ")
  } else {
    definition <- "error"
  }

  ## pull out VERSION
  VERSION.pos <- grep("VERSION ", rawfile)
  if (length(VERSION.pos) == 1) {
    version <- splitFields(rawfile[VERSION.pos])[2]
  } else {
    version <- "error"
  }

  ## pull out ORGANISM
  ORGANISM.pos <- grep(" ORGANISM ", rawfile)
  if (length(ORGANISM.pos) == 1) {
    organism <- paste(splitFields(rawfile[ORGANISM.pos])[-1], collapse = " ")
  } else {
    organism <- "error"
  }

  ## pull out CDS
  CDS.pos <- grep(" CDS ", rawfile)
  if (length(CDS.pos) == 1) {
    CDS.coords <- rangeEnds(splitFields(rawfile[CDS.pos])[2])
  } else {
    CDS.coords <- c("error", "error")
  }

  ## pull out sequence (lines between ORIGIN and the "//" terminator)
  ORIGIN.pos <- grep("ORIGIN ", rawfile) + 1
  END.pos <- which(rawfile == "//") - 1
  if ((length(ORIGIN.pos) == 1) && (length(END.pos) == 1)) {
    seq.text <- paste(rawfile[ORIGIN.pos:END.pos], collapse = "")
    # strip the position counters and the spacing between sequence blocks
    protein.seq <- toupper(gsub("[ 0-9]", "", seq.text))
  } else {
    protein.seq <- "error"
  }

  ## pull out Gene-name, falling back to the locus_tag
  Gene.pos <- grep(" /gene=\"", rawfile)
  if (length(Gene.pos) > 0) {
    gene.name <- qualifierValue(rawfile[Gene.pos[1]], "gene")
  } else {
    Gene.pos <- grep(" /locus_tag=\"", rawfile)
    if (length(Gene.pos) > 0) {
      gene.name <- qualifierValue(rawfile[Gene.pos[1]], "locus_tag")
    } else {
      gene.name <- "error"
    }
  }

  ## GenPept only
  if (mode == "pept") {
    ## pull out DBSOURCE (refseq NM), the 4th field of the DBSOURCE line
    DBSOURCE.pos <- grep("DBSOURCE ", rawfile)
    if (length(DBSOURCE.pos) == 1) {
      NM_number <- splitFields(rawfile[DBSOURCE.pos])[4]
    } else {
      NM_number <- "error"
    }

    ## pull out region_name coordinates
    DOMAIN.coords <- NULL
    for (domain in domains) {
      hits <- grep(paste0("/region_name=\"", domain, "\""), rawfile,
                   fixed = TRUE)
      if (length(hits) >= 1 && hits[1] > 1) {
        # the location is on the line just above the first matching qualifier
        temp.coords <- rangeEnds(splitFields(rawfile[hits[1] - 1])[2])
      } else {
        temp.coords <- c(0, 0)
      }
      names(temp.coords) <- paste(domain, c("start", "end"), sep = "-")
      DOMAIN.coords <- c(DOMAIN.coords, temp.coords)
    }

    ## result
    result <- c(NP_number, gene.name, aa_length, definition, version,
                NM_number, organism, CDS.coords, DOMAIN.coords, protein.seq)
    names(result) <- c("NP_number", "Gene", "aa_length", "definition",
                       "version", "NM_number", "organism", "cds.start",
                       "cds.end", names(DOMAIN.coords), "aa_seq")
    result <- t(as.data.frame(result))
  } else {
    ## GenBank only
    result <- c(NP_number, gene.name, aa_length, definition, version,
                organism, CDS.coords, protein.seq)
    names(result) <- c("NM_number", "Gene", "dna_length", "definition",
                       "version", "organism", "cds.start", "cds.end",
                       "dnaa_seq")
    result <- t(as.data.frame(result))
  }
  result
}

## readGenPept
# takes in file name corresponding to file with multiple GenPept entries
#   separated by blank lines
# fl.name: path of the multi-record flat file
# type:    passed to convertGenPept as mode ("pept" or "bank")
# converts each record and returns a data frame with one row per record and
#   unnamed (V1, V2, ...) character columns in the order of convertGenPept
readGenPept <- function(fl.name, type = "pept") {
  print("Reading GenPept File")

  ## Reading in genpept file
  all.gp <- readLines(fl.name)

  ## Separate into list of unique genpept files
  # every run of non-blank lines is one record; this also keeps a final
  # record that is not followed by a blank line and ignores repeated blanks
  is.blank <- all.gp == ""
  record.id <- cumsum(is.blank)
  gp.files <- unname(split(all.gp[!is.blank], record.id[!is.blank]))

  rows <- lapply(seq_along(gp.files), function(x) {
    print(x)
    convertGenPept(gp.files[[x]], mode = type)
  })

  # rbind fails loudly if records disagree on the number of fields
  result <- unname(do.call(rbind, rows))
  result <- as.data.frame(result, stringsAsFactors = FALSE)
  result
}

####################################################################################
####################################################################################
# Step 1: Blastp Hsap Nterm PRDM9 sequence against vertebrate RefSeq proteins
####################################################################################
### Sequence used is saved in the
### directory/query.fa
### Threshold of E <= 1e-5
### Download GenPept file, save as directory/RefSeq-genes/Homo-sapien-prdm9-query-blastp-e-val-cutoff-1e-5.gp.txt
####################################################################################

####################################################################################
# Step 2: Retrieve information from GenPept files
####################################################################################
GP.table <- readGenPept(paste0(directory, "RefSeq-genes/Homo-sapien-prdm9-query-blastp-e-val-cutoff-1e-5.gp.txt"))
colnames(GP.table) <- c("LOCUS", "Gene", "aa_length", "definition", "VERSION",
                        "DBSOURCE", "ORGANISM", "CDS.start", "CDS.end",
                        "KRAB.start", "KRAB.end", "KRABa.start", "KRABa.end",
                        "SSXRD.start", "SSXRD.end", "SET.start", "SET.end",
                        "SEQ")

## combine coordinates from KRAB and KRAB_A-box domains into one region
# The table columns are character strings, so they are compared through
# numeric copies ("100" < "25" is TRUE for strings). Any non-digit characters
# in a coordinate are stripped before conversion.
coordValue <- function(x) as.numeric(gsub("[^0-9]", "", x))
krab.start <- coordValue(GP.table$KRAB.start)
krab.end <- coordValue(GP.table$KRAB.end)
kraba.start <- coordValue(GP.table$KRABa.start)
kraba.end <- coordValue(GP.table$KRABa.end)

# 0 marks an absent domain: take the KRAB_A-box start when it lies further
# upstream, or when no KRAB domain was annotated at all
use.kraba.start <- which(kraba.start != 0 &
                           (krab.start == 0 | kraba.start < krab.start))
GP.table$KRAB.start[use.kraba.start] <- GP.table$KRABa.start[use.kraba.start]

use.kraba.end <- which(kraba.end > krab.end)
GP.table$KRAB.end[use.kraba.end] <- GP.table$KRABa.end[use.kraba.end]

## Assign structure
# concatenates a tag for every domain whose end coordinate is not 0
hasDomain <- function(end.coord) !is.na(end.coord) & end.coord != 0
GP.table$structure <- paste0(
  ifelse(hasDomain(GP.table$KRAB.end), "KRAB-", ""),
  ifelse(hasDomain(GP.table$SSXRD.end), "SSXRD-", ""),
  ifelse(hasDomain(GP.table$SET.end), "SET", "")
)

## PRDM9 presumably needs the SET domain to function in recombination, so remove genes without it.
GP.table <- GP.table[grep("SET", GP.table$structure), ]

## Keep the longest transcript for each locus
# a locus is identified by organism + gene name, with spaces turned into dots
GP.table$unique.ID <- paste(GP.table$ORGANISM, GP.table$Gene, sep = ".")
GP.table$unique.ID <- gsub(" ", ".", GP.table$unique.ID, fixed = TRUE)

genes.w.duplicates <- unique(GP.table$unique.ID[duplicated(GP.table$unique.ID)])
# aa_length is stored as text; compare lengths numerically, otherwise
# "99" would rank above "1000"
aa.len <- as.numeric(GP.table$aa_length)
keep.row <- rep(TRUE, nrow(GP.table))
for (dup.id in genes.w.duplicates) {
  genes <- which(GP.table$unique.ID == dup.id)
  ## choose biggest protein seq (first one on ties)
  chosen <- genes[which.max(aa.len[genes])]
  if (length(chosen) == 0) {
    # no usable length for any isoform: keep the first record
    chosen <- genes[1]
  }
  keep.row[setdiff(genes, chosen)] <- FALSE
}

# removing the rows
GP.table <- GP.table[keep.row, ]
####################################################################################

####################################################################################
# Step 3: Retrieve GenBank files
####################################################################################
write.table(GP.table$DBSOURCE,
            paste0(directory, "RefSeq-genes/accessions.temp1.txt"),
            quote = FALSE, col.names = FALSE, row.names = FALSE)
### Submit accessions through Batch Entrez and download GenBank file of hits
### Manually combine GenBank files into 1 txt file, save under the name read in
### Step 4: directory/RefSeq-genes/Homo-sapien-prdm9-query-blastp-e-val-cutoff-1e-5.gb.txt
####################################################################################

####################################################################################
# Step 4: Retrieve information from GenBank files
####################################################################################
GB.table <- readGenPept(paste0(directory, "RefSeq-genes/Homo-sapien-prdm9-query-blastp-e-val-cutoff-1e-5.gb.txt"),
                        type = "bank")
colnames(GB.table) <- c("LOCUS", "Gene", "dna_length", "definition", "VERSION",
                        "ORGANISM", "CDS.start", "CDS.end", "SEQ")
####################################################################################

####################################################################################
# Step 5: Combine GenPept and GenBank tables
####################################################################################
### put GB.table in same order as GP.table
# Each protein row is paired with the GenBank record whose VERSION equals its
# DBSOURCE accession. Matching explicitly (instead of sorting both tables and
# trusting them to line up) keeps rows aligned when an accession is missing
# from, or repeated in, the downloaded GenBank file.
GP.table <- GP.table[order(GP.table$DBSOURCE), ]
gb.row <- match(GP.table$DBSOURCE, GB.table$VERSION)
if (any(is.na(gb.row))) {
  warning("No GenBank record found for: ",
          paste(GP.table$DBSOURCE[is.na(gb.row)], collapse = ", "),
          call. = FALSE)
}
GB.table <- GB.table[gb.row, ]

### add info to GP.table
GB.table <- GB.table[, c("LOCUS", "dna_length", "definition", "CDS.start",
                         "CDS.end", "SEQ")]
# prefix the GenBank columns with "dna." so they do not clash with GP.table
colnames(GB.table) <- paste("dna", colnames(GB.table), sep = ".")
colnames(GB.table)[2] <- "dna_length"
GP.table <- cbind(GP.table, GB.table)

### remove GB.table, save GP.table
rm(GB.table)
# paste0: the default separator of paste() would put a space into the path
write.table(GP.table, paste0(directory, "GP.table-all.hits.txt"))
####################################################################################