######################################
# To run in Python, save this text file with the ".py" file extension so that
# it can be run with the Python interpreter (Python 2 or Python 3).
# Make sure that Dataset_S4 is saved in the same folder as your .py file.
######################################

# USER INPUT PARAMETERS

# Enter the filename you have chosen for the Dataset_S4 spreadsheet.
inFile = 'Dataset_S4.xls'

# Enter desired protein parameters. Leave 'none' for each if no preference.
# Surround each entry with single quotes ('entry'), except for
# functional_cluster_set, which requires double quotes ("[]").
# All text parameters are matched as substrings of the spreadsheet cell.

# Enter an arbitrary single-letter amino acid sequence, e.g. 'MAGTD'
primary_sequence_set = 'none'
# Choose between 'Land Plants', 'Protist', 'Bacteria', 'Archaea', 'Fungi'
kingdom_set = 'none'
# Enter a phylum of interest relevant to the selected kingdom
phylum_set = 'none'
# Choose a primary_effector "short_name" listed in Supplementary_Table_2
primary_effector_set = 'none'
# Choose a primary_effector ontology listed in Figure_3
primary_effector_ontology = 'none'
# Enter in format ['Domain1', 'Domain2', 'Domain3'] with Domains in
# alphabetical order (with domain "short_names" from Supplementary Table 2)
functional_cluster_set = "none"
# Enter in format GXNCRFLQ
photoadduct_motif_set = 'none'
# Choose a desired range for the linker lengths in the format
# minimum_length,maximum_length, e.g. '0,100'. Both bounds are exclusive:
# a row matches only if minimum_length < linker length < maximum_length.
linker_length_range = '0,10000'

# The order of this list must match the order of the list returned by
# Row_process(): the two lists are compared position by position.
parameter_set = [
    kingdom_set,
    phylum_set,
    primary_effector_set,
    primary_effector_ontology,
    functional_cluster_set,
    primary_sequence_set,
    photoadduct_motif_set,
    linker_length_range,
]

# Enter desired output - choose between print to screen (yes or no) and
# print to Excel.
print_to_screen = 'no'  # Do you want results to print to the screen?
print_to_excel = 'yes'  # Do you want results to print to Excel?
excel_file_name = 'Outfile.xls'  # Enter name of desired output file

##################################################
# PROGRAM EXECUTION (Please do not edit below)

import xlrd
import xlwt

# Index of the first spreadsheet row that is treated as data; rows above it
# are skipped (presumably title/header rows of Dataset_S4).
FIRST_DATA_ROW = 4

# Header labels of the output sheet. The position of each label is also the
# column index that is copied from the input sheet by excel_print().
OUTPUT_HEADERS = (
    'Database Source',
    'Sequence ID',
    'GenBank ID',
    'Primary Structure',
    'GXNCRFLQ Motif',
    'Protein Length',
    'Domain Structure',
    'Functional Cluster',
    'Primary Effector',
    'Primary Effector Gene Ontology',
    'Linker Length',
    'Number of Predicted Transmembrane Helices',
    'Transmembrane Helix Topology ',
    'Kingdom',
    'Phylum',
    'Class',
    'Family',
    'Order',
    'Genus',
    'Species',
)

# Input columns read by Row_process(), listed in the same order as
# parameter_set: kingdom, phylum, primary effector, primary effector
# ontology, functional cluster, sequence, photoadduct motif, linker length.
FILTER_COLUMNS = (13, 14, 8, 9, 7, 3, 4, 10)

# Positions inside parameter_set / the Row_process() result.
LINKER_INDEX = 7        # the linker length range is compared numerically
TEXT_FILTER_COUNT = 7   # positions 0..6 are compared as substrings


def Row_process(i):
    """Return the filterable fields of input row ``i`` as a list of strings.

    The values are read from the module-level ``Data_table`` sheet and are
    ordered like ``parameter_set``: kingdom, phylum, primary effector,
    primary effector ontology, functional cluster, sequence, photoadduct
    motif and linker length.
    """
    return [str(Data_table.cell(i, col).value) for col in FILTER_COLUMNS]


def outfile_generate():
    """Create the output workbook with its header row.

    Returns:
        A ``(worksheet, workbook)`` tuple. The worksheet is named
        'Datatable' and has the OUTPUT_HEADERS labels written to row 0.
    """
    wbnew = xlwt.Workbook()
    ws = wbnew.add_sheet('Datatable', cell_overwrite_ok=True)
    for col, label in enumerate(OUTPUT_HEADERS):
        ws.write(0, col, label)
    return ws, wbnew


def excel_print(i, excel_file_name, ws, wbnew):
    """Copy input row ``i`` into the output sheet.

    Every column listed in OUTPUT_HEADERS is copied, as text, from the
    module-level ``Data_table`` into row ``c`` of ``ws``, where ``c`` is the
    module-level output row counter maintained by the main loop.

    ``excel_file_name`` and ``wbnew`` are kept so that existing calls keep
    working, but they are no longer used here: the workbook is saved once by
    the main loop instead of being rewritten to disk for every matching row.
    """
    for col in range(len(OUTPUT_HEADERS)):
        ws.write(c, col, str(Data_table.cell(i, col).value))


def _linker_length_in_range(length_text, range_text):
    """Return True if ``length_text`` lies strictly inside ``range_text``.

    ``range_text`` has the form 'minimum,maximum'. A cell that does not hold
    a number (for example an empty cell) cannot satisfy a range and is
    reported as not matching instead of aborting the whole run.
    """
    bounds = range_text.split(',')
    lower = float(bounds[0])
    upper = float(bounds[1])
    try:
        length = float(length_text)
    except ValueError:
        return False
    return lower < length < upper


def _row_selected(wanted_values, row_values):
    """Return True if a row satisfies every requested parameter.

    ``wanted_values`` is ``parameter_set`` and ``row_values`` is the matching
    Row_process() result. Entries equal to 'none' are not checked.
    """
    # Text parameters: the requested text must occur inside the cell text.
    # All seven text positions are checked, including the photoadduct motif.
    for position in range(TEXT_FILTER_COUNT):
        wanted = wanted_values[position]
        if wanted != 'none' and wanted not in row_values[position]:
            return False
    # The linker length is the only parameter compared numerically.
    wanted_range = wanted_values[LINKER_INDEX]
    if wanted_range != 'none':
        return _linker_length_in_range(row_values[LINKER_INDEX], wanted_range)
    return True


c = 1  # next output row to write; row 0 holds the header labels
wb = xlrd.open_workbook(inFile)
Data_table = wb.sheet_by_index(0)

if print_to_excel == 'yes':
    ws, wbnew = outfile_generate()
    # Save the header-only file up front so that an unwritable output path
    # is reported before the whole dataset has been scanned.
    wbnew.save(excel_file_name)

try:
    for i in range(FIRST_DATA_ROW, Data_table.nrows):
        parameter_value = Row_process(i)
        if not _row_selected(parameter_set, parameter_value):
            continue
        if print_to_screen == 'yes':
            ID = str(Data_table.cell(i, 1).value)
            Domain_structure = str(Data_table.cell(i, 6).value)
            # Row_process() order: 0 = kingdom, 1 = phylum, 5 = sequence.
            print(
                'ID: ' + ID
                + ' * Domain-structure: ' + Domain_structure
                + ' * Kingdom: ' + parameter_value[0]
                + ' * Phylum: ' + parameter_value[1]
                + ' * Sequence: ' + parameter_value[5]
            )
        if print_to_excel == 'yes':
            excel_print(i, excel_file_name, ws, wbnew)
            c += 1
finally:
    # Write the collected rows once; this also keeps the rows gathered so
    # far if an error interrupts the scan.
    if print_to_excel == 'yes':
        wbnew.save(excel_file_name)

print('done')