##############
import numpy as np
import pandas as pd


def coverage_ranking(min_score=99, L_longmer=25, pop='AVG', n_to_choose=12,
                     min_peps=3):
    """
    Return two dataframes of 25mers (for L_longmer=25):
        df_selected - dataframe with n_to_choose rows, one per 25mer,
            ranked by cumulative (pop) population coverage
        df_abs - dataframe with one row per 25mer for all 25mers containing
            at least min_peps predicted binders, sorted by absolute (pop)
            population coverage
    Binders are defined as any 12-20mers within the SARS-CoV-2 proteins that
    are predicted to bind to an allele with model_score >= min_score.

    The overall goal is to return the list of 25mers within the nCoV proteins
    which covers the largest fraction of the population, based on the
    high-scoring peptides they contain and the alleles those peptides are
    predicted to bind to.
    """
    # dummy function that returns all 12-20mers within SARS-CoV-2 proteins,
    # and their predicted binding scores to covered alleles (model_score)
    df_binders = get_all_covid_binders()
    df_binders = df_binders.loc[df_binders['model_score'] >= min_score]

    # this is a dummy function; it would return a dataframe with one row per
    # 25mer within all the SARS-CoV-2 proteins
    df_25mers = get_all_covid_25mers()

    # dataframe of HLA-II population frequencies
    df_freq = hla_freq()

    # keep track of the list of covered alleles and exclude them when
    # calculating coverage in subsequent 25mer selections
    excluded_hla = []
    n_selected = 0
    df_selected = pd.DataFrame()

    while n_selected < n_to_choose:
        print('\n\nmaking selection %d of %d...'
              % ((n_selected + 1), n_to_choose))
        df_keep = df_binders.loc[~df_binders['allele'].isin(excluded_hla)]
        df_list = df_keep[['peptide', 'allele']].groupby('peptide').aggregate(
            lambda tdf: tdf.unique().tolist())
        df_list = df_list.reset_index()
        pep_to_allele = pd.Series(df_list.allele.values,
                                  index=df_list.peptide).to_dict()

        # associate each 25mer with peptides predicted to bind
        binders = list(df_keep['peptide'].unique())
        df_25mers['pep_list'] = \
            df_25mers['peptide'].apply(lambda x: [y for y in binders if y in x])

        # associate each 25mer with the list of alleles the peptides bind to
        df_25mers['allele_list'] = \
            df_25mers['pep_list'].apply(lambda x: [pep_to_allele[y] for y in x])
        df_25mers['alleles'] = \
            df_25mers['allele_list'].apply(
                lambda x: [item for sublist in x for item in sublist])
        df_25mers['alleles'] = \
            df_25mers['alleles'].apply(lambda x: list(set(x)))

        # calculate coverage of that allele list for various populations
        df_25mers['coverage'] = \
            df_25mers['alleles'].apply(
                lambda x: coverage(alleles=x, df_freq=df_freq, pop=pop))
        df_25mers['USA_coverage'] = \
            df_25mers['alleles'].apply(
                lambda x: coverage(alleles=x, df_freq=df_freq, pop='USA'))
        df_25mers['EUR_coverage'] = \
            df_25mers['alleles'].apply(
                lambda x: coverage(alleles=x, df_freq=df_freq, pop='EUR'))
        df_25mers['API_coverage'] = \
            df_25mers['alleles'].apply(
                lambda x: coverage(alleles=x, df_freq=df_freq, pop='API'))

        print('\tzeroing out coverage for pep lists shorter than %d...'
              % min_peps)
        df_25mers['npeps'] = df_25mers['pep_list'].apply(lambda x: len(x))
        df_25mers.loc[df_25mers['npeps'] < min_peps, 'coverage'] = 0
        df_25mers = df_25mers.sort_values('coverage', ascending=False)

        # save absolute ranking (first pass only, before any alleles are
        # excluded)
        del df_25mers['allele_list']
        if n_selected == 0:
            df_abs = df_25mers.copy()
            df_abs['pep_list'] = df_abs['pep_list'].apply(lambda x: ';'.join(x))
            df_abs['alleles'] = df_abs['alleles'].apply(lambda x: ';'.join(x))
            if min_peps is not None:
                df_abs = df_abs.loc[df_abs['npeps'] >= min_peps]
            df_abs = df_abs.loc[~df_abs['pep_list'].isnull()]
            df_abs.to_csv('covid_%dmers_ranked_by_coverage_%s.csv'
                          % (L_longmer, pop), sep=',', index=False)

        df_25mers['all_alleles'] = \
            df_25mers['alleles'].apply(lambda x: ';'.join(x))

        # make next 25mer selection based on disjoint coverage
        # (.copy() so the assignments below do not write back into df_25mers)
        next_pick = df_25mers.iloc[0:1].copy()
        new_alleles = next_pick.iloc[0]['alleles']
        next_pick['selection'] = n_selected + 1

        # calculate cumulative coverage of all 25mers selected up to this point
        next_pick['covered_alleles'] = \
            next_pick['alleles'].apply(lambda x: x + excluded_hla)
        next_pick['cumulative_coverage'] = \
            next_pick['covered_alleles'].apply(
                lambda x: coverage(alleles=x, df_freq=df_freq, pop=pop))
        next_pick['USA_cumulative_coverage'] = \
            next_pick['covered_alleles'].apply(
                lambda x: coverage(alleles=x, df_freq=df_freq, pop='USA'))
        next_pick['EUR_cumulative_coverage'] = \
            next_pick['covered_alleles'].apply(
                lambda x: coverage(alleles=x, df_freq=df_freq, pop='EUR'))
        next_pick['API_cumulative_coverage'] = \
            next_pick['covered_alleles'].apply(
                lambda x: coverage(alleles=x, df_freq=df_freq, pop='API'))

        # DataFrame.append was removed in pandas 2.x; concat is equivalent here
        df_selected = pd.concat([df_selected, next_pick])
        n_selected += 1
        excluded_hla += new_alleles

    df_selected['pep_list'] = \
        df_selected['pep_list'].apply(lambda x: ';'.join(x))
    df_selected['alleles'] = \
        df_selected['alleles'].apply(lambda x: ';'.join(x))
    df_selected.rename(columns={'covered_alleles': 'cumulative_alleles',
                                'alleles': 'additional_alleles',
                                'all_alleles': 'alleles'}, inplace=True)
    df_selected['cumulative_alleles'] = \
        df_selected['cumulative_alleles'].apply(lambda x: ';'.join(x))
    del df_selected['coverage']
    del df_selected['cumulative_coverage']
    del df_selected['npeps']
    return df_selected, df_abs
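
# --- Illustrative sketch only (not part of the original pipeline) ---
# coverage_ranking() calls two data loaders that this supplement treats as
# dummy functions and does not define. The hypothetical stubs below only show
# the column layout assumed by the code above ('peptide', 'allele',
# 'model_score'); the sequences, alleles and scores are invented placeholders,
# not real predictions.
def get_all_covid_binders():
    """Hypothetical stub: one row per predicted (12-20mer peptide, allele) pair."""
    return pd.DataFrame({
        'peptide': ['CDEFGHIKLMNPQRS', 'FGHIKLMNPQRSTVWY'],
        'allele': ['DRB1_0101', 'DQB1_0602'],
        'model_score': [99.5, 99.2],
    })


def get_all_covid_25mers():
    """Hypothetical stub: one row per 25mer tiled across the SARS-CoV-2 proteins."""
    return pd.DataFrame({
        'peptide': ['ACDEFGHIKLMNPQRSTVWYACDEF'],
    })
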
def hla_freq(pop_cols=None):
    """
    Return HLA-II population frequencies for European (EUR), Asian Pacific
    Islander (API), and United States (USA) populations, as well as an
    average across the populations.
    """
    if pop_cols is None:
        pop_cols = ['USA', 'EUR', 'API']
    # placeholder name for the supplement HLA-II frequency file
    hla_freq_file = 'supplement_hla_ii_frequencies.tsv'
    df_hla = pd.read_csv(hla_freq_file, sep='\t')
    # DRB1 and DRB3/4/5 subset (not used further in this function)
    df_dr = df_hla.loc[df_hla['locus'].isin(['DRB1', 'DRB345'])]
    # impute missing allele frequencies with zeros
    df_hla.fillna(0, inplace=True)
    df_hla['AVG'] = df_hla[pop_cols].mean(axis=1)
    return df_hla


def coverage(alleles=None, df_freq=None, pop='AVG'):
    """
    Given a list of HLA-II alleles, calculate the population coverage of
    those alleles, where coverage is defined as the probability that a
    person within that population has at least one of the alleles listed.
    The function takes the population as an input as well as (optionally)
    the population frequency table. See the methods of the paper for the
    expression we use to calculate coverage.
    """
    # if no alleles provided, pop_coverage = 0
    if not alleles:
        return 0
    # account for the possibility of duplicates in the list
    alleles = list(set(alleles))
    # load frequency table if not provided
    if df_freq is None:
        df_freq = hla_freq()
    # dictionary {allele: frequency}
    freq_dict = pd.Series(df_freq[pop].values,
                          index=df_freq['allele']).to_dict()
    # we are going to consider 4 loci: DRB1, DRB3/4/5, DP, DQ
    dr1_alleles = [x for x in alleles if 'DRB1' in x]
    bonus_alleles = [x for x in alleles if 'DR' in x and x not in dr1_alleles]
    dp_alleles = [x for x in alleles if 'DP' in x]
    dq_alleles = [x for x in alleles if 'DQ' in x]
    # probability that neither chromosome carries a covered allele,
    # multiplied across the four loci
    hla_product = 1
    if len(dr1_alleles) > 0:
        probs = sum([freq_dict[x] for x in dr1_alleles])
        hla_product = hla_product * ((1 - probs) ** 2)
    if len(bonus_alleles) > 0:
        probs = sum([freq_dict[x] for x in bonus_alleles])
        hla_product = hla_product * ((1 - probs) ** 2)
    if len(dp_alleles) > 0:
        probs = sum([freq_dict[x] for x in dp_alleles])
        hla_product = hla_product * ((1 - probs) ** 2)
    if len(dq_alleles) > 0:
        probs = sum([freq_dict[x] for x in dq_alleles])
        hla_product = hla_product * ((1 - probs) ** 2)
    pop_coverage = 1 - hla_product
    return pop_coverage