{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Supplement 2: Main Effect Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Source code for prediction of COVID-19 test results. This is supplemental material to the publication:\n",
"\n",
"Wojtusiak J, Bagais W, Vang J, Guralnik E, Roess A, Alemi F, \"The Role of Symptom Clusters in Triage of COVID-19 Patients,\" Quality Management in Health Care, 2022.\n",
"\n",
"Source code by Wejdan Bagais and Jee Vang, with contributions from the other authors."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from models import select_attributes\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import timeit\n",
"\n",
"import pickle\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.metrics import roc_auc_score\n",
"import numpy as np\n",
"from joblib import Parallel, delayed\n",
"\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run the LASSO model on the 30 data splits"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"start = timeit.default_timer()\n",
"\n",
"# candidate values of C, the inverse of the regularization strength\n",
"c_list = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.5,2]\n",
"\n",
"# one entry is appended to each of these lists per (split, C) run\n",
"split_ids = []\n",
"cs = []\n",
"source = []\n",
"AUCs = []\n",
"prec = []\n",
"rec = []\n",
"vars_cnt = []\n",
"vars_lists = []\n",
"ys_test = []\n",
"ys_pred = []\n",
"\n",
"# loop over the 30 split data\n",
"for i in range(0, 30):\n",
"    # read the train and test files of this split\n",
"    tr_path = \"../data/30_splits_data/binary-transformed_tr_\"+str(i)+\".csv\"\n",
"    ts_path = \"../data/30_splits_data/binary-transformed_ts_\"+str(i)+\".csv\"\n",
"\n",
"    train = pd.read_csv(tr_path)\n",
"    test = pd.read_csv(ts_path)\n",
"\n",
"    # separate the predictors from the TestPositive outcome\n",
"    XT = train.drop(columns=['TestPositive'])\n",
"    Xt = test.drop(columns=['TestPositive'])\n",
"    yT = train['TestPositive']\n",
"    yt = test['TestPositive']\n",
"\n",
"    # loop over inverse of regularization\n",
"    for c in c_list:\n",
"        # The returned labels get their own name so that `yt` is not rebound:\n",
"        # every value of C is then run on the labels read from the test file,\n",
"        # not on whatever the previous call returned.\n",
"        auc, recall, precision, valid_cols, yt_eval, y_pred = select_attributes(XT, yT, Xt, yt, c)\n",
"\n",
"        # save results to the lists\n",
"        split_ids.append(i)\n",
"        cs.append(c)\n",
"        source.append('no_cluster')\n",
"        AUCs.append(auc)\n",
"        prec.append(precision)\n",
"        rec.append(recall)\n",
"        vars_lists.append(valid_cols)\n",
"        vars_cnt.append(len(valid_cols))\n",
"        ys_test.append(yt_eval.values.tolist())\n",
"        ys_pred.append(y_pred)\n",
"\n",
"        print(f'ID {i}, C={c:.2f}, AUC={auc:.5f}, Precision={precision:.5f}, Recall={recall:.5f}, cls# {len(valid_cols)}')\n",
"\n",
"stop = timeit.default_timer()\n",
"print('Time: ', stop - start)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"# identify the list of unique selected predictors\n",
"# combined predictors ('A & B') are rewritten as 'A,B' so they can be split into parts\n",
"unq_var = [[sub.replace(' & ', ',') for sub in cols] for cols in vars_lists]\n",
"\n",
"sympt_lists = []\n",
"for cols in unq_var:\n",
"    # Drop empty names: joining an empty selection gives '' and ''.split(',') is [''],\n",
"    # which would otherwise be counted as one predictor.\n",
"    names = {name for name in \",\".join(cols).split(',') if name}\n",
"    # sorted so the saved lists are the same on every run (set order is not fixed)\n",
"    sympt_lists.append(sorted(names))"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"# turn every stored prediction vector into a plain Python list\n",
"ys_pred_l = [list(pred) for pred in ys_pred]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# gather the result lists into one table, one row per (split, C) run\n",
"result_columns = {\n",
"    'split_ids': split_ids,\n",
"    'cs': cs,\n",
"    'source': source,\n",
"    'AUCs': AUCs,\n",
"    'prec': prec,\n",
"    'rec': rec,\n",
"    'vars_cnt': vars_cnt,\n",
"    'vars_lists': vars_lists,\n",
"    'y_test': ys_test,\n",
"    'y_pred': ys_pred_l,\n",
"}\n",
"ff = pd.DataFrame(result_columns)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# attach the unique predictor list of each run and the number of entries in it\n",
"ff['sympt_lists'] = sympt_lists\n",
"ff['sympt_cnt'] = ff['sympt_lists'].apply(len)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"# save the results table to CSV without writing the row index\n",
"ff.to_csv('../data/results/main_effect_model.csv', index =False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Identifying the best value of C (the inverse of the regularization strength)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"# average the results over the splits for each value of C\n",
"# (aggfunc is given as the string 'mean'; passing np.mean raises a pandas FutureWarning)\n",
"table = pd.pivot_table(ff, values=['AUCs', 'vars_cnt', 'sympt_cnt'],\n",
"                       index=['cs'],\n",
"                       aggfunc='mean').round(decimals=4)\n",
"\n",
"# the counts are shown as whole numbers\n",
"table['sympt_cnt'] = table['sympt_cnt'].round().astype(int)\n",
"table['vars_cnt'] = table['vars_cnt'].round().astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" AUCs | \n",
" sympt_cnt | \n",
" vars_cnt | \n",
"
\n",
" \n",
" cs | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0.1 | \n",
" 0.7779 | \n",
" 6 | \n",
" 6 | \n",
"
\n",
" \n",
" 0.2 | \n",
" 0.7843 | \n",
" 9 | \n",
" 9 | \n",
"
\n",
" \n",
" 0.3 | \n",
" 0.7843 | \n",
" 11 | \n",
" 11 | \n",
"
\n",
" \n",
" 0.4 | \n",
" 0.7819 | \n",
" 13 | \n",
" 13 | \n",
"
\n",
" \n",
" 0.5 | \n",
" 0.7789 | \n",
" 14 | \n",
" 14 | \n",
"
\n",
" \n",
" 0.6 | \n",
" 0.7751 | \n",
" 16 | \n",
" 16 | \n",
"
\n",
" \n",
" 0.7 | \n",
" 0.7740 | \n",
" 18 | \n",
" 18 | \n",
"
\n",
" \n",
" 0.8 | \n",
" 0.7728 | \n",
" 19 | \n",
" 19 | \n",
"
\n",
" \n",
" 0.9 | \n",
" 0.7722 | \n",
" 20 | \n",
" 20 | \n",
"
\n",
" \n",
" 1.0 | \n",
" 0.7703 | \n",
" 21 | \n",
" 21 | \n",
"
\n",
" \n",
" 1.5 | \n",
" 0.7607 | \n",
" 24 | \n",
" 24 | \n",
"
\n",
" \n",
" 2.0 | \n",
" 0.7554 | \n",
" 26 | \n",
" 26 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" AUCs sympt_cnt vars_cnt\n",
"cs \n",
"0.1 0.7779 6 6\n",
"0.2 0.7843 9 9\n",
"0.3 0.7843 11 11\n",
"0.4 0.7819 13 13\n",
"0.5 0.7789 14 14\n",
"0.6 0.7751 16 16\n",
"0.7 0.7740 18 18\n",
"0.8 0.7728 19 19\n",
"0.9 0.7722 20 20\n",
"1.0 0.7703 21 21\n",
"1.5 0.7607 24 24\n",
"2.0 0.7554 26 26"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# mean AUC and rounded mean predictor counts for each value of C\n",
"table"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.2"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"column = table[\"AUCs\"]\n",
"# Best C value based on AUC (idxmax returns the first, i.e. smallest, C on ties).\n",
"# It is stored as `_C` because the cross-validation and final-model cells below read that name.\n",
"_C = float(column.idxmax())\n",
"_C"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Build the model with the selected C on all data to identify the list of predictors"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Read all data (original cleaned data)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"path = \"../data/preprocessed.csv\"\n",
"# load the data; underscores in the column names are replaced by spaces\n",
"df = pd.read_csv(path).rename(columns=lambda s: s.replace('_', ' '))"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"# predictors: every column except the outcome\n",
"X = df.drop(columns=['TestPositive'])\n",
"# outcome column\n",
"y = df['TestPositive']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### k-fold cross-validation, k=24"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"Xy_pickle = 'BinaryDataX_no_cluster.p'\n",
"# the context manager closes the file, so the data is fully written before it is read back\n",
"with open(Xy_pickle, 'wb') as fh:\n",
"    pickle.dump({'X': X, 'y': y}, fh)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"def do_validation(fold, tr, te, c= _C):\n",
"    \"\"\"Fit an L1-penalised logistic regression on one fold and score it.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    fold : int\n",
"        Fold number; used only in the progress messages.\n",
"    tr, te\n",
"        Positional row indices (applied with ``.iloc``) of the training and test rows.\n",
"    c : float\n",
"        Passed as ``C`` to ``LogisticRegression``. The default is the value of ``_C``\n",
"        when this cell is run, so ``_C`` must already be defined.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        ``(score, coefficients)``: the ROC AUC on the test rows and ``coef_[0]``\n",
"        of the fitted model.\n",
"    \"\"\"\n",
"    # X and y are read back from the pickle file instead of the notebook globals;\n",
"    # the context manager makes sure the file handle is closed\n",
"    with open(Xy_pickle, 'rb') as fh:\n",
"        data = pickle.load(fh)\n",
"    X, y = data['X'], data['y']\n",
"\n",
"    X_tr, X_te = X.iloc[tr], X.iloc[te]\n",
"    y_tr, y_te = y.iloc[tr].values.ravel(), y.iloc[te].values.ravel()\n",
"\n",
"    print(f'fold {fold:02}')\n",
"\n",
"    # fixed seed (same value as the final model) so repeated runs give the same fold coefficients\n",
"    regressor = LogisticRegression(penalty='l1', solver='saga', C=c, n_jobs=-1,\n",
"                                   max_iter=5000*2, random_state=37)\n",
"    regressor.fit(X_tr, y_tr)\n",
"\n",
"    # predicted probability of the positive class\n",
"    y_pr = regressor.predict_proba(X_te)[:,1]\n",
"\n",
"    score = roc_auc_score(y_te, y_pr)\n",
"    print(f'fold {fold:02}, score={score:.5f}')\n",
"    return score, regressor.coef_[0]\n",
"\n",
"\n",
"skf = StratifiedKFold(n_splits=24, shuffle=True, random_state=37)"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"# one delayed do_validation call per fold, executed by joblib with n_jobs=-1\n",
"fold_jobs = (delayed(do_validation)(fold, tr, te, _C)\n",
"             for fold, (tr, te) in enumerate(skf.split(X, y)))\n",
"outputs = Parallel(n_jobs=-1)(fold_jobs)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"# each output is a (score, coefficients) pair; split them into two collections\n",
"fold_scores = [out[0] for out in outputs]\n",
"fold_coefs = [out[1] for out in outputs]\n",
"\n",
"scores = pd.Series(fold_scores)\n",
"# one row per fold, one column per predictor\n",
"coefs = pd.DataFrame(fold_coefs, columns=X.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Coefficients from k-fold cross-validation that are consistent at least 95% of the time\n",
"- Consistent means the coefficient has the same sign across folds and is non-zero (the predictor was not dropped)."
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"def get_profile(df, col, threshold=0.95):\n",
"    \"\"\"Summarise how consistently one column of ``df`` is positive or negative.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Table whose rows are counted (here: one row of coefficients per fold).\n",
"    col : str\n",
"        Name of the column to profile.\n",
"    threshold : float, default 0.95\n",
"        Minimum share of rows that must be strictly positive, or strictly negative,\n",
"        for the column to be flagged as valid.\n",
"\n",
"    Returns\n",
"    -------\n",
"    dict\n",
"        ``field``, ``n_pos``, ``n_neg``, ``pct_pos``, ``pct_neg`` and ``is_valid`` (1 or 0).\n",
"    \"\"\"\n",
"    s = df[col]\n",
"\n",
"    # zeros count as neither positive nor negative\n",
"    n_pos = int((s > 0).sum())\n",
"    n_neg = int((s < 0).sum())\n",
"\n",
"    # an empty frame has no consistent sign: report shares of 0 instead of dividing by zero\n",
"    n = df.shape[0]\n",
"    p_pos = n_pos / n if n else 0.0\n",
"    p_neg = n_neg / n if n else 0.0\n",
"\n",
"    return {\n",
"        'field': col,\n",
"        'n_pos': n_pos,\n",
"        'n_neg': n_neg,\n",
"        'pct_pos': p_pos,\n",
"        'pct_neg': p_neg,\n",
"        'is_valid': 1 if p_pos >= threshold or p_neg >= threshold else 0\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"# profile every coefficient column, then keep only the ones flagged as valid\n",
"profiles = pd.DataFrame([get_profile(coefs, col) for col in coefs.columns])\n",
"valid_coefs = profiles.sort_values(['is_valid'], ascending=False)\n",
"valid_coefs = valid_coefs[valid_coefs.is_valid == 1]"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LogisticRegression(C=0.2, max_iter=10000, n_jobs=-1, penalty='l1',\n",
" random_state=37, solver='saga')"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# refit on all rows using only the predictors that passed the consistency check\n",
"valid_cols = valid_coefs.field.tolist()\n",
"regressor = LogisticRegression(\n",
"    penalty='l1', solver='saga', C=_C, max_iter=10000, n_jobs=-1, random_state=37)\n",
"regressor.fit(X[valid_cols], y.values.ravel())\n"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"# predicted probability of the positive class for every row\n",
"y_pred = regressor.predict_proba(X[valid_cols])[:,1]\n",
"\n",
"# selected predictors plus the predicted and actual outcome, side by side\n",
"t = X[valid_cols].assign(y_pred=y_pred, y_actual=y)\n",
"# save results\n",
"t.to_csv(\"../data/results/prediction_main_effect_model.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Visualize coefficients"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"# coefficients of the final model, labelled with their predictor names\n",
"c = pd.Series(data=regressor.coef_[0], index=valid_cols)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.style.use('ggplot')\n",
"\n",
"# non-zero coefficients sorted by predictor name, with the intercept placed first\n",
"i = pd.Series([regressor.intercept_[0]], index=['intercept'])\n",
"nonzero = pd.concat([c[c > 0], c[c < 0]]).sort_index()\n",
"s = pd.concat([i, nonzero])\n",
"\n",
"# red bars for positive values, blue for all others\n",
"color = ['r' if v > 0 else 'b' for v in s]\n",
"\n",
"ax = s.plot.bar(color=color, figsize=(20, 4))\n",
"_ = ax.set_title(f'Logistic Regression, validated auc={scores.mean():.5f}')"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# exponentiate the plotted values; values above 1 are drawn in red, the rest in blue\n",
"s_odds = np.exp(s)\n",
"color = ['r' if v > 1 else 'b' for v in s_odds]\n",
"\n",
"ax = s_odds.plot.bar(color=color, figsize=(20, 4))\n",
"_ = ax.set_title('Logistic Regression, coefficient odds')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tabular output of coefficients with odds"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" coefficient | \n",
" coefficient_odds | \n",
"
\n",
" \n",
" \n",
" \n",
" intercept | \n",
" -2.2125 | \n",
" 0.1094 | \n",
"
\n",
" \n",
" Age 30 and over | \n",
" -0.2601 | \n",
" 0.7710 | \n",
"
\n",
" \n",
" Chest pain | \n",
" 0.6929 | \n",
" 1.9995 | \n",
"
\n",
" \n",
" Chills | \n",
" 0.3731 | \n",
" 1.4522 | \n",
"
\n",
" \n",
" Cough | \n",
" 0.4291 | \n",
" 1.5359 | \n",
"
\n",
" \n",
" Difficulty breathing | \n",
" 0.2009 | \n",
" 1.2225 | \n",
"
\n",
" \n",
" Headaches | \n",
" 1.0631 | \n",
" 2.8952 | \n",
"
\n",
" \n",
" Joint pain | \n",
" 0.3509 | \n",
" 1.4204 | \n",
"
\n",
" \n",
" Loss of appetite | \n",
" 0.2733 | \n",
" 1.3143 | \n",
"
\n",
" \n",
" Loss of smell | \n",
" 0.1747 | \n",
" 1.1909 | \n",
"
\n",
" \n",
" Loss of taste | \n",
" 0.3813 | \n",
" 1.4643 | \n",
"
\n",
" \n",
" Race White | \n",
" 0.1132 | \n",
" 1.1199 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" coefficient coefficient_odds\n",
"intercept -2.2125 0.1094\n",
"Age 30 and over -0.2601 0.7710\n",
"Chest pain 0.6929 1.9995\n",
"Chills 0.3731 1.4522\n",
"Cough 0.4291 1.5359\n",
"Difficulty breathing 0.2009 1.2225\n",
"Headaches 1.0631 2.8952\n",
"Joint pain 0.3509 1.4204\n",
"Loss of appetite 0.2733 1.3143\n",
"Loss of smell 0.1747 1.1909\n",
"Loss of taste 0.3813 1.4643\n",
"Race White 0.1132 1.1199"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# coefficients next to their exponentiated values, rounded to 4 decimals\n",
"coef_table = pd.DataFrame({'coefficient': s, 'coefficient_odds': s_odds})\n",
"coef_table.round(decimals=4)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(12, 2)"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# (rows, columns) of the coefficient table; the rows include the intercept\n",
"pd.DataFrame({'coefficient': s, 'coefficient_odds': s_odds}).round(decimals=4).shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.11"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}