def allocate_gpu_memory(gpu_number=0):
    """Make one GPU visible to TensorFlow and enable memory growth on it.

    Args:
        gpu_number: index into the list returned by
            ``tf.config.experimental.list_physical_devices('GPU')``.
            Negative indices keep their usual list semantics.

    Returns:
        None. Progress and problems are reported with ``print`` only; a
        ``RuntimeError`` raised by the TensorFlow configuration calls is
        printed, not propagated.
    """
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    if not physical_devices:
        print("Not enough GPU hardware devices available")
        return

    print("Found {} GPU(s)".format(len(physical_devices)))

    # Resolve the device before touching the TF config so that an index past
    # the end of the list is reported instead of escaping as an IndexError.
    try:
        device = physical_devices[gpu_number]
    except IndexError:
        print("Not enough GPU hardware devices available")
        return

    try:
        tf.config.set_visible_devices(device, 'GPU')
        tf.config.experimental.set_memory_growth(device, True)
        print("#{} GPU memory is allocated".format(gpu_number))
    except RuntimeError as e:
        # Best-effort by design: keep the notebook running and show the reason.
        print(e)
def gru_layer(hidden_dim, dropout):
    """Return a bidirectional GRU layer that outputs the full sequence."""
    return L.Bidirectional(
        L.GRU(
            hidden_dim,
            dropout=dropout,
            return_sequences=True,
            kernel_initializer='orthogonal',
        )
    )


def lstm_layer(hidden_dim, dropout):
    """Return a bidirectional LSTM layer that outputs the full sequence."""
    return L.Bidirectional(
        L.LSTM(
            hidden_dim,
            dropout=dropout,
            return_sequences=True,
            kernel_initializer='orthogonal',
        )
    )


def build_model(seq_len=107, pred_len=68, dropout=0.5, embed_dim=100, hidden_dim=256, type=0):
    """Build and compile the recurrent model.

    The input has ``seq_len`` positions with 6 channels each. The first 3
    channels are embedded as token ids; the remaining channels are
    concatenated to the flattened embeddings as-is.

    Args:
        seq_len: number of input positions.
        pred_len: number of leading positions kept for the output head.
        dropout: dropout rate passed to every recurrent layer.
        embed_dim: embedding size per categorical channel.
        hidden_dim: units per direction in every recurrent layer.
        type: recurrent stack to use --
            0: GRU, GRU, GRU; 1: GRU, GRU, LSTM; 2: LSTM, LSTM, LSTM.

    Returns:
        A ``tf.keras.Model`` compiled with Adam and the ``mcrmse`` loss; it
        outputs 5 linear values for each of the first ``pred_len`` positions.

    Raises:
        ValueError: if ``type`` is not one of the supported stacks.
    """
    recurrent_stacks = {
        0: (gru_layer, gru_layer, gru_layer),
        1: (gru_layer, gru_layer, lstm_layer),
        2: (lstm_layer, lstm_layer, lstm_layer),
    }
    # Validate before building anything: an unknown value used to surface
    # only later as an unbound `hidden` variable.
    if type not in recurrent_stacks:
        raise ValueError(
            "unsupported model type {!r}; expected one of {}".format(
                type, sorted(recurrent_stacks)
            )
        )

    # split categorical and numerical features and concatenate them later.
    categorical_feat_dim = 3
    numerical_feat_dim = 3
    inputs = L.Input(shape=(seq_len, categorical_feat_dim + numerical_feat_dim))
    categorical_fea = inputs[:, :, :categorical_feat_dim]
    numerical_fea = inputs[:, :, categorical_feat_dim:]

    embed = L.Embedding(input_dim=len(token2int), output_dim=embed_dim)(categorical_fea)
    # Merge the per-channel embeddings into one feature vector per position.
    reshaped = tf.reshape(embed, shape=(-1, embed.shape[1], embed.shape[2] * embed.shape[3]))
    reshaped = L.concatenate([reshaped, numerical_fea], axis=2)

    hidden = reshaped
    for make_layer in recurrent_stacks[type]:
        hidden = make_layer(hidden_dim, dropout)(hidden)

    # Only the first `pred_len` positions are passed to the output head.
    truncated = hidden[:, :pred_len]
    out = L.Dense(5, activation='linear')(truncated)
    model = tf.keras.Model(inputs=inputs, outputs=out)
    model.compile(tf.keras.optimizers.Adam(), loss=mcrmse)
    return model
# Token vocabulary shared by the sequence, structure and loop-type strings.
token2int = {x: i for i, x in enumerate('().ACGUBEHIMSX')}
pred_cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']


def preprocess_inputs(df, cols=('sequence', 'structure', 'predicted_loop_type')):
    """Turn a frame of RNA records into the model's input array.

    Args:
        df: frame holding the string columns named in ``cols`` plus the
            per-position list columns ``bpps_sum``, ``bpps_max`` and
            ``bpps_nb``.
        cols: names of the string columns to encode with ``token2int``.
            Assumes all strings (and bpps lists) in ``df`` share one length.

    Returns:
        Array of shape ``(len(df), seq_len, len(cols) + 3)``: the encoded
        token channels in ``cols`` order, followed by bpps_sum, bpps_max and
        bpps_nb.

    Raises:
        KeyError: if a string contains a character missing from ``token2int``.
    """
    # Plain comprehensions instead of DataFrame.applymap, which newer pandas
    # releases deprecate; the result is identical.
    encoded = np.array([
        [[token2int[ch] for ch in seq] for seq in row]
        for row in df[list(cols)].values.tolist()
    ])
    # (rows, channels, positions) -> (rows, positions, channels)
    base_fea = np.transpose(encoded, (0, 2, 1))

    bpps_fea = [
        np.array(df[name].to_list())[:, :, np.newaxis]
        for name in ('bpps_sum', 'bpps_max', 'bpps_nb')
    ]
    return np.concatenate([base_fea] + bpps_fea, 2)


def rmse(y_actual, y_pred):
    """Return the square root of Keras' mean squared error of the inputs."""
    mse = tf.keras.losses.mean_squared_error(y_actual, y_pred)
    # tf.sqrt keeps the loss on the tf.keras stack the model is built with,
    # rather than depending on the standalone `keras` package.
    return tf.sqrt(mse)


def mcrmse(y_actual, y_pred, num_scored=len(pred_cols)):
    """Average ``rmse`` over the first ``num_scored`` target channels.

    Channels are taken from the last axis of the 3-D inputs.
    """
    score = 0
    for i in range(num_scored):
        score += rmse(y_actual[:, :, i], y_pred[:, :, i]) / num_scored
    return score
# additional features

def _load_bpps(mol_id, bpps_dir):
    """Load the saved bpps array for one molecule id."""
    return np.load(f"{bpps_dir}/{mol_id}.npy")


def read_bpps_sum(df, bpps_dir="../input/stanford-covid-vaccine/bpps"):
    """Return, for every id in ``df``, the row-wise sum of its bpps array.

    Args:
        df: frame with an ``id`` column naming the ``<id>.npy`` files.
        bpps_dir: directory that holds the ``.npy`` files.

    Returns:
        List with one 1-D array per row of ``df``.
    """
    # Previously this took the max; sum and max were swapped between the two
    # readers, so each column held the other statistic.
    return [_load_bpps(mol_id, bpps_dir).sum(axis=1) for mol_id in df.id.to_list()]


def read_bpps_max(df, bpps_dir="../input/stanford-covid-vaccine/bpps"):
    """Return, for every id in ``df``, the row-wise max of its bpps array.

    Args:
        df: frame with an ``id`` column naming the ``<id>.npy`` files.
        bpps_dir: directory that holds the ``.npy`` files.

    Returns:
        List with one 1-D array per row of ``df``.
    """
    return [_load_bpps(mol_id, bpps_dir).max(axis=1) for mol_id in df.id.to_list()]


def read_bpps_nb(df, bpps_dir="../input/stanford-covid-vaccine/bpps",
                 bpps_nb_mean=0.077522, bpps_nb_std=0.08914):
    """Return the standardised fraction of non-zero bpps entries per position.

    Args:
        df: frame with an ``id`` column naming the ``<id>.npy`` files.
        bpps_dir: directory that holds the ``.npy`` files.
        bpps_nb_mean: mean of bpps_nb across all training data.
        bpps_nb_std: std of bpps_nb across all training data.

    Returns:
        List with one 1-D array per row of ``df``.
    """
    # normalized non-zero number
    # from https://www.kaggle.com/symyksr/openvaccine-deepergcn
    bpps_arr = []
    for mol_id in df.id.to_list():
        bpps = _load_bpps(mol_id, bpps_dir)
        # NOTE(review): counts along axis 0 while the sum/max readers reduce
        # along axis 1; equivalent only if the array is symmetric -- confirm.
        bpps_nb = (bpps > 0).sum(axis=0) / bpps.shape[0]
        bpps_arr.append((bpps_nb - bpps_nb_mean) / bpps_nb_std)
    return bpps_arr
# Attach the three per-position bpps feature columns to both frames,
# train first and then test for each feature.
for column, reader in (
    ('bpps_sum', read_bpps_sum),
    ('bpps_max', read_bpps_max),
    ('bpps_nb', read_bpps_nb),
):
    train[column] = reader(train)
    test[column] = reader(test)

# clustering for GroupKFold
# expecting more accurate CV by putting similar RNAs into the same fold.
# Channel 0 of the preprocessed array is the first encoded column
# ('sequence' with the default column order).
sequence_tokens = preprocess_inputs(train)[:, :, 0]
kmeans_model = KMeans(n_clusters=200, random_state=110)
kmeans_model.fit(sequence_tokens)
train['cluster_id'] = kmeans_model.labels_
# Load the pre-generated augmentation table and preview its first rows.
# The preview shows the columns id, sequence, structure, log_gamma, score,
# cnt and predicted_loop_type.
# NOTE(review): this rebinds `aug_data`, which the setup cell already set to
# '../input/openvaccine-augmentation-data/aug_data1.csv' (a different path).
# The stored output was produced with the path below -- keep a single
# definition in the config cell once the correct dataset slug is confirmed.
aug_data = '../input/openvaccineaugmentationdata/aug_data1.csv'
aug_df = pd.read_csv(aug_data)
display(aug_df.head())
\n | id | \nsequence | \nstructure | \nlog_gamma | \nscore | \ncnt | \npredicted_loop_type | \n
---|---|---|---|---|---|---|---|
0 | \nid_fff546103 | \nGGAAAGCUAGGACGUGGGAGCGUAGCUCUCCACACGGGUACGCCAA... | \n.....((((((((((((((((...)))).)))).((((((((((..... | \n2 | \n0.981885 | \n3 | \nEEEEESSSSSSSSSSSSSSSSHHHSSSSBSSSSMSSSSSSSSSSHH... | \n
1 | \nid_18ff9d670 | \nGGAAAGAGCUCGUGAGAAGAAUCUAGUACAUGCAUACGCUACAUCU... | \n.....(((.((.......((..((((.....((....)).....))... | \n0 | \n0.887485 | \n5 | \nEEEEESSSISSIIIIIIISSIISSSSIIIIISSHHHHSSIIIIISS... | \n
2 | \nid_177cd630b | \nGGAAAGAAGUAGCACGGUCCUAAGGUUACUGUAGCUAUGUCCAGCG... | \n(....((.((((((((((((...))..))))).))))).))(((.(... | \n2 | \n0.923722 | \n3 | \nSMMMMSSISSSSSSSSSSSSHHHSSBBSSSSSBSSSSSISSSSSIS... | \n
3 | \nid_17a9ad5b7 | \nGGAAAACACUGCAAAAGUCAACGAAGAAGUUGACUAAGAAGUGAUC... | \n......((((((...(((((((......))))))).....((((((... | \n2 | \n0.977602 | \n3 | \nEEEEEESSSSSSMMMSSSSSSSHHHHHHSSSSSSSMMMMMSSSSSS... | \n
4 | \nid_17ab91518 | \nGGAAAACGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCG... | \n......(((((((((((((((((((((((((((((....)))))))... | \n2 | \n0.982851 | \n3 | \nEEEEEESSSSSSSSSSSSSSSSSSSSSSSSSSSSSHHHHSSSSSSS... | \n