"""Pseudocode description of RNN models for predicting CIAKI""" ##### Dataset information ##### import numpy as np import pandas as pd # The datasets used and analyzed during the current study available from the corresponding author on reasonable request. # features: time-invariant features, pandas dataframe with (14185, 52) shape # >>> 51 features used for modeling, 1 feature for patient ID (named 'ID') # features_rnn: time-varying features, numpy object with (14185, 16, 4) shape # >>> The 2nd axis consists of 16 consecutive time varying features in order of getting closer to the CT scan date from the past. # >>> The 3rd axis consists of 'ID', creatinine, eGFR, and elapsed days. # target: pandas series for true label (=CIAKI), coded as 0/1 # name_factors: list object for factor features ##### Dataset preprocessing (missing values & scaling) ##### from sklearn.impute import KNNImputer from sklearn.preprocessing import StandardScaler def impute(X_train, X_test, num=8): # missing value imputation by KNN imputation imputer = KNNImputer(n_neighbors = num, weights = 'uniform') fillna_variables = X_train.drop(['ID'] + name_factors, axis=1).columns.tolist() imputer.fit(X_train[fillna_variables]) X_train[fillna_variables] = imputer.transform(X_train[fillna_variables]) X_test[fillna_variables] = imputer.transform(X_test[fillna_variables]) return X_train, X_test def scale(X_train, X_test): # time-invarinat feature scaling without ID_variables and factor features scaled_variables = X_train.drop(['ID'] + name_factors, axis=1).columns.tolist() scaler = StandardScaler().fit(X_train[scaled_variables]) X_train[scaled_variables] = scaler.transform(X_train[scaled_variables]) X_test[scaled_variables] = scaler.transform(X_test[scaled_variables]) return X_train, X_test def ID_remover(X_train): # ID_variables (Order, ID) drop X_train = X_train.drop('ID', axis=1) return X_train def scale_rnn(X_train_rnn, X_test_rnn, modifier=365): # time-varying feature scaling timestamp = X_train_rnn.shape[1] ### Creatinine, GFR scaling # Creatinine and eGFR and elapsed days were scaled based on the latest results. # They were simply scaled by StandardScaler. scaler = StandardScaler().fit(X_train_rnn[:,-1,:].reshape(-1, 4)[:,1:3]) X_train_rnn_temp = scaler.transform(X_train_rnn[:,:,:].reshape(-1, 4)[:,1:3]) X_test_rnn_temp = scaler.transform(X_test_rnn[:,:,:].reshape(-1, 4)[:,1:3]) X_train_rnn[:,:,1:3] = X_train_rnn_temp.reshape(-1,timestamp,2) X_test_rnn[:,:,1:3] = X_test_rnn_temp.reshape(-1,timestamp,2) ### Elapsed days scaling # Elapsed days were scaled into 0-1 values. # Through this scaling, the more recently measured value has a value closer to 1. 
    X_train_rnn[:, :, 3] = modifier / (modifier + X_train_rnn[:, :, 3])
    X_test_rnn[:, :, 3] = modifier / (modifier + X_test_rnn[:, :, 3])
    return X_train_rnn, X_test_rnn

def ID_remover_rnn(X_train_rnn):
    # Drop the ID channel (index 0 of the 3rd axis), leaving creatinine, eGFR, and elapsed days.
    X_train_rnn = X_train_rnn[:, :, 1:]
    return X_train_rnn

def fillna_rnn(X_train_rnn):
    # Missing values in the time-varying features are replaced with the masking value (set to 100),
    # which the Masking layer of the RNN skips during training.
    X_train_rnn[pd.isnull(X_train_rnn)] = 100
    return X_train_rnn

##### RNN model structure #####
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, LSTM, Masking, Concatenate, Input, BatchNormalization, Dropout, ReLU
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_auc_score
import tensorflow.keras.backend as K
import random as rn
import os

def rnn_maker(mlp_shape, rnn_shape):
    # RNN branch: creatinine, eGFR, and elapsed days over rnn_shape time steps.
    rnn_input = Input(shape=(rnn_shape, 3), name='RNN')
    x1 = Masking(mask_value=100.0)(rnn_input)
    x1 = LSTM(256, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)(x1)
    x1 = LSTM(256, dropout=0.5, recurrent_dropout=0.5)(x1)
    x1 = Dense(256)(x1); x1 = BatchNormalization()(x1); x1 = ReLU()(x1); x1 = Dropout(0.5)(x1)
    # MLP branch: time-invariant features.
    mlp_input = Input(shape=(mlp_shape,), name='MLP')
    x2 = Dense(1024)(mlp_input); x2 = ReLU()(x2); x2 = Dropout(0.5)(x2)
    x2 = Dense(1024)(x2); x2 = ReLU()(x2); x2 = Dropout(0.5)(x2)
    x2 = Dense(1024)(x2); x2 = BatchNormalization()(x2); x2 = ReLU()(x2); x2 = Dropout(0.5)(x2)
    # Concatenate the two branches and predict the CIAKI probability.
    x = Concatenate()([x1, x2])
    x = Dense(1024)(x); x = BatchNormalization()(x); x = ReLU()(x); x = Dropout(0.5)(x)
    x = Dense(1024)(x); x = ReLU()(x); x = Dropout(0.5)(x)
    x = Dense(1024)(x); x = BatchNormalization()(x); x = ReLU()(x); x = Dropout(0.5)(x)
    x = Dense(1024)(x); x = ReLU()(x); x = Dropout(0.5)(x)
    x = Dense(1, activation='sigmoid', name='Out')(x)
    model = Model(inputs=[rnn_input, mlp_input], outputs=x)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    return model

def rnn_learner(kfold, fgt_train, fgt_RNN_train, tgt_train, fgt_val, fgt_RNN_val, tgt_val):
    # Random seeds were set for reproducibility of the results.
    K.clear_session()
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(0)
    rn.seed(0)
    tf.random.set_seed(0)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    # Because the prevalence of CIAKI was 1.8% in the current study,
    # the class_weight of the positive class was set as the reciprocal of the CIAKI prevalence
    # (computed here as the ratio of non-cases to cases in the training set).
    label = tgt_train.sum()
    total = tgt_train.shape[0]
    non_label = total - label
    ratio = non_label / label
    class_weight = {0: 1, 1: ratio}
    model = rnn_maker(fgt_train.shape[1], fgt_RNN_train.shape[1])
    # Temporary path to save RNN models.
    path = 'C:/Users/dacty/temporary_h5/'
    file_path = path + 'temp_rnn_kfold_' + str(kfold) + '.h5'
    es = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
    mc = ModelCheckpoint(file_path, monitor='val_auc', mode='max', save_best_only=True)
    # Model fit.
    model.fit([fgt_RNN_train, fgt_train], tgt_train, epochs=50, batch_size=256,
              validation_data=([fgt_RNN_val, fgt_val], tgt_val),
              class_weight=class_weight, callbacks=[es, mc])
    val_RNN = model.predict([fgt_RNN_val, fgt_val])
    val_auroc = roc_auc_score(tgt_val, val_RNN)
    return file_path, val_auroc
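# A minimal smoke test (illustrative only, not part of the original pipeline): build the
# two-branch network with 51 time-invariant features and 16 time steps, matching the
# dataset described above, and print the resulting architecture.
demo_model = rnn_maker(mlp_shape=51, rnn_shape=16)
demo_model.summary()
del demo_model  # the models used for training are built per fold inside rnn_learner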
##### Model training #####
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import GroupKFold

# Random seed was set for reproducibility of the results.
# patient_split for train/test splitting by patients
patient_split = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
# patient_skf for K-fold cross-validation
patient_skf = GroupKFold(n_splits=10)

def kfold_learner(features, features_rnn, target):
    # The training(+validation) set and the test set were split by patients.
    for train_val, test in patient_split.split(features, target, groups=features['ID']):
        X_train_val, y_train_val = features.loc[train_val].reset_index(drop=True), target.loc[train_val].reset_index(drop=True)
        X_test, y_test = features.loc[test].reset_index(drop=True), target.loc[test].reset_index(drop=True)
        # Both sets were imputed and scaled using statistics from the training set only.
        # Time-invariant features
        X_train_val, X_test = impute(X_train_val, X_test)
        X_train_val, X_test = scale(X_train_val, X_test)
        X_test = ID_remover(X_test)
        # Time-varying features
        X_train_val_rnn, X_test_rnn = features_rnn[train_val, :, :], features_rnn[test, :, :]
        X_train_val_rnn, X_test_rnn = scale_rnn(X_train_val_rnn, X_test_rnn)
        X_train_val_rnn, X_test_rnn = fillna_rnn(X_train_val_rnn), fillna_rnn(X_test_rnn)
        X_test_rnn = ID_remover_rnn(X_test_rnn)
        # K-fold cross-validation
        fold = 0
        list_file_path = []
        list_val_auroc = []
        for train, val in patient_skf.split(X_train_val, y_train_val, groups=X_train_val['ID']):
            # The positional fold indices are valid labels here because the index was reset above.
            X_train, y_train = X_train_val.loc[train], y_train_val.loc[train]
            X_val, y_val = X_train_val.loc[val], y_train_val.loc[val]
            X_train_rnn, X_val_rnn = X_train_val_rnn[train, :, :], X_train_val_rnn[val, :, :]
            X_train = ID_remover(X_train); X_val = ID_remover(X_val)
            X_train_rnn = ID_remover_rnn(X_train_rnn); X_val_rnn = ID_remover_rnn(X_val_rnn)
            file_path, val_auroc = rnn_learner(fold, X_train, X_train_rnn, y_train, X_val, X_val_rnn, y_val)
            list_file_path.append(file_path)
            list_val_auroc.append(val_auroc)
            fold += 1
        # The fold model with the best validation AUROC was selected and evaluated on the test set.
        max_num = np.argmax(list_val_auroc)
        final_RNN_model = load_model(list_file_path[max_num])
        final_RNN_result = final_RNN_model.predict([X_test_rnn, X_test])
        final_RNN_AUROC = roc_auc_score(y_test, final_RNN_result.reshape(-1))
        return final_RNN_model, final_RNN_result, final_RNN_AUROC

RNN_model, RNN_result, RNN_AUROC = kfold_learner(features, features_rnn, target)
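# A minimal usage sketch; the output file name below is an assumption for illustration,
# not taken from the original study.
RNN_model.save('final_rnn_ciaki.h5')  # hypothetical file name
print('Test-set AUROC of the selected RNN model: %.3f' % RNN_AUROC)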