Table 1.
Task | Code in R language |
---|---|
1) Data preparation | |
1.1) Import new dataset called "db" (see the package preamble sketch after the table) | db <- read.csv(url("https://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data"), header = FALSE, na.strings = "?") # the raw file has no header row and marks missing values with "?" |
1.2) Label columns in "db" | colnames(db) <- c("BIRADS", "Age", "Shape", "Margin", "Density", "outcome") |
1.3) Assign factor levels for categorical variables and set the reference category (shown for Margin; Shape, Density, and outcome are recoded analogously, with outcome levels "Benign" and "Malignant") | db$Margin <- as.factor(ifelse(db$Margin == "1", "Circumscribed", ifelse(db$Margin == "2", "Microlobulated", ifelse(db$Margin == "3", "Obscured", ifelse(db$Margin == "4", "Ill-defined", "Spiculated"))))); db$Margin <- relevel(db$Margin, ref = "Circumscribed") |
1.4) Assign numeric status for numerical variables | db$Age <- as.numeric(db$Age) |
1.5) Replace implausible age values with missing values | AGE_upper <- 120; AGE_lower <- 0; db$Age <- ifelse(db$Age >= AGE_upper, NA, db$Age); db$Age <- ifelse(db$Age <= AGE_lower, NA, db$Age) |
1.6) Remove variables that have over 50% missing values | missing_col <- colMeans(is.na(db)); remove <- vector(); for (i in 1:length(missing_col)) { if (missing_col[i] >= 0.5) { remove <- append(remove, names(missing_col[i])) } }; if (!is.logical(remove)) db <- db %>% dplyr::select(-!!remove) |
1.7) Split dataset into development and validation sets | train_index <- createDataPartition(db$outcome, p = .8, list = FALSE, times = 1); db_train <- db[train_index, ]; db_test <- db[-train_index, ] |
1.8) Define recipe for data pre-processing | recipe <- recipe(outcome ~ Age + Shape + Margin + Density, data = db_train); recipe <- recipe %>% step_knnimpute(all_predictors(), neighbors = 5) %>% step_BoxCox(all_numeric(), -all_outcomes()) %>% step_other(all_nominal(), threshold = .1, other = "other") %>% step_zv(all_predictors(), -all_outcomes()) %>% step_nzv(all_predictors(), -all_outcomes()) %>% step_normalize(all_numeric(), -all_outcomes()) %>% step_dummy(all_nominal(), -all_outcomes()) %>% step_corr(all_predictors(), -all_outcomes(), threshold = 0.9) |
1.9) Show all steps in the recipe | prep <- prep(recipe, db_train); tidy(prep) |
1.10) Examine changes of a specific pre-processing step (number 6 = normalize) | tidy(prep, number = 6) |
1.11) Examine pre-processed training data | prep[["template"]] |
2) Algorithm Development | |
2.1) Define multiple performance metrics for model training | MySummary <- function(data, lev = NULL, model = NULL) { a1 <- defaultSummary(data, lev, model); b1 <- twoClassSummary(data, lev, model); c1 <- prSummary(data, lev, model); out <- c(a1, b1, c1); out } |
2.2) Define general training parameters for cross-validation and hypergrid search (requires a registered parallel backend; see the sketch after the table) | cv <- trainControl(method = "repeatedcv", number = 10, repeats = 3, search = "grid", verboseIter = TRUE, classProbs = TRUE, returnResamp = "final", savePredictions = "final", summaryFunction = MySummary, selectionFunction = "tolerance", allowParallel = TRUE) |
2.3) Define general training parameters for cross-validation and random grid search (stored under its own name so it does not overwrite the hypergrid control object from 2.2) | cv_random <- trainControl(method = "repeatedcv", number = 10, repeats = 3, search = "random", verboseIter = TRUE, classProbs = TRUE, returnResamp = "final", savePredictions = "final", summaryFunction = MySummary, selectionFunction = "tolerance", allowParallel = TRUE) |
2.4) Define general training parameters for adaptive resampling for hyperparameter tuning | adaptControl <- trainControl(method = "adaptive_cv", number = 10, repeats = 3, adaptive = list(min = 5, alpha = 0.05, method = "gls", complete = TRUE), search = "random", verboseIter = TRUE, classProbs = TRUE, returnResamp = "final", savePredictions = "final", summaryFunction = MySummary, selectionFunction = "tolerance", allowParallel = TRUE) |
2.5) Hypergrid for Logistic Regression with Elastic Net Penalty | hyper_grid_glm <- expand.grid(alpha = seq(from = 0.01, to = 1, by = 0.01), lambda = seq(from = 0.01, to = 1, by = 0.01)) |
2.6) Hypergrid for XGBoost Tree | hyper_grid_xgboost <- expand.grid(nrounds = seq(from = 25, to = 100, by = 25), max_depth = seq(from = 5, to = 35, by = 10), eta = seq(from = 0.2, to = 1, by = 0.2), gamma = seq(from = 1, to = 10, by = 1), colsample_bytree = seq(from = 0.6, to = 1, by = 0.2), min_child_weight = seq(from = 2, to = 5, by = 1), subsample = 1) |
2.7) Hypergrid for MARS algorithm | hyper_grid_mars <- expand.grid(degree = seq(from = 1, to = 3, by = 1), nprune = seq(from = 1, to = 10, by = 1)) |
2.8) Hypergrid for SVM with polynomial kernel | hyper_grid_svm <- expand.grid(degree = seq(from = 1, to = 11, by = 2), scale = seq(from = 0.1, to = 1, by = 0.1), C = seq(from = 0.5, to = 8, by = 0.5)) |
2.9) Hypergrid for multi-layer perceptron with dropout cost (deep neural network) | hyper_grid_nn <- expand.grid(size = seq(from = 1, to = 21, by = 10), dropout = seq(from = 0.1, to = 0.3, by = 0.1), batch_size = seq(from = 1, to = 11, by = 5), lr = seq(from = 0.25, to = 1, by = 0.25), rho = seq(from = 0.25, to = 1, by = 0.25), decay = seq(from = 0.1, to = 0.5, by = 0.2), cost = seq(from = 0.25, to = 1, by = 0.25), activation = 'relu') |
2.10) Train Logistic Regression with Elastic Net Penalty (hypergrid search) | cv_glm <- caret::train(recipe, data = db_train, method = "glmnet", metric = "Kappa", trControl = cv, tuneGrid = hyper_grid_glm) |
2.11) Train XGBoost Tree (hypergrid search) | cv_xgboost <- caret::train(recipe, data = db_train, method = "xgbTree", metric = "Kappa", trControl = cv, tuneGrid = hyper_grid_xgboost) |
2.12) Train MARS algorithm (hypergrid search) | cv_mars <- caret::train(recipe, data = db_train, method = "earth", metric = "Kappa", trControl = cv, tuneGrid = hyper_grid_mars) |
2.13) Train SVM with polynomial kernel (random grid search) | cv_svm <- caret::train(recipe, data = db_train, method = "svmPoly", metric = "Kappa", tuneLength = 30, trControl = cv_random) |
2.14) Train multi-layer perceptron with dropout cost (deep neural network) (random grid search) | cv_nn <- caret::train(recipe, data = db_train, method = "mlpKerasDropoutCost", metric = "Kappa", tuneLength = 30, trControl = cv_random) |
3) Internal Testing | |
3.1) Show final model with optimal hyperparameters for Logistic Regression with Elastic Net Penalty | cv_glm$bestTune |
3.2) Show internal testing results for final model with optimal hyperparameters (Logistic Regression with Elastic Net Penalty) | merge(cv_glm$results, cv_glm$bestTune) # returns the results row matching bestTune |
4) (External) Validation | |
4.1) Use trained model to predict outcome probabilities in the validation set | predict(cv_glm, db_test, type = "prob") |
4.2) Use trained model to predict outcome classes in the validation set | predict(cv_glm, db_test) |
4.3) Calculate area under the ROC curve for outcome predictions in the validation set | roc_glm_validation = roc(as.vector(db_test$outcome), as.matrix(predict(cv_glm, db_test, type = "prob")$"Malignant")); auc_glm_validation = pROC::auc(roc_glm_validation); auc_CI_glm_validation = pROC::ci.auc(roc_glm_validation, method = "bootstrap", boot.stratified = TRUE) |
4.4) Plot ROC curves | plot.roc(roc_glm_validation, legacy.axes = TRUE); lines(roc_glm_validation, col = "blue"); lines(roc_xgboost_validation, col = "red"); lines(roc_mars_validation, col = "orange"); lines(roc_svm_validation, col = "black"); lines(roc_nn_validation, col = "grey60"); legend("bottomright", legend = c("LR with Elastic Net Penalty", "XGBoost Tree", "MARS", "SVM", "neural network"), col = c("blue", "red", "orange", "black", "grey60"), lwd = 2) # lwd is needed for legend() to draw the colored key lines |
4.5) Create confusion matrix | confusionMatrix(as.factor(predict(cv_glm, db_test)), factor(db_test$outcome), positive = "Malignant") |
4.6) Create calibration plot | glm_calplot_validation <- calibration(factor(db_test$outcome) ~ as.matrix(predict(cv_glm, db_test, type = "prob")$"Malignant"), data = db_test, cuts = 10); xyplot(glm_calplot_validation, auto.key = list(columns = 2)) |
4.7) Calculate Spiegelhalter's z score to assess model calibration (a toy usage example follows the table) | Spiegelhalter_z = function(y, prob) { alpha = 0.05; z_score = sum((y - prob) * (1 - 2 * prob)) / sqrt(sum(((1 - 2 * prob)^2) * prob * (1 - prob))); print(z_score); if (abs(z_score) > qnorm(1 - alpha / 2)) { print('reject null. NOT calibrated') } else { print('fail to reject. calibrated') }; cat('z score: ', z_score, '\n'); cat('p value: ', 2 * (1 - pnorm(abs(z_score))), '\n'); return(z_score) }; Spiegelhalter_z(unfactor(revalue(db_test$outcome, c("Malignant" = 1, "Benign" = 0))), as.matrix(predict(cv_glm, db_test, type = "prob")$"Malignant")) |
4.8) Test for differences in area under the curve between two algorithms | roc.test(roc_glm_validation, roc_mars_validation, method = "bootstrap", alternative = "two.sided", boot.n = 2000, boot.stratified = TRUE) |
4.9) Test for differences in diagnostic performance between two algorithms using a McNemar test | mcnemar.test(predict(cv_glm, db_test), predict(cv_mars, db_test), correct = TRUE) |
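
The snippets in Table 1 call functions from several packages without showing the corresponding `library()` calls. A minimal preamble sketch, assuming the standard CRAN packages that export the functions used above (the exact package set is an assumption; adjust it to your installation):

```r
# Assumed package preamble for Table 1 (not shown in the original code).
library(plyr)       # revalue(); attach before dplyr so dplyr verbs are not masked
library(dplyr)      # %>% pipe and select()
library(caret)      # createDataPartition(), trainControl(), train(), confusionMatrix(), calibration()
library(recipes)    # recipe(), the step_*() functions, prep(), tidy()
library(pROC)       # roc(), auc(), ci.auc(), roc.test(), plot.roc()
library(lattice)    # xyplot() for the calibration plot
library(varhandle)  # unfactor()
# caret::train() additionally loads the model backends on demand:
# glmnet ("glmnet"), xgboost ("xgbTree"), earth ("earth"),
# kernlab ("svmPoly"), and keras ("mlpKerasDropoutCost").
```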
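Every `trainControl()` object in section 2 sets `allowParallel = TRUE`, but caret only parallelises resampling after a parallel backend has been registered. A minimal sketch using `doParallel` (the worker count of 4 is an arbitrary assumption; match it to your hardware):

```r
library(doParallel)        # also attaches foreach and parallel
cl <- makePSOCKcluster(4)  # start 4 worker processes (arbitrary choice)
registerDoParallel(cl)     # from here on, allowParallel = TRUE takes effect
# ... run the caret::train() calls from section 2 ...
stopCluster(cl)            # release the workers once training is finished
```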
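The `Spiegelhalter_z()` function in step 4.7 implements the statistic z = Σ(y_i - p_i)(1 - 2p_i) / sqrt(Σ(1 - 2p_i)^2 p_i(1 - p_i)), which is approximately standard normal when the predicted probabilities are well calibrated. A toy usage example, independent of the trained models (all values are synthetic, for illustration only); because the outcomes are simulated from the probabilities themselves, the predictions are calibrated by construction and the test should usually fail to reject:

```r
set.seed(1)
p <- runif(200)                       # hypothetical predicted probabilities
y <- rbinom(200, size = 1, prob = p)  # outcomes drawn from p: calibrated by construction
z <- Spiegelhalter_z(y, p)            # expect |z| < 1.96, i.e. "fail to reject. calibrated"
```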