setwd(r"(P:\Diagnostik\Teams\Research Diagnostics\Data Analysis\GBM_cohort_matching\scripts)") library('readxl') library('tidyverse') library('MatchIt') library('survival') library('survminer') library('writexl') # read in processed public datasets gbm_datasets <- list( glass = read.table('../GBM datasets/publications/glass_2019/glass_clinical.txt', header = T, stringsAsFactors = F, sep = '\t'), tcga = read.table('../GBM datasets/publications/tcga_2013/tcga_clinical.txt', header = T, stringsAsFactors = F, sep = '\t' ), mskcc = read.table('../GBM datasets/publications/mskcc_2019/mskcc_clinical.txt', header = T, stringsAsFactors = F, sep = '\t'), lakomy = read.table('../GBM datasets/publications/lakomy_2020/lacomy_clinical.txt', header = T, stringsAsFactors = F, sep = '\t') ) # merge 4 datasets, select common variables for matching public_datasets <- do.call(rbind, lapply(gbm_datasets, function(x) subset(x, select = c(age_at_diagnosis, sex, MGMT, survival_months, vital_status, received_radiotherapy, received_tmz, concurrent_tmz, publication)))) public_datasets <- public_datasets[complete.cases(public_datasets), ] public_datasets$sex <- str_to_title(public_datasets$sex) public_datasets$vital_status <- str_to_title(public_datasets$vital_status) public_datasets$id <- paste0(1:nrow(public_datasets), "_", public_datasets$publication) table(public_datasets$publication) # read in data table and annotate SOC variable, and MGMT status gbm_table <- read_xlsx('../Praxis_datasets/230926_augmented.xlsx') gbm_table$publication <- "Praxis" # process MGMT data gbm_table$marker_mgmt_methylated <- recode(gbm_table$marker_mgmt_methylated, hypermeth = "meth", indeterminate = "unknown", meth. = "meth", na = "unknown", `NA`= "unknown", `na, VBO fragt nach` = "unknown", `na, vbo nachgefordert` = "unknown") gbm_table$marker_mgmt_methylated <- gsub("unknown", NA, gbm_table$marker_mgmt_methylated ) # read in calculated data table from co-author df_praxis <- readRDS('../Praxis_datasets/df_praxis.RDS') df_praxis <- df_praxis[match(gbm_table$id, df_praxis$id), ] gbm_table$age_at_diagnosis <- df_praxis$age_at_first_diagnosis gbm_table$survival_months <- df_praxis$survival_to_followup_months gbm_table$has_died <- df_praxis$has_died gbm_table$time_to_vaccination_months <- df_praxis$time_to_vaccination_months gbm_table_matching <- gbm_table %>% mutate(sex = recode(sex, f = "Female", m = "Male"), vital_status = recode(as.numeric(has_died), `1` = "Dead", `0` = "Alive"), MGMT = marker_mgmt_methylated) %>% filter(age_at_diagnosis > 18 & !is.na(MGMT)) %>% select(c(id, names(public_datasets), time_to_vaccination_months)) # ====================================================================== # select only patients who survived longer than median TTV for matching median_TTV <- median(gbm_table_matching$time_to_vaccination_months) public_datasets <- subset(public_datasets, survival_months > median_TTV) gbm_table_matching$time_to_vaccination_months <- NULL # ======================================================================= # sensor patients at 60 months for both groups. Few patients have follow up data after 60 months. cutoff <- 60 public_long_followups <- which(public_datasets$survival_months > cutoff) if(length(public_long_followups) > 0) { public_datasets$survival_months[public_long_followups] <- cutoff public_datasets$vital_status[public_long_followups] <- "Alive" } praxis_long_followups <- which(gbm_table_matching$survival_months > cutoff) if(length(praxis_long_followups) > 0) { gbm_table_matching$survival_months[praxis_long_followups] <- cutoff gbm_table_matching$vital_status[praxis_long_followups] <- "Alive" } # combine both datasets for matching df_combined <- rbind(gbm_table_matching, public_datasets) df_combined$received_tmz <- factor(df_combined$received_tmz, levels = c(0, 1), labels = c("No", "Yes")) df_combined$concurrent_tmz <- factor(df_combined$concurrent_tmz, levels = c(0, 1), labels = c("No", "Yes")) df_combined$received_radiotherapy <- factor(df_combined$received_radiotherapy, levels = c(0, 1), labels = c("No", "Yes")) df_combined$group <- c(rep(1, nrow(gbm_table_matching)), rep(0, nrow(public_datasets))) write_xlsx(subset(df_combined, publication == "GLASS")[,-c(1, 11)], path = '../GBM paper/supplementary data/supp_table4_GLASS_dataset.xlsx') write_xlsx(subset(df_combined, publication == "Lakomy")[,-c(1, 11)], path = '../GBM paper/supplementary data/supp_table4_Lakomy_dataset.xlsx') write_xlsx(subset(df_combined, publication == "MSKCC")[,-c(1, 11)], path = '../GBM paper/supplementary data/supp_table4_MSKCC_dataset.xlsx') write_xlsx(subset(df_combined, publication == "TCGA")[,-c(1, 11)], path = '../GBM paper/supplementary data/supp_table4_TCGA_dataset.xlsx') write_xlsx(subset(df_combined, publication == "Praxis")[,-c(1, 11)], path = '../GBM paper/supplementary data/supp_table4_treatment_dataset.xlsx') # nearest neighbor matching, requiring the MGMT to match exactly between every patient pairs. set.seed(3) m.out <- matchit(group ~ age_at_diagnosis + sex + MGMT + received_tmz + concurrent_tmz, data = df_combined, method = "nearest", distance = "glm", discard = "both", m.order = "random", exact = "MGMT") # extract the data for the plot quality_df <- data.frame(m.out$treat, m.out$distance, m.out$discarded, m.out$weights) quality_df$group <- "matched_treated" quality_df$group[which(quality_df$m.out.weights == 1 & quality_df$m.out.treat == 0)] <- "matched_control" quality_df$group[which(quality_df$m.out.weights == 0 & quality_df$m.out.treat == 0)] <- "unmatched_control" quality_df$group[which(quality_df$m.out.weights == 0 & quality_df$m.out.treat == 1)] <- "unmatched_treated" quality_df$score <- quality_df$m.out.distance write_xlsx(subset(quality_df, select = c(group, score)), path = '../GBM paper/supplementary data/supp_figure4_sub1.xlsx') write_xlsx(subset(quality_df, m.out.treat == 1, select = c(group, score)), path = '../GBM paper/supplementary data/supp_figure4_sub2_raw_treated.xlsx') write_xlsx(subset(quality_df, m.out.treat == 1 & m.out.weights == 1, select = c(group, score)), path = '../GBM paper/supplementary data/supp_figure4_sub2_matched_treated.xlsx') write_xlsx(subset(quality_df, m.out.treat == 0, select = c(group, score)), path = '../GBM paper/supplementary data/supp_figure4_sub2_raw_control.xlsx') write_xlsx(subset(quality_df, m.out.treat == 0 & m.out.weights == 1, select = c(group, score)), path = '../GBM paper/supplementary data/supp_figure4_sub2_matched_control.xlsx') balance_unmatched <- data.frame(summary(m.out)$sum.all[, 1:3]) balance_unmatched <- cbind(row.names(balance_unmatched), balance_unmatched) names(balance_unmatched)[1] <- "variables" write_xlsx(balance_unmatched, path = '../GBM paper/supplementary data/supp_figure4_sub3_unmatched_variable_balance.xlsx') balance_matched <- data.frame(summary(m.out)$sum.matched[, 1:3]) balance_matched <- cbind(row.names(balance_matched), balance_matched) names(balance_matched)[1] <- "variables" write_xlsx(balance_matched, path = '../GBM paper/supplementary data/supp_figure4_sub3_matched_variable_balance.xlsx') # summary(m.out) # visualizations to assess the balance of matching plot(m.out, type = "jitter", interactive = FALSE, col = "blue") plot(m.out, type = "histogram", interactive = FALSE) # plot(m.out, type = "ecdf", interactive = T) # plot(m.out, type = "density", interactive = T) # plot(m.out, type = "qq", interactive = T) plot(summary(m.out), xlim = c(0, 1)) # obtain matched data for K-M analysis matched_data <- match.data(m.out) matched_data$patient_group <- c("Public", "Praxis")[matched_data$group + 1] matched_data$patient_group <- factor(matched_data$patient_group, levels = c("Praxis", "Public")) table(matched_data$publication) # write.csv(matched_data, file = "../results/matched_datasets_TTV.csv", row.names = F, quote = F) write_xlsx(matched_data[,-c(1, 11:14)], path = "../GBM paper/supplementary data/supp_table5_matched_datasets_control_and_treatment.xlsx") # compare survival between our data and matched data gbm_fit <- survfit(Surv(survival_months, vital_status == "Dead") ~ patient_group, data = matched_data) matching_pval <- surv_pvalue(gbm_fit)$pval.txt matching_pval figure_matching <- ggsurvplot(gbm_fit, data=as.data.frame(matched_data), conf.int=F, pval=matching_pval, censor.shape="|", risk.table=T, surv.median.line="hv", xlab="Overall survival from diagnosis* [months]", risk.table.y.text=F, tables.height = .20, break.time.by=6, palette=c("#FF0000","#000000"), legend.title=element_blank(), legend="bottom", legend.labs=c("our data","publication data"), tables.y.text="", tables.theme = theme_cleantable(base_size=12, base_family = "Arial"), ggtheme = theme_classic2(base_size=12, base_family = "Arial"), font.family = "Arial" ) figure_matching + ggtitle("Survival comparison of public and Praxis GBM patients") png("../results/matching_ttv_filter.png", width = 540, height = 400) print(figure_matching) dev.off() # model coefficient for calculating propensity scores # # df_coef <- data.frame(variables = names(coef(m.out$model)), coefficients = as.numeric(coef(m.out$model))) # write_xlsx(df_coef, path = "../results/logistic_regression_coefficients.xlsx") # multivariate analysis multivariate_cox_table <- function(tempfit){ temp <- summary(tempfit) results <- data.frame(cbind(temp$coefficients[,1], temp$coefficients[,5], temp$conf.int[,1], temp$conf.int[,3], temp$conf.int[,4])) names(results) <- c("coefficient", "p-value", "hazard_ratio", "hazard_ratio_lower", "hazard_ratio_upper") tempzph <- cox.zph(tempfit) pvalue_ph_test <- tempzph$table[-nrow(tempzph$table),3] results$pvalue_ph_test <- as.numeric(pvalue_ph_test) results <- data.frame(sapply(results, function(x) as.numeric(x))) results$variable <- row.names(temp$coefficients) results } matched_data$patient_group <- factor(matched_data$patient_group, levels = c("Public", "Praxis")) coxfit <- coxph(Surv(survival_months, vital_status == "Dead") ~ age_at_diagnosis + sex + MGMT + received_tmz + concurrent_tmz + patient_group, data = matched_data) cox_table <- multivariate_cox_table(coxfit) write_xlsx(cox_table, path = "../results/cox_multivariate_TTV.xlsx") write_xlsx(cox_table, path = "../GBM paper/supplementary data/supp_table5_multivariate_cox_results.xlsx")