--- title: "Knee MR results" author: "Robyn L. Ball, PhD" date: '2018-06-01' output: pdf_document: default html_document: df_print: paged --- Results from the MR experiment. 7 general radiologists + 2 orthopedic sugeons Read in data ```{r} indir <- "../data/" odir <- "../results/" fn <- paste0(indir,"2018-05-30_knee-mr-all-labels22.csv") data <- read.csv(fn, head=T) fn <- paste0(indir,"predictions-precise.csv") mdf <- read.csv(fn,head=F) colnames(mdf) <- c("abnormal","acl","meniscus") ``` ```{r} labels <- c("abnormal","acl","meniscus") group1 <- paste0("gen",1:4) # unassisted first df <- data ``` compute exact Fleiss kappa for MSKs ```{r} library(irr) is.msk <- which(grepl("msk\\d.",colnames(df))) nlab <- length(labels) fleiss <- rep(NA, nlab) names(fleiss) <- labels for (i in 1:nlab) { is.label <- which(grepl(labels[i],colnames(df))) is.set <- intersect(is.label,is.msk) tt <- df[,is.set] fleiss[i] <- kappam.fleiss(tt, exact = TRUE)$value } round(fleiss,3) ``` get stacked df to calculate overall radiologist macro-averages and get stacked df for overall clinical expert macro-averages (rads + surgeons) sdf.a = assisted rad; sdf.c = assisted clinical experts sdf.ua = unassisted rad; sdf.uac = unassisted clinical experts struth = stacked MSK truth for rads; struth.c = stacked truth for clinical experts ```{r} is.gen <- which(grepl("gen",colnames(df))) # rads is.sur <- which(grepl("sur",colnames(df))) # surgeons is.clin <- union(is.gen,is.sur) # clinical experts is.ass <- which(grepl("assisted",colnames(df))) ngens <- 7 nsur <- 2 sdf.ua <- sdf.a <- array(NA, dim=c(ngens*nrow(df), nlab), dimnames = list(NULL,labels)) sdf.uac <- sdf.c <- array(NA, dim=c((ngens+nsur)*nrow(df), nlab), dimnames = list(NULL,labels)) for (i in 1:nlab) { # rads is.lab <- which(grepl(labels[i],colnames(df))) is.set2 <- intersect(intersect(is.gen,is.lab),is.ass) is.set1 <- setdiff(intersect(is.gen,is.lab),is.set2) ss1 <- df[,is.set1[1]] ss2 <- df[,is.set2[1]] for (j in 2:length(is.set1)) { ss1 <- c(ss1,df[,is.set1[j]]) ss2 <- c(ss2,df[,is.set2[j]]) } sdf.ua[,i] <- ss1 sdf.a[,i] <- ss2 # clinical experts is.set2 <- intersect(intersect(is.clin,is.lab),is.ass) is.set1 <- setdiff(intersect(is.clin,is.lab),is.set2) ss1 <- df[,is.set1[1]] ss2 <- df[,is.set2[1]] for (j in 2:length(is.set1)) { ss1 <- c(ss1,df[,is.set1[j]]) ss2 <- c(ss2,df[,is.set2[j]]) } sdf.uac[,i] <- ss1 sdf.c[,i] <- ss2 } struth <- data.frame(abnormal=rep(df$msk_abnormal,7), acl=rep(df$msk_acl,7), meniscus=rep(df$msk_meniscus,7)) struth.c <- data.frame(abnormal=rep(df$msk_abnormal,9), acl=rep(df$msk_acl,9), meniscus=rep(df$msk_meniscus,9)) ``` Set up functions for sensitivity, specificity, accuracy ```{r} .tp <- function(gt,prediction) { length(which(gt==1 & prediction==1)) } .fp <- function(gt,prediction) { length(which(gt==0 & prediction==1)) } .fn <- function(gt,prediction) { length(which(gt==1 & prediction==0)) } .tn <- function(gt,prediction) { length(which(gt==0 & prediction==0)) } ``` make available to use with bootstrap by using indices ```{r} require(binom) # dd is a dataframe with columns gt & prediction get.measures <- function(dd, indices) { TP <- .tp(gt = dd$gt[indices], prediction=dd$prediction[indices]) TN <- .tn(gt = dd$gt[indices], prediction=dd$prediction[indices]) FP <- .fp(gt = dd$gt[indices], prediction=dd$prediction[indices]) FN <- .fn(gt = dd$gt[indices], prediction=dd$prediction[indices]) #sens <- TP/(TP + FN) sens <- binom.confint(x=TP,n=TP+FN, methods = "wilson")[4:6] #spec <- TN/(TN + FP) spec <- binom.confint(x=TN,n=TN+FP, methods = "wilson")[4:6] #acc <- (TP + TN)/nrow(dd) acc <- binom.confint(x=TP+TN,n=nrow(dd), methods = "wilson")[4:6] unlist(c(spec=spec, sens=sens, acc=acc)) } ``` is.gen.a = columns for assisted radiologists is.gen.ua = columns for unassisted radiologists is.clin.a = columns for assisted clinical experts (rads & surgeons) is.clin.ua = columns for unassisted clinical experts (rads & surgeons) ```{r} is.sur <- which(grepl("sur",colnames(df))) is.gen.a <- intersect(is.gen,is.ass) is.gen.ua <- setdiff(is.gen,is.gen.a) is.clin.a <- intersect(is.clin,is.ass) is.clin.ua <- setdiff(is.clin,is.clin.a) ``` model thresholds -- use >= 0.5 calculate performance on test set measures.a = performance with model assistance measures.ua = performance unassisted measures.diff = measures.a - measures.ua ```{r} nmeas <- 3 nrads <- 7 nsur <- 2 rnames <-c(paste0("gen",1:nrads),paste0("sur",1:nsur), "Radiologists macro-average","Clinical experts macro-average","Model") cnames <- c("Specificity", "Specificity Lower", "Specificity Upper", "Sensitivity", "Sensitivity Lower", "Sensitivity Upper", "Accuracy", "Accuracy Lower", "Accuracy Upper") measures.a <- measures.ua <- measures.diff <- array(NA, dim=c(nrads+nsur+3,nmeas*3,nlab), dimnames = list(rnames,cnames,labels)) for (i in 1:nlab) { is.lab <- which(grepl(labels[i],colnames(df))) gt <- df[,paste0("msk_",labels[i])] for (j in 1:nrads) { is.rad <- which(grepl(paste0("gen",j),colnames(df))) is.seta <- intersect(is.ass,intersect(is.lab,is.rad)) is.setua <- setdiff(intersect(is.lab,is.rad),is.seta) dd <- data.frame(gt=gt,prediction=df[,is.setua]) measures.ua[j,,i] <- get.measures(dd,indices=1:nrow(dd)) dd$prediction <- df[,is.seta] measures.a[j,,i] <- get.measures(dd,indices=1:nrow(dd)) measures.diff[j,,i] <- measures.a[j,,i] - measures.ua[j,,i] } for (kk in 1:2) { is.sur <- which(grepl(paste0("sur",kk),colnames(df))) is.seta <- intersect(is.ass,intersect(is.lab,is.sur)) is.setua <- setdiff(intersect(is.lab,is.sur),is.seta) dd <- data.frame(gt=gt,prediction=df[,is.setua]) measures.ua[j+kk,,i] <- get.measures(dd,indices=1:nrow(dd)) dd$prediction <- df[,is.seta] measures.a[j+kk,,i] <- get.measures(dd,indices=1:nrow(dd)) measures.diff[j+kk,,i] <- measures.a[j+kk,,i] - measures.ua[j+kk,,i] } # rad macro-averages dd <- data.frame(gt=struth[,i],prediction=sdf.ua[,i]) measures.ua[j+kk+1,,i] <- get.measures(dd,indices=1:nrow(dd)) dd$prediction <- sdf.a[,i] measures.a[j+kk+1,,i] <- get.measures(dd,indices=1:nrow(dd)) measures.diff[j+kk+1,,i] <- measures.a[j+kk+1,,i] - measures.ua[j+kk+1,,i] # clinical experts macro-averages dd <- data.frame(gt=struth.c[,i],prediction=sdf.uac[,i]) measures.ua[j+kk+2,,i] <- get.measures(dd,indices=1:nrow(dd)) dd$prediction <- sdf.c[,i] measures.a[j+kk+2,,i] <- get.measures(dd,indices=1:nrow(dd)) measures.diff[j+kk+2,,i] <- measures.a[j+kk+2,,i] - measures.ua[j+kk+2,,i] #model dd <- data.frame(gt,prediction=as.numeric(mdf[,i] >= 0.5)) measures.ua[j+kk+3,,i] <- get.measures(dd,indices=1:nrow(dd)) } ``` write pretty ```{r} dd2 <- measures.a[-nrow(measures.a),,] sname <- "assisted" kk1 <- seq(1,9,3) kk2 <- seq(2,9,3) kk3 <- seq(3,9,3) mm <- array("", dim=c(nrow(dd2),ncol=nmeas), dimnames = list(rownames(dd2), c("Specificity","Sensitivity","Accuracy"))) dd2 <- formatC( round(dd2,3), format="f", digits=3) for (i in 1:nlab) { for (j in 1:nmeas) { mm[,j] <- paste0(dd2[,kk1[j],i]," (", dd2[,kk2[j],i],", ",dd2[,kk3[j],i],")") fn <- paste0(odir,Sys.Date(),"_",sname,"-",labels[i],"-pretty.txt") write.table(mm,fn,sep=",") } } dd2 <- measures.ua sname <- "unassisted" mm <- array("", dim=c(nrow(dd2),ncol=nmeas), dimnames = list(rownames(dd2), c("Specificity","Sensitivity","Accuracy"))) dd2 <- formatC( round(dd2,3), format="f", digits=3) for (i in 1:nlab) { for (j in 1:nmeas) { mm[,j] <- paste0(dd2[,kk1[j],i]," (", dd2[,kk2[j],i],", ",dd2[,kk3[j],i],")") fn <- paste0(odir,Sys.Date(),"_",sname,"-",labels[i],"-pretty.txt") write.table(mm,fn,sep=",") } } dd2 <- measures.diff sname <- "difference" mm <- array("", dim=c(nrow(dd2),ncol=nmeas), dimnames = list(rownames(dd2), c("Specificity","Sensitivity","Accuracy"))) dd2 <- formatC( round(dd2,3), format="f", digits=3) for (i in 1:nlab) { for (j in 1:nmeas) { mm[,j] <- paste0(dd2[,kk1[j],i]," (", dd2[,kk2[j],i],", ",dd2[,kk3[j],i],")") fn <- paste0(odir,Sys.Date(),"_",sname,"-",labels[i],"-pretty.txt") write.table(mm,fn,sep=",") } } fn <- paste0(odir,Sys.Date(),"_differences.csv") write.csv(round(measures.diff[1:9,kk1,],3),fn) ``` calculate one-sided one sample t-test on the differences H0: diff <= 0 and Ha: diff > 0 ```{r} vv <- measures.diff[,kk1,] pval <- pval.rads <- array(NA, c(3,3), dimnames = list(labels,cnames[kk1])) for (i in 1:nlab) { for (j in 1:3) { d <- vv[1:(nrads+nsur),j,i] pval[i,j] <- t.test(d, alternative = "greater")$p.value d <- vv[1:(nrads),j,i] pval.rads[i,j] <- t.test(d, alternative = "greater")$p.value } } # round(pval,3) ``` Use Pearson's chi-squared prop.test to compare radiologist macro-average performance to the model ```{r} prop.p <- array(NA, dim=c(nlab,3), dimnames=list(labels,colnames(pval))) for (i in 1:nlab) { dd <- data.frame(gt=struth[,i],prediction=sdf.ua[,i]) ddm <- data.frame(gt=df[,paste0("msk_",labels[i])], prediction=as.numeric(mdf[,i]>=0.5)) TP <- .tp(gt = dd$gt, prediction=dd$prediction) TN <- .tn(gt = dd$gt, prediction=dd$prediction) FP <- .fp(gt = dd$gt, prediction=dd$prediction) FN <- .fn(gt = dd$gt, prediction=dd$prediction) TPm <- .tp(gt = ddm$gt, prediction=ddm$prediction) TNm <- .tn(gt = ddm$gt, prediction=ddm$prediction) FPm <- .fp(gt = ddm$gt, prediction=ddm$prediction) FNm <- .fn(gt = ddm$gt, prediction=ddm$prediction) # prop tests x <- c(TN,TNm); n <- c(TN+FP, TNm+FPm) # spec prop.p[i,1] <- prop.test(x,n, correct = T)$p.value x <- c(TP,TPm); n <- c(TP+FN, TPm+FNm) # sens prop.p[i,2] <- prop.test(x,n, correct = T)$p.value x <- c(TP+TN,TPm+TNm); n <- c(nrow(dd), nrow(ddm)) # acc prop.p[i,3] <- prop.test(x,n, correct = T)$p.value } # round(prop.p,3) ``` Control for multiple testing - Adjust for FDR = 0.05 ```{r} # source("http://bioconductor.org/biocLite.R") # biocLite("qvalue") library(qvalue) p <- c(pval,pval.rads,prop.p) qq <- qvalue(p, fdr.level = 0.05, lambda=0) pqnames <- c("Specificity p-value","Specificity q-value", "Sensitivity p-value","Sensitivity q-value", "Accuracy p-value", "Accuracy q-value") pq.diff <- pq.diff.rads <- array(NA, dim=c(3,6), dimnames=list(labels,pqnames)) pq.diff[,seq(1,6,2)] <- pval pq.diff[,seq(2,6,2)] <- qq$qvalues[1:9] pq.diff <- formatC( round(pq.diff,3), format="f", digits=3) # write p & q values for clinical utility fn <- paste0(odir,Sys.Date(),"_p&q-values-for-difference-all.txt") write.table(pq.diff,fn,sep=",") # sensitivity analysis - rads only pq.diff.rads[,seq(1,6,2)] <- pval.rads pq.diff.rads[,seq(2,6,2)] <- qq$qvalues[10:18] pq.diff.rads <- formatC( round(pq.diff.rads,3), format="f", digits=3) # write p & q values for clinical utility fn <- paste0(odir,Sys.Date(),"_p&q-values-for-difference-rads.txt") write.table(pq.diff.rads,fn,sep=",") pq.diff # write p & q values for model vs radiologists performance pq.prop <- array(NA, dim=c(3,6), dimnames=list(labels,pqnames)) pq.prop[,seq(1,6,2)] <- prop.p pq.prop[,seq(2,6,2)] <- qq$qvalues[19:27] round(pq.prop,3) pq.prop <- formatC( round(pq.prop,3), format="f", digits=3) fn <- paste0(odir,Sys.Date(),"_p&q-values-for-model-comparison.txt") write.table(pq.prop,fn,sep=",") ``` Calculate AUCs and 95% CIs for the model ```{r} auc <- array(NA, dim=c(nlab,3), dimnames = list(labels,c("AUC","AUC Lower","AUC Upper"))) for (i in 1:nlab) { gt <- df[,paste0("msk_",labels[i])] rr <- pROC::roc(response=gt,predictor=mdf[,labels[i]]) auc[i,1] <- rr$auc[1] auc[i,2:3] <- pROC::ci.auc(rr,method="delong")[c(1,3)] } round(auc,3) auc <- formatC( round(auc,3), format="f", digits=3) out <- auc out[,1] <- paste0(out[,1]," (",out[,2],", ",out[,3],")") fn <- paste0(odir,Sys.Date(),"_AUC.txt") write.table(out[,1],fn) ```