###################################################################### ########## Worked example (Section 7 of main paper) ########## ###################################################################### # Load required packages library("pROC"); library("sn");library("MASS"); require(pracma) #Required functions invlogit <- function(x){exp(x)/(1+exp(x))} ###################################################################### # Precision-based sample size calculation # Input parameters # Required standard error of the C-statistic, calibration in # the large and calibration slope to be 0.025, 0.15 and 0.15, # respectively varc <- 0.025^2 varcs <- 0.15^2 varcl <- 0.15^2 #Anticipated population values for C and p p <- 0.057 c <- 0.77 ###################################################################### # C-statistic - Approximation ############################## N <- (c-2*T.Owen(-qnorm(c),1/sqrt(3))-c^2)/((p-p^2 )*varc) N <- ceiling(N) events <- ceiling(N*p) N_c_app <-N events_c_app <-events N_c_app; events_c_app ############################################################################## # C-statistic - Numerical Integration (Assuming marginal Normality) ################################# # For the numerical integration method a distribution for the linear predictor # needs to be assumed. In this calculation we assume marginal normality for # the linear predictor #f1 and f2 are functions that contain the integrad in equation (5) and (6) #as given by Gail and Pfeiffer (2005) f1<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2)) * (1+exp(-x))^-1 f0<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))* (1-(1+exp(-x))^-1) # The following functions are used to calculate P(eta_00.00001) d11 <- subset(d1, p_eta1<(1-0.00001) & p_eta1>0.00001) a<-d01$x[1] b<-d11$x[nrow(d11)] c(a,b) ## Actual numerical integration starts here step<-(0.0001) x<-seq(a,b,step) p_eta0<-NULL p_eta1<-NULL #Numerical integration to get P(eta_01E05 # p,c = anticipated values for C statistic, outcome prevalence # fc = inflation factor to fine tune sigma^2 for high values of # anticipated to give the target. Suggested values: # fc=1.0 for C<0.7, # fc=1.00-1.01 for C=0.70 to 0.80 # fc=1.01-1.03 for C=0.80 to 0.85 # fc=1.04-1.06 for C=85 to 0.90 #See details for fc in Supplementary Material 1 # Output # The true value of p and C that correspond to the values of values # of and sigma^2 given in equations (7) and (8) meas_true<- function(nevents,p,c,fc=1) { n<-nevents/p sigma_c <- sqrt(2)*qnorm(c)*fc mu<-0.5*(2*p-1)*(sigma_c^2)+log(p/(1-p)) sigma<-sqrt((sigma_c^2)*(1+p*(1-p)*(sigma_c^2))) eta <- rnorm(n,mu,sigma) p_est <- invlogit(eta) y <- rbinom(n,1,p_est) n1<-sum(y) n0<-n-n1 cstat <- roc(y,eta,quiet=TRUE,ci=FALSE) c_est <- as.vector(cstat$auc) out<-c(mean(y),c_est,mu,sigma) round(out,3) } # Example to check true values for high C using these equations # For very high c the value of fc can be adapted to match more # closely the require true value of C # and extract the corresponding values of mu and sigma to # be used in the sample size equations # True value of C and p based on equations (7) and (8) true <- meas_true(300000,p=0.1,c=0.85,fc=1); c_actual<-true[2]; p_actual <-true[1] c_actual; p_actual # Thes values of mu=-3.07 and sigma=1.601 correspond to a # slightly smaller c (0.843) than the required # anticipated. # Fine tuning with the factor fc=1.03 gives the value of actual c # closer to the one required. true <- meas_true(300000,p=0.1,c=0.85,fc=1.03) c_actual<-true[2]; p_actual <-true[1] mu<-true[3]; sigma<-true[4] c_actual; p_actual; mu;sigma # So, instead of mu=-3.07 and sigma=1.601 as given by (7) and (8) # mu=-3.109 and sigma=1.657 can be used instead for the methods that require # an input value for mu and sigma. # However, the effect on the sample size calculations will be negligible # unless the required C is 0.9 or higher, so equations (7) and (8) will be # generally appropriate. ################################################################################ #R code for the DGM1 #The linear predictor is conditionally Normally distributed given Y with the corresponding variances being equal. Data are generated from a LDA model under Assumption 2. library("readstata13");library("pROC");library("robustbase");library("scoring") library("rms");library("EnvStats");library("safeBinaryRegression");library("MASS") library("Matrix");library("speedglm");library("BaylorEdPsych");library("sn") library("pracma");require(ggplot2);require(gridExtra);library("openxlsx") #Compare the empirical SE and approx. SE of performance measures invlogit <- function(x){exp(x)/(1+exp(x))} variancetest<- function(nevents,p,truec,truecs) { sigma <- sqrt(2)*qnorm(c) n1 <- nevents n0 <- round(round(nevents/p)-nevents) n<-n0+n1 mu1 <- (sigma^2)/2 + log(n1/n0) mu0 <- mu1-sigma^2 eta0 <- rnorm(n0,mu0,sigma) eta1 <- rnorm(n1,mu1,sigma) eta <- c(eta0,eta1) pa_val <- invlogit(eta) y <- c(rep(0,n0), rep(1,n1)) cc <- roc(y,eta,quiet=TRUE,ci=TRUE) c <- as.vector(cc$auc) css <- speedglm(y ~ eta, family=binomial(link='logit')) cs <- as.numeric(coef(css))[2] out<-c(round(p,2),truec,truecs,c,cs) out } Nsim <- 10000 res<-NULL for (p in c(0.05,0.1,0.3)){ for (c in c(0.64,0.72,0.8,0.85,0.9)){ print(c(p,c)) for (nevents in c(50,100,200,400)){ a<-matrix(NA,nrow=Nsim, ncol=5) for (i in 1:Nsim){ set.seed(i) a[i,]<-variancetest(nevents,p,c,1) } var_emp<- apply(a,2,var,na.rm=TRUE)[4:5] n<-nevents/p var_emp_c=var_emp[1] var_emp_cs=var_emp[2] A <- 2*p*(1-p)*qnorm(c)^2 var_app_cs <-1/(A*n)+2/(n-2) var_app_c=((c-2*T.Owen(-qnorm(c),1/sqrt(3)))-c^2)/(n*p-n*p^2) a_sum <- c(p,c,nevents,sqrt(var_emp_c), sqrt(var_app_c),sqrt(var_app_c)/sqrt(var_emp_c), sqrt(var_emp_cs), sqrt(var_app_cs),sqrt(var_app_cs)/sqrt(var_emp_cs) ) res<-rbind(res, a_sum) }}} res_se <- res res_se<-data.frame(res_se) colnames(res_se) <- c("p","c","n_events","se_emp_c","se_app_c","se_app_c/se_emp_c","se_emp_cs","se_app_cs","se_app_cs/se_emp_cs") View(res_se) #R code for DGM3 invlogit <- function(x){exp(x)/(1+exp(x))} ################################################################################################ ############compare empirical SE vs Approx. SE of performance measures########################## ################################################################################################ meas_true<- function(nevents,p,c,fc) { n <- nevents/p sigmain <- sqrt(2)*qnorm(c)*fc mu <-0.5*(2*p-1)*(sigmain^2)+log(p/(1-p)) sigma <-sqrt((sigmain^2)*(1+p*(1-p)*(sigmain^2))) eta <- rnorm(n,mu,sigma) pi <- invlogit(eta) y <- rbinom(n,1,pi) n1 <- sum(y) n0 <- n-n1 cstat <- roc(y,eta,quiet=TRUE,ci=FALSE) c_est <- as.vector(cstat$auc) sigma0<-sd(eta[y==0]); sigma1<-sd(eta[y==1]) sigma0<-round(sigma0,2) sigma1<-round(sigma1,2) out<-c(mean(y),c_est,mu,sigma,round(sigma0,2),round(sigma1,2)) round(out,3) } meas<- function(nevents,p,c,fc) { n<-nevents/p sigmain <- sqrt(2)*qnorm(c)*fc mu <-0.5*(2*p-1)*(sigmain^2)+log(p/(1-p)) sigma <-sqrt((sigmain^2)*(1+p*(1-p)*(sigmain^2))) eta <- rnorm(n,mu,sigma) pi <- invlogit(eta) y <- rbinom(n,1,pi) cstat <- roc(y,eta,quiet=TRUE,ci=FALSE) c <- as.vector(cstat$auc) cs_mod <- speedglm(y ~ eta, family=binomial(link='logit')) cs <- as.numeric(coef(cs_mod))[2] cl_mod <- speedglm(y ~ offset(eta), family=binomial(link='logit')) cl <- as.numeric(coef(cl_mod))[1] out<-c(mean(y),c,cs,cl) out } Nsim <-10000 res<-NULL for (p in c(0.05,0.1,0.3)){ for (c in c(0.64, 0.72, 0.8, 0.85, 0.9)){ fc=1 if (c==0.8) fc=1.01 if (p<0.3 & c==0.85) fc=1.03 if (p<0.3 & c==0.9) fc=1.05 if (p==0.3 & c==0.85) fc=1.02 if (p==0.3 & c==0.9) fc=1.04 for (nevents in c(50, 100, 200,400)){ print(c(p,c,nevents)) n<-nevents/p a<-replicate(Nsim, meas(nevents,p=p,c=c,fc=fc) ) a<-t(a) true <- meas_true(300000,p,c,fc=fc) p_true <- true[1]; c_true <- true[2] mu_true <- true[3];sigma_true<- true[4] var_emp<- apply(a,2,var,na.rm=TRUE)[2:4] var_emp_c=var_emp[1] #empirical SE of C var_emp_cs=var_emp[2] #empirical SE of CS var_emp_cl=var_emp[3] #empirical SE of CSL #approx. SE of C-statistics using NI mu <-mu_true sigma<-sigma_true g<-NULL; k<-NULL; x<-NULL f1<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2)) * (1+exp(-x))^-1 f0<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))* (1-(1+exp(-x))^-1) x<-seq(-12,7,0.0005) p_eta0<-NULL p_eta1<-NULL for (i in 1: length(x) ) { p_eta1[i]<-integrate(f1, -Inf, x[i], mu=mu, s=sigma)$value/integrate(f1, -Inf, Inf, mu=mu, s=sigma)$value p_eta0[i]<-integrate(f0, -Inf, x[i], mu=mu, s=sigma)$value/integrate(f0, -Inf, Inf, mu=mu, s=sigma)$value #print(i) } p_eta0[1]=0; p_eta0[length(p_eta0)]=1 p_eta1[1]=0; p_eta1[length(p_eta1)]=1 d0<-cbind(x,p_eta0) View(d0) d1<-cbind(x,p_eta1) View(d1) u<-sort(runif(50000)) eta0<-interp1(x=p_eta0,y=x,xi=u) hist(eta0) u<-sort(runif(50000)) eta1<-interp1(x=p_eta1,y=x,xi=u) hist(eta1) #Prob eta0tol & i<10){ #set.seed(i) if (abs(d1)>abs(d2) & sign(d1)!=sign(d2)) n1<-(n1+n2)/2 if (abs(d1)n2) {n1<-n1+5; n2<-n2-5} if (sign(d1)==sign(d2) & n10.00001) d11 <- subset(d1, p_eta1<(1-0.00001) & p_eta1>0.00001) a<-d01$x[1] b<-d11$x[nrow(d11)] step<-(0.00005) x<-seq(a,b,step) p_eta0<-NULL p_eta1<-NULL for (i in 1: length(x) ) { p_eta1[i]<-integrate(f1, -Inf, x[i], mu=mu,subdivisions = 10000L, s=sigma)$value/integrate(f1, -Inf, Inf, subdivisions = 10000L,mu=mu, s=sigma)$value p_eta0[i]<-integrate(f0, -Inf, x[i], mu=mu,subdivisions = 10000L, s=sigma)$value/integrate(f0, -Inf, Inf, subdivisions = 10000L,mu=mu, s=sigma)$value #print(i) } p_eta0[1]=0 ; p_eta1[1]=0 p_eta0[length(x)]=1 ; p_eta1[length(x)]=1 #### u<-sort(runif(1000000)) eta0<-interp1(x=p_eta0,y=x,xi=u) #hist(eta0) u<-sort(runif(1000000)) eta1<-interp1(x=p_eta1,y=x,xi=u) #hist(eta1) intf0<-function(upper) { prob<-NULL for (i in 1:length(upper)){ prob[i]<-integrate(f0, -Inf, upper=upper[i], mu=mu, s=sigma)$value/integrate(f0, -Inf, Inf, mu=mu, s=sigma)$value } prob} intf1<-function(upper) { prob<-NULL for (i in 1:length(upper)){ prob[i]<-integrate(f1, -Inf, upper=upper[i], mu=mu, s=sigma)$value/integrate(f1, -Inf, Inf, mu=mu, s=sigma)$value } prob} prob<-intf0(eta1) E_K2<-mean(prob^2);E_K2 prob<-intf1(eta0) E_G2<-mean((1-prob)^2); c_ni <- 1-mean(prob) p_ni <- integrate(f1, -Inf, Inf, mu=mu, s=sigma)$value event_req_ni <- ((1-p_ni)*E_K2+p_ni*E_G2-c_ni^2) /(se_true^2*p_ni*(1-p_ni))*p_ni events_req_app <- ceiling((c_ni-2*T.Owen(-qnorm(c_ni),1/sqrt(3))-c_ni^2)/((p_ni-p_ni^2)*se_true^2)*p_ni) a_sum <- c(p,c,se_true, events_req, events_req_app, events_req_app/events_req, event_req_ni ,event_req_ni/events_req) req_event_c <-rbind(req_event_c, a_sum) print(c) }}}} req_event_c <-data.frame(req_event_c) colnames(req_event_c) <- c("p","c","se_true", "events_req", "events_req_app", "events_req_app/events_req", "events_req_ni", "events_req_ni/events_req") ########################################################################################################## #approx. No of events vs true No of events required to achieve the SE of Calibration slope = 0.05,0.10,0.15 cal<- function(nevents,p,c) { fc=1 if (c==0.8) fc=1.01 if (p<0.3 & c==0.85) fc=1.03 if (p<0.3 & c==0.9) fc=1.05 if (p==0.3 & c==0.85) fc=1.02 if (p==0.3 & c==0.9) fc=1.04 n <- nevents/p sigmain <- sqrt(2)*qnorm(c)*fc mu <- 0.5*(2*p-1)*(sigmain^2)+log(p/(1-p)) sigma <- sqrt((sigmain^2)*(1+p*(1-p)*(sigmain^2))) eta <- rnorm(n,mu,sigma) prob <- invlogit(eta) y <- rbinom(n,1,prob) cs <- coef(speedglm(y ~ eta, family=binomial(link='logit')))[2] cs } find_events_req_cs <- function(se_true,p,c,nstart1,nstart2,nstart_sim, tol=0.001){ #initial values n1<-nstart1 n2<-nstart2 nsim<-nstart_sim #set.seed(0) se1<-sd(replicate(nsim, cal(n1,p,c) )) se2<-sd(replicate(nsim, cal(n2,p,c) )) se1;se2 d1<-(se1-se_true) d2<-(se2-se_true) d1;d2 i<-1 if (abs(d1)tol){ #set.seed(i) if (abs(d1)>abs(d2) & sign(d1)!=sign(d2)) n1<-(n1+n2)/2 if (abs(d1)n2) {n1<-n1+5; n2<-n2-5} if (sign(d1)==sign(d2) & n1 0.001) { if (d1 > d2) { n1 <- ceiling((n1+n2)/2) se1 <- sd(replicate(15000,cs_in_l(n1,p,c))) d1 <- abs(se_true -se1) d <- d1 events_req <- n1 se_emp <- se1 } else{ n2 <- ceiling((n1+n2)/2) se2 <- sd(replicate(15000,cs_in_l(n2,p,c))) d2 <- abs(se_true -se2) d <- d2 events_req <- n2 se_emp <- se2 } print(i) i = i+1 events_req se_emp } a_sum <- c(p,c,se_true, events_req, events_req_app,events_req_app_ni,events_req_app/events_req,events_req_app_ni/events_req ,se_emp) req_event_csl<-data.frame(rbind(req_event_csl, a_sum)) print(c) }}} colnames(req_event_csl) <- c("p","c","se_true", "events_req", "events_req_app", "events_req_app_ni", "events_req_app/events_req", "events_req_app_ni/events_req", "se_emp_cs", "se_app") ######################################################################################################################################################################################### ########################################################Power and type 1 error########################################################################################################### ######################################################################################################################################################################################### power_c <- function(p0=for_c0[1],p1=for_c1[1],c0=for_c0[2],c1=for_c1[2],mu=for_c1[3], sigma=for_c1[4],alpha=0.05,beta=0.1) { #alpha = type I error, significance level #beta = type-II error, 1-beta=power #H0: c=c0 #H1: c=c1 d=c1-c0 sd_c0=sqrt((c0-2*T.Owen(-qnorm(c0),1/sqrt(3))-c0^2) /(p0-p0^2)) sd_c1=sqrt((c1-2*T.Owen(-qnorm(c1),1/sqrt(3))-c1^2) /(p1-p1^2)) #Formula for ss calculation n <- ceiling(((qnorm(1-alpha)*sd_c0 + qnorm(1-beta)*sd_c1))^2/d^2) ;n #Generate data under the alternative hypothesis eta <- rnorm(n,mu,sigma) y <- rbinom(n,1,invlogit(eta)) c <- roc(y,eta,quiet=TRUE,ci=TRUE) c_est <- as.vector(c$auc) se_c <- (c$ci[3]-c_est)/qnorm(0.975) c_est+qnorm(1-alpha) *se_c cov_c <- ifelse( (c_est-qnorm(1-alpha) *se_c) >= c0 , 1, 0) n1<-ceiling(n*p0) out<-c(mean(y),c0,c1,c_est,n1,cov_c) out } type1_error_c<- function(p0=for_c0[1],p1=for_c1[1],c0=for_c0[2],c1=for_c1[2],mu=for_c0[3], sigma=for_c0[4],alpha=0.05,beta=0.1) { #alpha = type I error, significance level #beta = type-II error, 1-beta=power #H0: c=c0 #H1: c=c1 d=c1-c0 sd_c0=sqrt((c0-2*T.Owen(-qnorm(c0),1/sqrt(3))-c0^2) /(p0-p0^2)) sd_c1=sqrt((c1-2*T.Owen(-qnorm(c1),1/sqrt(3))-c1^2) /(p1-p1^2)) n <- ceiling(((qnorm(1-alpha)*sd_c0 + qnorm(1-beta)*sd_c1))^2/d^2) ;n n1<-ceiling(n*p0) #Generate data under the NULL hypothesis eta <- rnorm(n,mu,sigma) y <- rbinom(n,1,invlogit(eta)) c <- roc(y,eta,quiet=TRUE,ci=TRUE) c_est <- as.vector(c$auc) se_c <- (c$ci[3]-c_est)/qnorm(0.975) cov_c <- ifelse( (c_est+qnorm(1-alpha) *se_c) <= c0 , 1, 0) out <- c(mean(y),c0,c1,c_est,n1,cov_c) out } ################# Nsim <- 100000 res<-NULL for (dc in c(0.03, 0.05)){ for (p in c(0.05,0.1,0.3)){ for (c in c(0.64, 0.72, 0.8, 0.85)){ print(c(p,c)) for_c0<- meas_true(2000000,p,c,fc=1) for_c1<- meas_true(2000000,p,c+dc,fc=1) a<-replicate(Nsim, type1_error_c(p0=for_c0[1],p1=for_c1[1],c0=for_c0[2],c1=for_c1[2],mu=for_c0[3], sigma=for_c0[4],alpha=0.05,beta=0.1) ); a<-t(a) type1_error_n<-round(colMeans(a),3);type1_error_n b<-replicate(Nsim, power_c(p0=for_c0[1],p1=for_c1[1],c0=for_c0[2],c1=for_c1[2],mu=for_c1[3], sigma=for_c1[4],alpha=0.05,beta=0.1) ); b<-t(b) power_n<-round(colMeans(b),3);power_n a_sum <- c(dc,p,c,c+dc,type1_error_n[5:6],power_n[6]) res<-rbind(res, a_sum) }}} res_power <- res res_power<-data.frame(res_power) colnames(res_power) <- c("Difference","p","c0","c1","nevents", "alpha","power") View(res_power)