######################################################################

##########  Worked example  (Section 7 of main paper)       ##########

######################################################################

# Load required packages

library("pROC"); library("sn");library("MASS"); require(pracma)


#Required functions

invlogit    <- function(x){exp(x)/(1+exp(x))}

######################################################################


# Precision-based sample size calculation

# Input parameters 


# Required standard error of the C-statistic, calibration in 
# the large and calibration slope to be 0.025, 0.15 and 0.15,
# respectively

varc   <- 0.025^2
varcs  <- 0.15^2
varcl  <- 0.15^2

#Anticipated population values for C and p

p <- 0.057
c <- 0.77

######################################################################

# C-statistic - Approximation 

##############################

N            <- (c-2*T.Owen(-qnorm(c),1/sqrt(3))-c^2)/((p-p^2 )*varc) 
N            <- ceiling(N) 
events       <- ceiling(N*p)
N_c_app      <-N
events_c_app <-events
N_c_app; events_c_app

##############################################################################

# C-statistic - Numerical Integration (Assuming marginal Normality)

#################################

# For the numerical integration method a distribution for the linear predictor 
# needs to be assumed. In this calculation we assume marginal normality for
# the linear predictor

#f1 and f2 are functions that contain the integrad in equation (5) and (6)
#as given by Gail and Pfeiffer (2005)
f1<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2)) * (1+exp(-x))^-1
f0<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))* (1-(1+exp(-x))^-1)

# The following functions are used to calculate P(eta_0<eta_1) and 
# P(eta_1<eta_0)

intf0<-function(upper) {
  prob<-NULL
  for (i in 1:length(upper)){
    prob[i]<-integrate(f0, -Inf, upper=upper[i], mu=mu, s=sigma)$value/integrate(f0, -Inf, Inf, mu=mu, s=sigma)$value
  }
  prob}

intf1<-function(upper) {
  prob<-NULL
  for (i in 1:length(upper)){
    prob[i]<-integrate(f1, -Inf, upper=upper[i], mu=mu, s=sigma)$value/integrate(f1, -Inf, Inf, mu=mu, s=sigma)$value
  }
  prob}

# Input values for mu and sigma assuming marginal normality
# Factor fc can be used to fine-tuned this when C is high
# See small section of code after the worked example below

fc=1.00
sigma_c <- sqrt(2)*qnorm(c)*fc
mu<-0.5*(2*p-1)*(sigma_c^2)+log(p/(1-p))
sigma<-sqrt((sigma_c^2)*(1+p*(1-p)*(sigma_c^2)))

#We now get the cumulative distribution of eta_0 and eta1
# A vector of values to cover the range of possible values of eta_0 
#and eta_0

#Pre-processing to narrow down the integration range

x<-seq(-14,14,0.01)
length(x)

p_eta0<-NULL ; p_eta1<-NULL

#Numerical integration to get P(eta_0<x) and P(eta_1<x)
#Equations for Gail and Pfeiffer (2005)

for (i in 1: length(x) ) {
  p_eta1[i]<-integrate(f1, -Inf, x[i], mu=mu,subdivisions = 1000L, s=sigma)$value/integrate(f1, -Inf, Inf, subdivisions = 1000L,mu=mu, s=sigma)$value
  p_eta0[i]<-integrate(f0, -Inf, x[i], mu=mu,subdivisions = 1000L, s=sigma)$value/integrate(f0, -Inf, Inf, subdivisions = 1000L,mu=mu, s=sigma)$value
  #print(i)
}

d0<-data.frame(cbind(x,p_eta0))
d1<-data.frame(cbind(x,p_eta1))

View(d1)


d01 <- subset(d0, p_eta0<(1-0.00001) & p_eta0>0.00001)
d11 <- subset(d1, p_eta1<(1-0.00001) & p_eta1>0.00001)

a<-d01$x[1]
b<-d11$x[nrow(d11)]

c(a,b)

## Actual numerical integration starts here

step<-(0.0001)

x<-seq(a,b,step)

p_eta0<-NULL
p_eta1<-NULL

#Numerical integration to get P(eta_0<x) and P(eta_1<x)
#Equations for Gail and Pfeiffer

for (i in 1: length(x) ) {
  p_eta1[i]<-integrate(f1, -Inf, x[i], mu=mu, s=sigma)$value/integrate(f1, -Inf, Inf, mu=mu, s=sigma)$value
  p_eta0[i]<-integrate(f0, -Inf, x[i], mu=mu, s=sigma)$value/integrate(f0, -Inf, Inf, mu=mu, s=sigma)$value
  #print(i)
}

p_eta0[1]=0 ; p_eta1[1]=0 
p_eta0[length(x)]=1 ; p_eta1[length(x)]=1 

# Inverse CDF to sample from the distribution of eta_0 and eta_1
u<-sort(runif(1000000))
eta0<-interp1(x=p_eta0,y=x,xi=u)
hist(eta0)

u<-sort(runif(1000000))
eta1<-interp1(x=p_eta1,y=x,xi=u)
hist(eta1) # Distribution of eta_0 and eta_1 is approximately normal, as expected


# Compute EK=E(P(eta_0<eta_1))
prob<-NULL
prob<-intf0(eta1)
E_K2<-mean(prob^2)

#Calculate  the true C and p for these values of mu and sigma
c_ni      <- mean(prob)
p_ni      <- integrate(f1, -Inf, Inf, mu=mu, s=sigma)$value
c_true    <- c_ni 
p_true    <- p_ni

# Compute EG=E(P(eta_1<eta_0))
prob<-NULL
prob<-intf1(eta0)
E_G2<-mean((1-prob)^2);


#Enter in the formula
N              <- (1/varc)*((1-p_true)*E_K2+p_true*E_G2-c_true^2) /(p_true*(1-p_true))
N              <- ceiling(N) 
events         <- ceiling(N*p)
N_c_ni         <-N
events_c_ni    <-events
N_c_ni; events_c_ni


##############################################################################

### Calibration slope - Approximation 

#################################################

A        <- 2*p*(1-p)*qnorm(c)^2
N        <- 1/varcs * (1/A+2)
N        <- ceiling(N)
events   <- ceiling(N*p)
N_cs_app <- N; events_cs_app<-events


##############################################################################

### Calibration slope - Numerical integration

#################################################

fc=1.00
sigma_c <- sqrt(2)*qnorm(c)*fc
mu<-0.5*(2*p-1)*(sigma_c^2)+log(p/(1-p))
sigma<-sqrt((sigma_c^2)*(1+p*(1-p)*(sigma_c^2)))


f_omega<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))*((1+exp(-x))^-1)*(1-(1+exp(-x))^-1)
f_omega_eta<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))*((1+exp(-x))^-1)*(1-(1+exp(-x))^-1)*x
f_omega_etasq<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))*((1+exp(-x))^-1)*(1-(1+exp(-x))^-1)*x^2

E_omega=integrate(f_omega, -Inf, Inf, mu=mu, s=sigma)$value
E_omega_eta_sq <-integrate(f_omega_etasq, -Inf, Inf, mu=mu, s=sigma)$value
E_omega_eta<-integrate(f_omega_eta, -Inf, Inf, mu=mu, s=sigma)$value

numer        <- E_omega
denom        <- E_omega *E_omega_eta_sq-E_omega_eta^2
N            <- 1/varcs * (numer/denom)
N            <- ceiling(N)
events       <- ceiling(N*p)
N_cs_ni      <- N; 
events_cs_ni <- events

N_cs_ni; N_cs_app
events_cs_ni;events_cs_app

##############################################################################

### Calibration in the large - Approximation 

#################################################

fc=1.00
sigma_c <- sqrt(2)*qnorm(c)*fc
mu<-0.5*(2*p-1)*(sigma_c^2)+log(p/(1-p))
sigma<-sqrt((sigma_c^2)*(1+p*(1-p)*(sigma_c^2)))

# Calibration in the large - Approximation (Formula 16 of main paper)

p_tilde       <-  exp(mu)/(1+exp(mu))
term1m        <-  p_tilde*(1-p_tilde)
term2m        <-  (1/2)*(1-6*p_tilde+6*p_tilde^2)*p_tilde*(1-p_tilde)*sigma^2
N             <-  1/(varcl*(term1m+term2m))
N             <-  ceiling(N)
events        <-  ceiling(N*p)
N_cl_app      <-  N; 
events_cl_app <-  events


##############################################################################

### Calibration in the large - numerical integration 

####################################################

N            <- 1/varcs * (1/E_omega)
N            <- ceiling(N)
events       <- ceiling(N*p)
N_cl_ni      <- N; 
events_cl_ni <- events

N_cl_ni; N_cl_app
events_cl_ni;events_cl_app

###########################################################################

### Summarise results for the worked example

####################################################

ss <- c(N_c_app, N_c_ni, N_cs_app, N_cs_ni, N_cl_app, N_cl_ni)
ev <- c(events_c_app, events_c_ni, events_cs_app, events_cs_ni, events_cl_app, events_cl_ni)

res            <- as.data.frame(rbind(ss,ev))
names(res)     <- c("C-stat(app)", "C-stat(ni)", "Cal. Slope (app)", 
                    "Cal. Slope (ni)",  "Cal. large (app)", "Cal. large (ni)")
row.names(res) <- c("Sample Size", "Events")
View(res)

####################################################

# Functions to compute the actual C-statistic and outcome prevalence for mu 
# and sigma^2 given in equations (7) and (8) of main paper
# under the assumption of marginal normality

# Arguments

# nevents= number of events. nevents>1E05
# p,c    = anticipated values for C statistic, outcome prevalence
# fc     = inflation factor to fine tune sigma^2 for high values of
#          anticipated to give the target. Suggested values: 
# fc=1.0 for C<0.7, 
# fc=1.00-1.01 for C=0.70 to 0.80 
# fc=1.01-1.03 for C=0.80 to 0.85 
# fc=1.04-1.06 for C=85 to 0.90 
#See details for fc in Supplementary Material 1

# Output 
# The true value of p and C that correspond to the values of  values
# of and sigma^2 given in equations (7) and (8)

meas_true<- function(nevents,p,c,fc=1) {
  n<-nevents/p
  sigma_c <- sqrt(2)*qnorm(c)*fc
  
  mu<-0.5*(2*p-1)*(sigma_c^2)+log(p/(1-p))
  sigma<-sqrt((sigma_c^2)*(1+p*(1-p)*(sigma_c^2)))
  
  eta <- rnorm(n,mu,sigma) 
  p_est <- invlogit(eta) 
  y <- rbinom(n,1,p_est)
  
  n1<-sum(y)
  n0<-n-n1
  cstat <- roc(y,eta,quiet=TRUE,ci=FALSE)
  c_est <- as.vector(cstat$auc)
  
  out<-c(mean(y),c_est,mu,sigma)
  round(out,3)
}

# Example to check true values for high C using these equations
# For very high c the value of fc can be adapted to match more
# closely the require true value of C 
# and extract the corresponding values of mu and sigma to
# be used in the sample size equations

# True value of C and p based on equations (7) and (8)

true <- meas_true(300000,p=0.1,c=0.85,fc=1); 
c_actual<-true[2]; p_actual <-true[1]
c_actual; p_actual

# Thes values of mu=-3.07 and sigma=1.601 correspond to a 
# slightly smaller c (0.843) than the required 
# anticipated. 

# Fine tuning with the factor fc=1.03 gives the value of actual c
#  closer to the one required.

true <- meas_true(300000,p=0.1,c=0.85,fc=1.03)
c_actual<-true[2]; p_actual <-true[1]
mu<-true[3]; sigma<-true[4]
c_actual; p_actual; mu;sigma


# So, instead of mu=-3.07  and sigma=1.601 as given by (7)  and (8)
# mu=-3.109 and sigma=1.657 can be used instead for the methods that require 
# an input value for mu and sigma. 
# However, the effect on the sample size calculations will be negligible
# unless  the required C is 0.9 or higher, so equations (7) and (8) will be
# generally appropriate.


################################################################################


#R code for the DGM1
#The linear predictor is conditionally Normally distributed given Y with the corresponding variances being equal. Data are generated from a LDA model under Assumption 2. 

library("readstata13");library("pROC");library("robustbase");library("scoring")
library("rms");library("EnvStats");library("safeBinaryRegression");library("MASS")
library("Matrix");library("speedglm");library("BaylorEdPsych");library("sn")
library("pracma");require(ggplot2);require(gridExtra);library("openxlsx")

#Compare the empirical SE and approx. SE of performance measures 

invlogit    <- function(x){exp(x)/(1+exp(x))}

variancetest<- function(nevents,p,truec,truecs) {


  sigma <- sqrt(2)*qnorm(c)
  
  n1 <- nevents
  n0 <- round(round(nevents/p)-nevents)
  n<-n0+n1

  mu1 <- (sigma^2)/2 + log(n1/n0)
  mu0 <- mu1-sigma^2
  
  eta0 <- rnorm(n0,mu0,sigma) 
  eta1 <- rnorm(n1,mu1,sigma) 
  
  eta    <- c(eta0,eta1)
  pa_val <- invlogit(eta) 
  y  <- c(rep(0,n0), rep(1,n1))

  cc <- roc(y,eta,quiet=TRUE,ci=TRUE)
  c <- as.vector(cc$auc)
  
  css    <- speedglm(y ~ eta, family=binomial(link='logit')) 
  cs  <- as.numeric(coef(css))[2]
  

  out<-c(round(p,2),truec,truecs,c,cs)
  out
  
}


Nsim <- 10000

res<-NULL


for (p in c(0.05,0.1,0.3)){
  
  for (c in c(0.64,0.72,0.8,0.85,0.9)){
    
    print(c(p,c))
    
    for (nevents in c(50,100,200,400)){
      
a<-matrix(NA,nrow=Nsim, ncol=5)

for (i in 1:Nsim){
  
set.seed(i)  
  
a[i,]<-variancetest(nevents,p,c,1)
  
}

var_emp<- apply(a,2,var,na.rm=TRUE)[4:5]
n<-nevents/p
var_emp_c=var_emp[1]
var_emp_cs=var_emp[2]
A <- 2*p*(1-p)*qnorm(c)^2
var_app_cs <-1/(A*n)+2/(n-2)   
var_app_c=((c-2*T.Owen(-qnorm(c),1/sqrt(3)))-c^2)/(n*p-n*p^2) 

a_sum <- c(p,c,nevents,sqrt(var_emp_c), sqrt(var_app_c),sqrt(var_app_c)/sqrt(var_emp_c),  
           sqrt(var_emp_cs), sqrt(var_app_cs),sqrt(var_app_cs)/sqrt(var_emp_cs) )

res<-rbind(res, a_sum)
  }}}

res_se <- res

res_se<-data.frame(res_se)

colnames(res_se) <- c("p","c","n_events","se_emp_c","se_app_c","se_app_c/se_emp_c","se_emp_cs","se_app_cs","se_app_cs/se_emp_cs")
View(res_se)


#R code for DGM3
invlogit    <- function(x){exp(x)/(1+exp(x))}

################################################################################################
############compare empirical SE vs Approx. SE of performance measures##########################
################################################################################################

meas_true<- function(nevents,p,c,fc) {
  n       <- nevents/p
  sigmain <- sqrt(2)*qnorm(c)*fc
  
  mu      <-0.5*(2*p-1)*(sigmain^2)+log(p/(1-p))
  sigma   <-sqrt((sigmain^2)*(1+p*(1-p)*(sigmain^2)))
  
  eta     <- rnorm(n,mu,sigma) 
  pi      <- invlogit(eta) 
  y       <- rbinom(n,1,pi)
  
  n1      <- sum(y)
  n0      <- n-n1
  cstat <- roc(y,eta,quiet=TRUE,ci=FALSE)
  c_est <- as.vector(cstat$auc)
  sigma0<-sd(eta[y==0]); sigma1<-sd(eta[y==1])
  
  sigma0<-round(sigma0,2)
  sigma1<-round(sigma1,2)
  out<-c(mean(y),c_est,mu,sigma,round(sigma0,2),round(sigma1,2))
  round(out,3)
}


meas<- function(nevents,p,c,fc) {
  
  n<-nevents/p
  
  sigmain <- sqrt(2)*qnorm(c)*fc
   mu     <-0.5*(2*p-1)*(sigmain^2)+log(p/(1-p))
  sigma   <-sqrt((sigmain^2)*(1+p*(1-p)*(sigmain^2)))
  
  
  eta <- rnorm(n,mu,sigma) 
  pi <- invlogit(eta) 
  y  <- rbinom(n,1,pi)

  
  cstat  <- roc(y,eta,quiet=TRUE,ci=FALSE)
  c      <- as.vector(cstat$auc)
  cs_mod <- speedglm(y ~ eta, family=binomial(link='logit')) 
  cs     <- as.numeric(coef(cs_mod))[2]
  cl_mod <- speedglm(y ~ offset(eta), family=binomial(link='logit')) 
  cl     <- as.numeric(coef(cl_mod))[1]
  out<-c(mean(y),c,cs,cl)
  out
  
}


Nsim <-10000

res<-NULL

for (p in c(0.05,0.1,0.3)){
  for (c in c(0.64, 0.72, 0.8, 0.85, 0.9)){

    fc=1
    if (c==0.8)            fc=1.01
    if (p<0.3  & c==0.85)  fc=1.03
    if (p<0.3  & c==0.9)   fc=1.05
    if (p==0.3 & c==0.85)  fc=1.02
    if (p==0.3 & c==0.9)   fc=1.04
    
    for (nevents in c(50, 100, 200,400)){
      
    print(c(p,c,nevents))

    n<-nevents/p
    a<-replicate(Nsim, meas(nevents,p=p,c=c,fc=fc) )
    a<-t(a)
    

true   <- meas_true(300000,p,c,fc=fc)
p_true <- true[1]; c_true <- true[2]
mu_true <- true[3];sigma_true<- true[4]

var_emp<- apply(a,2,var,na.rm=TRUE)[2:4]

var_emp_c=var_emp[1]    #empirical SE of C
var_emp_cs=var_emp[2]   #empirical SE of CS
var_emp_cl=var_emp[3]   #empirical SE of CSL


#approx. SE of C-statistics using NI

mu   <-mu_true
sigma<-sigma_true


g<-NULL; k<-NULL; x<-NULL

f1<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2)) * (1+exp(-x))^-1

f0<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))* (1-(1+exp(-x))^-1)


x<-seq(-12,7,0.0005)

p_eta0<-NULL
p_eta1<-NULL


for (i in 1: length(x) ) {
  p_eta1[i]<-integrate(f1, -Inf, x[i], mu=mu, s=sigma)$value/integrate(f1, -Inf, Inf, mu=mu, s=sigma)$value
  p_eta0[i]<-integrate(f0, -Inf, x[i], mu=mu, s=sigma)$value/integrate(f0, -Inf, Inf, mu=mu, s=sigma)$value
  #print(i)
}

p_eta0[1]=0; p_eta0[length(p_eta0)]=1
p_eta1[1]=0; p_eta1[length(p_eta1)]=1

d0<-cbind(x,p_eta0)
View(d0)

d1<-cbind(x,p_eta1)
View(d1)

u<-sort(runif(50000))
eta0<-interp1(x=p_eta0,y=x,xi=u)
hist(eta0)

u<-sort(runif(50000))
eta1<-interp1(x=p_eta1,y=x,xi=u)
hist(eta1)


#Prob eta0<eta1, then expectation (mean)
prob<-replicate(30000,integrate(f0, -Inf, sample(eta1,1), mu=mu, s=sigma)$value/integrate(f0, -Inf, Inf, mu=mu, s=sigma)$value)
E_K2<-mean(prob^2)
#Prob eta1<eta0, then expectation (mean)
prob<-replicate(30000,integrate(f1, -Inf, sample(eta0,1), mu=mu, s=sigma)$value/integrate(f1, -Inf, Inf, mu=mu, s=sigma)$value)
E_G2<-mean((1-prob)^2)

var_c_ni  <- ((1-p_true)*E_K2+p_true*E_G2-c_true^2) /(n*p_true*(1-p_true))


#approx. SE of C-statistics using equation (9)
var_app_c   <- ((c_true-2*T.Owen(-qnorm(c_true),1/sqrt(3)))-c_true^2)/(n*p_true-n*p_true^2) 

#approx. SE of calibration slope using equation (16)
A           <- 2*p_true*(1-p_true)*qnorm(c_true)^2
var_app_cs  <-1/(A*n)+2/(n-2)   


#numerical integration

f_omega<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))*((1+exp(-x))^-1)*(1-(1+exp(-x))^-1)

f_omega_eta<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))*((1+exp(-x))^-1)*(1-(1+exp(-x))^-1)*x

f_omega_etasq<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))*((1+exp(-x))^-1)*(1-(1+exp(-x))^-1)*x^2

mean_omega=integrate(f_omega, -Inf, Inf, mu=mu, s=sigma)$value
mean_omega_eta_sq <-integrate(f_omega_etasq, -Inf, Inf, mu=mu, s=sigma)$value
mean_omega_eta<-integrate(f_omega_eta, -Inf, Inf, mu=mu, s=sigma)$value

#approx. SE of calibration slope using equation (13)
numer       <-n*mean_omega
denom       <-n* mean_omega * n*mean_omega_eta_sq  - (n^2)*mean_omega_eta^2
var_app_cs2 <- numer/denom


#approx. SE of Calibration in the large using equation (14)
pin    <-exp(mu_est)/(1+exp(mu_est))
term1m <- n* pin*(1-pin)
term2m <- n* (1/2)*(1-6*pin+6*pin^2)*pin*(1-pin)*sigma_est^2
var_app_cl <- 1/ (term1m+term2m)


#approx. SE of Calibration in the large using equation (12)
var_app_cl2 <- 1/(n*mean_omega)


a_sum <- c(round(c(p_true,c_true),3),nevents,
           sqrt(var_emp_c), sqrt(var_app_c),(sqrt(var_app_c)/sqrt(var_emp_c)-1)*100,  
           sqrt(var_c_ni),(sqrt(var_c_ni)/sqrt(var_emp_c)-1)*100, 
           sqrt(var_emp_cs), sqrt(var_app_cs),(sqrt(var_app_cs)/sqrt(var_emp_cs)-1)*100,
           sqrt(var_app_cs2),(sqrt(var_app_cs2)/sqrt(var_emp_cs)-1)*100,
           sqrt(var_emp_cl), sqrt(var_app_cl),(sqrt(var_app_cl)/sqrt(var_emp_cl)-1)*100,
           sqrt(var_app_cl2),(sqrt(var_app_cl2)/sqrt(var_emp_cl)-1)*100,
           true[5],true[6])
res<-rbind(res, a_sum)
    }}}
res_se<-res

colnames(res_se) <- c("p","c","n_events","se_emp_c","se_app_c","Bias_se_app","se_ni_c","Bias_se_ni",
                      "se_emp_cs","se_app_cs","Bias_se_app_cs",
                      "se_app_cs2","Bias_se_app_cs2",
                      "se_emp_cl","se_app_cl","Bias_se_app_cl",
                      "se_app_cl2","Bias_se_app_cl2","sigma0","sigma1")



#################################################################################################
###########################events required to achieve target SE##################################
#################################################################################################

# approx. No of events vs true No of events required to achieve the SE of C-statistic = 0.0125,0.025,0.05 
cstat<- function(nevent,p,mu,sigma) {
  n     <- nevent/p
  eta   <- rnorm(n,mu,sigma) 
  p_est <- invlogit(eta) 
  y     <- rbinom(n,1,p_est)
  
  cstat <- pROC::roc(y,eta,quiet=TRUE,ci=FALSE)
  c_est <- as.vector(cstat$auc)
}


find_events_req_c <- function(se_true,p,mu,sigma,nstart1,nstart2,nstart_sim, tol=0.00001){
  
  #initial values
  n1<-nstart1
  n2<-nstart2
  nsim<-nstart_sim
  
  #set.seed(0)
  
  se1<-sd(replicate(nsim, cstat(n1,p,mu,sigma)))
  se2<-sd(replicate(nsim, cstat(n2,p,mu,sigma)))
  se1;se2
  
  d1<-(se1-se_true)
  d2<-(se2-se_true)
  d1;d2
  
  i<-1
  
  if (abs(d1)<abs(d2)) out<-c(n1,se1) else out<-c(n2,se2)
  
  miniter<-5
  
  while (min(abs(d1),abs(d2))>tol & i<10){
    
    #set.seed(i)
    
    if (abs(d1)>abs(d2) & sign(d1)!=sign(d2)) n1<-(n1+n2)/2 
    if (abs(d1)<abs(d2) & sign(d1)!=sign(d2)) n2<-(n1+n2)/2
    if (sign(d1)==sign(d2) & n1>n2)  {n1<-n1+5; n2<-n2-5}
    if (sign(d1)==sign(d2) & n1<n2)  {n1<-n1-5; n2<-n2+5}
    
    nsimnew<-nsim
    
    se1<-sd(replicate(nsimnew, cstat(n1,p,mu,sigma) ))
    se2<-sd(replicate(nsimnew, cstat(n2,p,mu,sigma) ))
    se1;se2
    d1=se1-se_true
    d2<-se2-se_true
    i<-i+1
    print(i-1)
    if (abs(d1)<abs(d2)) out<-c(n1,se1) else out<-c(n2,se2)
  }
  
  out
}


req_event_c <-NULL

for (se_true in c(0.0125,0.025,0.05)){
    for (p in c(0.05,0.1,0.3)){
        for (c in c(0.64,0.72,0.8,0.85,0.9)){
          
          fc=1
          if (c==0.8)            fc=1.01
          if (p<0.3  & c==0.85)  fc=1.03
          if (p<0.3  & c==0.9)   fc=1.05
          if (p==0.3 & c==0.85)  fc=1.02
          if (p==0.3 & c==0.9)   fc=1.04
          
          print(c(se_true,p,c))
          
          true   <- meas_true(400000,p,c,fc=fc)
          p_true <- true[1]; c_true <- true[2]
          mu_true<-true[3];sigma_true=true[4]
          
          
          
       events_start <- ceiling((c-2*T.Owen(-qnorm(c),1/sqrt(3))-c^2)/((p-p^2)*se_true^2)*p) 
      
      
      if (c==0.64)   {nstart1 <- events_start*0.97 ; nstart2<-events_start*1.05}  
      if (c==0.72)   {nstart1 <- events_start*0.97 ; nstart2<-events_start*1.05}
      if (c==0.8)    {nstart1 <- events_start*0.97 ; nstart2<-events_start*1.05}
      if (c==0.85)   {nstart1 <- events_start*0.80 ; nstart2<-events_start*1.05}
      if (c==0.9)    {nstart1 <- events_start*0.7 ;  nstart2<-events_start*1.5}
      
      
      nstart1<-ceiling(nstart1)
      nstart2<-ceiling(nstart2)
      
      error  <- NULL
      
      a <- tryCatch({find_events_req_c(se_true,p,c,nstart1,nstart2,15000,tol=0.0001) 
      }, error=function(e){cat("error:",conditionMessage(e), "\n")})  
      if(inherits(a,"NULL")){
        error          <- error + 1
      }
      else{
        events_req <- ceiling(a[1])
        se_emp     <- a[2]
        nn         <-   events_req/p

        
   
        #####  Numerical Integration  for C ####
        mu    <- mu_true
        sigma <- sigma_true
        
        f1<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2)) * (1+exp(-x))^-1
        f0<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))* (1-(1+exp(-x))^-1)
        
        x<-seq(-14,14,0.01)
        length(x)
        
        p_eta0<-NULL
        p_eta1<-NULL
        
        
        for (i in 1: length(x) ) {
          p_eta1[i]<-integrate(f1, -Inf, x[i], mu=mu,subdivisions = 1000L, s=sigma)$value/integrate(f1, -Inf, Inf, subdivisions = 1000L,mu=mu, s=sigma)$value
          p_eta0[i]<-integrate(f0, -Inf, x[i], mu=mu,subdivisions = 1000L, s=sigma)$value/integrate(f0, -Inf, Inf, subdivisions = 1000L,mu=mu, s=sigma)$value
          #print(i)
        }
        
        d0<-data.frame(cbind(x,p_eta0))
        d1<-data.frame(cbind(x,p_eta1))
        
        d01 <- subset(d0, p_eta0<(1-0.00001) & p_eta0>0.00001)
        d11 <- subset(d1, p_eta1<(1-0.00001) & p_eta1>0.00001)
        
        a<-d01$x[1]
        b<-d11$x[nrow(d11)]

        
        
        step<-(0.00005)
        
        x<-seq(a,b,step)
        
        p_eta0<-NULL
        p_eta1<-NULL
        
        
        for (i in 1: length(x) ) {
          p_eta1[i]<-integrate(f1, -Inf, x[i], mu=mu,subdivisions = 10000L, s=sigma)$value/integrate(f1, -Inf, Inf, subdivisions = 10000L,mu=mu, s=sigma)$value
          p_eta0[i]<-integrate(f0, -Inf, x[i], mu=mu,subdivisions = 10000L, s=sigma)$value/integrate(f0, -Inf, Inf, subdivisions = 10000L,mu=mu, s=sigma)$value
          #print(i)
        }
        
        p_eta0[1]=0 ; p_eta1[1]=0 
        p_eta0[length(x)]=1 ; p_eta1[length(x)]=1 
        
        ####
        
        u<-sort(runif(1000000))
        eta0<-interp1(x=p_eta0,y=x,xi=u)
        #hist(eta0)
        
        u<-sort(runif(1000000))
        eta1<-interp1(x=p_eta1,y=x,xi=u)
        #hist(eta1)
        
        
        intf0<-function(upper) {
          prob<-NULL
          for (i in 1:length(upper)){
            prob[i]<-integrate(f0, -Inf, upper=upper[i], mu=mu, s=sigma)$value/integrate(f0, -Inf, Inf, mu=mu, s=sigma)$value
          }
          prob}
        
        
        intf1<-function(upper) {
          prob<-NULL
          for (i in 1:length(upper)){
            prob[i]<-integrate(f1, -Inf, upper=upper[i], mu=mu, s=sigma)$value/integrate(f1, -Inf, Inf, mu=mu, s=sigma)$value
          }
          prob}
        
        prob<-intf0(eta1)
        E_K2<-mean(prob^2);E_K2
        prob<-intf1(eta0)
        E_G2<-mean((1-prob)^2); 
        c_ni <- 1-mean(prob)
        p_ni <- integrate(f1, -Inf, Inf, mu=mu, s=sigma)$value
        event_req_ni    <- ((1-p_ni)*E_K2+p_ni*E_G2-c_ni^2) /(se_true^2*p_ni*(1-p_ni))*p_ni
        
        
        events_req_app <- ceiling((c_ni-2*T.Owen(-qnorm(c_ni),1/sqrt(3))-c_ni^2)/((p_ni-p_ni^2)*se_true^2)*p_ni) 
        
        a_sum <- c(p,c,se_true, events_req, events_req_app, events_req_app/events_req, event_req_ni ,event_req_ni/events_req)
        
        req_event_c <-rbind(req_event_c, a_sum)
        print(c)
      }}}}


req_event_c <-data.frame(req_event_c) 

colnames(req_event_c) <- c("p","c","se_true", "events_req", "events_req_app", "events_req_app/events_req", "events_req_ni", "events_req_ni/events_req")

##########################################################################################################
#approx. No of events vs true No of events required to achieve the SE of Calibration slope = 0.05,0.10,0.15 

cal<- function(nevents,p,c) {
  fc=1
  if (c==0.8)            fc=1.01
  if (p<0.3  & c==0.85)  fc=1.03
  if (p<0.3  & c==0.9)   fc=1.05
  if (p==0.3 & c==0.85)  fc=1.02
  if (p==0.3 & c==0.9)   fc=1.04
  
  n       <- nevents/p
  sigmain <- sqrt(2)*qnorm(c)*fc
  
  mu      <- 0.5*(2*p-1)*(sigmain^2)+log(p/(1-p))
  sigma   <- sqrt((sigmain^2)*(1+p*(1-p)*(sigmain^2)))
  
  eta     <- rnorm(n,mu,sigma) 
  prob    <- invlogit(eta) 
  y       <- rbinom(n,1,prob)
  cs      <- coef(speedglm(y ~ eta, family=binomial(link='logit')))[2]
  cs
}



find_events_req_cs <- function(se_true,p,c,nstart1,nstart2,nstart_sim, tol=0.001){
  
  #initial values
  n1<-nstart1
  n2<-nstart2
  nsim<-nstart_sim
  
  #set.seed(0)
  
  se1<-sd(replicate(nsim, cal(n1,p,c) ))
  se2<-sd(replicate(nsim, cal(n2,p,c) ))
  se1;se2
  
  d1<-(se1-se_true)
  d2<-(se2-se_true)
  d1;d2
  
  i<-1
  
  if (abs(d1)<abs(d2)) out<-c(n1,se1) else out<-c(n2,se2)
  
  miniter<-5
  
  while (min(abs(d1),abs(d2))>tol){
    
    #set.seed(i)
    if (abs(d1)>abs(d2) & sign(d1)!=sign(d2)) n1<-(n1+n2)/2 
    if (abs(d1)<abs(d2) & sign(d1)!=sign(d2)) n2<-(n1+n2)/2
    if (sign(d1)==sign(d2) & n1>n2)  {n1<-n1+5; n2<-n2-5}
    if (sign(d1)==sign(d2) & n1<n2)  {n1<-n1-5; n2<-n2+5}
    
    nsimnew<-nsim
    se1<-sd(replicate(nsimnew, cal(n1,p,c)))
    se2<-sd(replicate(nsimnew, cal(n2,p,c)))
    se1;se2
    d1=se1-se_true
    d2<-se2-se_true
    i<-i+1
    print(i-1)
    if (abs(d1)<abs(d2)) out<-c(n1,se1) else out<-c(n2,se2)
  }
  out
}


req_event_cs<-NULL
for (se_true in c(0.05,0.1,0.15)){
  for (p in c(0.05, 0.1,0.3)){
    for (c in c(0.64,0.72,0.8,0.85,0.9)){
      
      #No of events required to achieve target SE using equation (18) 
      A <- 2*p*(1-p)*qnorm(c)^2
      events_req_app <- ceiling(1/se_true^2 * (1/A+2)*p)
      
      #No of events required to achieve target SE using equation (13) 
      sigmain           <- sqrt(2)*qnorm(c)
      mu                <- 0.5*(2*p-1)*(sigmain^2)+log(p/(1-p))
      sigma             <- sqrt((sigmain^2)*(1+p*(1-p)*(sigmain^2)))
      
      f_omega<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))*((1+exp(-x))^-1)*(1-(1+exp(-x))^-1)
      
      f_omega_eta<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))*((1+exp(-x))^-1)*(1-(1+exp(-x))^-1)*x
      
      f_omega_etasq<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))*((1+exp(-x))^-1)*(1-(1+exp(-x))^-1)*x^2
      
      mean_omega=integrate(f_omega, -Inf, Inf, mu=mu, s=sigma)$value
      mean_omega_eta_sq <-integrate(f_omega_etasq, -Inf, Inf, mu=mu, s=sigma)$value
      mean_omega_eta    <-integrate(f_omega_eta, -Inf, Inf, mu=mu, s=sigma)$value
      numer             <-n*mean_omega
      denom             <-n* mean_omega * n*mean_omega_eta_sq  - (n^2)*mean_omega_eta^2
      events_req_app_ni <- rbind(req_event_cs_mc,(1/se_true^2)*(numer/denom)*p)
    
      #find the true No of events to achieve target SE  
      if (c==0.64)   {nstart1 <- events_req_app*1.00 ; nstart2<-events_req_app*1.05}  
      if (c==0.72)   {nstart1 <- events_req_app*1.02 ; nstart2<-events_req_app*1.1}
      if (c==0.8)    {nstart1 <- events_req_app*1.1  ; nstart2<-events_req_app*1.25}
      if (c==0.85)   {nstart1 <- events_req_app*1.1  ; nstart2<-events_req_app*1.35}
      if (c==0.9)    {nstart1 <- events_req_app*1.2  ; nstart2<-events_req_app*1.45}
      
      nstart1<-ceiling(nstart1)
      nstart2<-ceiling(nstart2)
      
      a          <- find_events_req_cs(se_true,p,c,nstart1,nstart2,15000, tol=0.001)
      events_req <- ceiling(a[1])
      se_emp<-a[2]
      
      #result
      a_sum <- c(p,c,se_true, events_req, events_req_app,events_req_app_ni, events_req_app/events_req, events_req_app_ni/events_req, se_emp)
      
      req_event_cs<-rbind(req_even_cs, a_sum)
      print(c)
    }}}

req_event_cs           <-data.frame(req_event_cs)
colnames(req_event_cs) <- c("p","c","se_true", "events_req", "events_req_app","events_req_app_ni","events_req_app/events_req","events_req_app_ni/events_req","se_emp_cs", "se_app")



#################################################################################################################
# approx. No of events vs true No of events required to achieve the SE of calibration in the large= 0.05, 0.1, 0.15 

cs_in_l<- function(nevents,p,c){
  n       <- nevents/p
  sigmain <- sqrt(2)*qnorm(c)
  
  mu     <- 0.5*(2*p-1)*(sigmain^2)+log(p/(1-p))
  sigma  <- sqrt((sigmain^2)*(1+p*(1-p)*(sigmain^2)))
  
  eta    <- rnorm(n,mu,sigma) 
  prob   <- invlogit(eta) 
  y      <- rbinom(n,1,prob)
  csl    <- coef(speedglm(y ~ offset(eta), family=binomial(link='logit')))[1]
  csl
}


req_event_csl<-NULL

for (se_true in c(0.05, 0.1, 0.15)){
  
  for (p in c( 0.1,0.3)){
    
    for (c in c(0.85,0.9)){
      
      
      sigmain<- sqrt(2)*qnorm(c)
      mu     <- 0.5*(2*p-1)*(sigmain^2)+log(p/(1-p))
      sigma  <- sqrt((sigmain^2)*(1+p*(1-p)*(sigmain^2)))
      
      #No of events required to achieve target SE using equation (19)
      pin    <-  exp(mu)/(1+exp(mu))
      term1m <-  pin*(1-pin)
      term2m <-  (1/2)*(1-6*pin+6*pin^2)*pin*(1-pin)*sigma^2
      events_req_app <-  ceiling(1/ (se_true^2*(term1m+term2m))*p)
      
      #No of events required to achieve target SE using equation (12)
      sigmain           <- sqrt(2)*qnorm(c)
      mu                <- 0.5*(2*p-1)*(sigmain^2)+log(p/(1-p))
      sigma             <- sqrt((sigmain^2)*(1+p*(1-p)*(sigmain^2)))
      
      f_omega<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))*((1+exp(-x))^-1)*(1-(1+exp(-x))^-1)
      
      f_omega_eta<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))*((1+exp(-x))^-1)*(1-(1+exp(-x))^-1)*x
      
      f_omega_etasq<-function(x, mu, s) 1/(2*s**2*pi)**0.5 * exp(-(x-mu)**2/(2*s**2))*((1+exp(-x))^-1)*(1-(1+exp(-x))^-1)*x^2
      
      mean_omega=integrate(f_omega, -Inf, Inf, mu=mu, s=sigma)$value
      mean_omega_eta_sq <-integrate(f_omega_etasq, -Inf, Inf, mu=mu, s=sigma)$value
      mean_omega_eta    <-integrate(f_omega_eta, -Inf, Inf, mu=mu, s=sigma)$value
      numer             <-n*mean_omega
      denom             <-n* mean_omega * n*mean_omega_eta_sq  - (n^2)*mean_omega_eta^2

      events_req_app_ni  <- rbind(req_event_csl_ni,1/(se_true^2*mean_omega)*p)
      
      n0 <- events_req_app # Set start value to supplied lower bound
      i  <- 1 
      
      if (c==0.64)   {n1 <- ceiling(events_req_app*0.9) ; n2<-ceiling(events_req_app*1.1)}  
      if (c==0.72)   {n1 <- ceiling(events_req_app*0.9) ; n2<-ceiling(events_req_app*1.1)}
      if (c==0.8)    {n1 <- ceiling(events_req_app*0.9) ; n2<-ceiling(events_req_app*1.1)}
      if (c==0.85)   {n1 <- ceiling(events_req_app*1) ;  n2<-ceiling(events_req_app*1.35)}
      if (c==0.9)    {n1 <- ceiling(events_req_app*1) ;  n2<-ceiling(events_req_app*1.5)}
      
      
      f <- function(x) abs(se_true -sd(replicate(15000,cs_in_l(x,p,c))))
      # Check the upper and lower bounds to see if approximations result in 0
      d1 <- abs(se_true -sd(replicate(15000,cs_in_l(n1,p,c))))
      d2 <- abs(se_true -sd(replicate(15000,cs_in_l(n2,p,c))))
      
      d <- 1
      i <- 1
      while (i < 100 && d > 0.001) {
        if (d1 > d2) {
          n1   <- ceiling((n1+n2)/2)
          se1  <- sd(replicate(15000,cs_in_l(n1,p,c)))
          d1    <- abs(se_true -se1)
          d     <- d1
          events_req      <- n1
          se_emp <- se1
        }
        else{
          n2   <- ceiling((n1+n2)/2)
          se2  <- sd(replicate(15000,cs_in_l(n2,p,c)))
          d2   <- abs(se_true -se2) 
          d    <- d2
          events_req    <- n2
          se_emp <- se2
        }
        print(i)
        i = i+1
        events_req
        se_emp
      }
      
      a_sum <- c(p,c,se_true, events_req, events_req_app,events_req_app_ni,events_req_app/events_req,events_req_app_ni/events_req ,se_emp)
      
      req_event_csl<-data.frame(rbind(req_event_csl, a_sum))
      print(c)
    }}}

colnames(req_event_csl) <- c("p","c","se_true", "events_req", "events_req_app", "events_req_app_ni", "events_req_app/events_req",  "events_req_app_ni/events_req", "se_emp_cs", "se_app")



#########################################################################################################################################################################################
########################################################Power and type 1 error###########################################################################################################
#########################################################################################################################################################################################

power_c <- function(p0=for_c0[1],p1=for_c1[1],c0=for_c0[2],c1=for_c1[2],mu=for_c1[3], sigma=for_c1[4],alpha=0.05,beta=0.1) {
  
  #alpha = type I error, significance level
  #beta  = type-II error, 1-beta=power
  #H0: c=c0
  #H1: c=c1
  
  d=c1-c0
  
  sd_c0=sqrt((c0-2*T.Owen(-qnorm(c0),1/sqrt(3))-c0^2) /(p0-p0^2))
  sd_c1=sqrt((c1-2*T.Owen(-qnorm(c1),1/sqrt(3))-c1^2) /(p1-p1^2))
  
  #Formula for ss calculation
  
  n <- ceiling(((qnorm(1-alpha)*sd_c0 + qnorm(1-beta)*sd_c1))^2/d^2) ;n
  
  #Generate data under the alternative hypothesis
  
  eta    <- rnorm(n,mu,sigma)
  y      <- rbinom(n,1,invlogit(eta))
  
  c      <- roc(y,eta,quiet=TRUE,ci=TRUE)
  c_est  <- as.vector(c$auc) 
  se_c   <- (c$ci[3]-c_est)/qnorm(0.975)
  c_est+qnorm(1-alpha) *se_c
  cov_c  <- ifelse( (c_est-qnorm(1-alpha) *se_c) >= c0 , 1, 0)
  n1<-ceiling(n*p0)
  
  out<-c(mean(y),c0,c1,c_est,n1,cov_c)
  out
}

type1_error_c<- function(p0=for_c0[1],p1=for_c1[1],c0=for_c0[2],c1=for_c1[2],mu=for_c0[3], sigma=for_c0[4],alpha=0.05,beta=0.1) {
  
  #alpha = type I error, significance level
  #beta  = type-II error, 1-beta=power
  #H0: c=c0
  #H1: c=c1
  
  d=c1-c0
  
  sd_c0=sqrt((c0-2*T.Owen(-qnorm(c0),1/sqrt(3))-c0^2) /(p0-p0^2))
  sd_c1=sqrt((c1-2*T.Owen(-qnorm(c1),1/sqrt(3))-c1^2) /(p1-p1^2))
  
  n <- ceiling(((qnorm(1-alpha)*sd_c0 + qnorm(1-beta)*sd_c1))^2/d^2) ;n
  
  n1<-ceiling(n*p0)
  
  #Generate data under the NULL hypothesis
  
  eta  <- rnorm(n,mu,sigma)
  y    <- rbinom(n,1,invlogit(eta))
  
  c     <- roc(y,eta,quiet=TRUE,ci=TRUE)
  c_est <- as.vector(c$auc)     
  se_c  <- (c$ci[3]-c_est)/qnorm(0.975)
  
  cov_c <- ifelse( (c_est+qnorm(1-alpha) *se_c) <= c0 , 1, 0)
  out   <- c(mean(y),c0,c1,c_est,n1,cov_c)
  out
  
}


#################

Nsim <- 100000
res<-NULL

for (dc in c(0.03, 0.05)){
  
  for (p in c(0.05,0.1,0.3)){
    
    for (c in c(0.64, 0.72, 0.8, 0.85)){
      
      print(c(p,c))
      
      for_c0<- meas_true(2000000,p,c,fc=1)
      for_c1<- meas_true(2000000,p,c+dc,fc=1)    
      
      a<-replicate(Nsim, type1_error_c(p0=for_c0[1],p1=for_c1[1],c0=for_c0[2],c1=for_c1[2],mu=for_c0[3], sigma=for_c0[4],alpha=0.05,beta=0.1) ); a<-t(a)
      type1_error_n<-round(colMeans(a),3);type1_error_n
      
      b<-replicate(Nsim, power_c(p0=for_c0[1],p1=for_c1[1],c0=for_c0[2],c1=for_c1[2],mu=for_c1[3], sigma=for_c1[4],alpha=0.05,beta=0.1) ); b<-t(b)
      power_n<-round(colMeans(b),3);power_n
      
      a_sum <- c(dc,p,c,c+dc,type1_error_n[5:6],power_n[6])
      
      res<-rbind(res, a_sum)
    }}}
res_power <- res

res_power<-data.frame(res_power)


colnames(res_power) <- c("Difference","p","c0","c1","nevents", "alpha","power")
View(res_power)