################################################################################
# Project   : R code for manuscript "Evaluation of Logistic Regression Models
#             and Effect of Covariates for Case-Control Study in RNA-Seq
#             Analysis"
# Author    : Seung Hoan Choi
# Date      : Jan/18/2017
# R version : R-3.0.0_gnu-4.4.6
################################################################################

### Data generation functions

# Number of controls implied by a study-design label.
#   case   : number of cases
#   design : "Balanced" (controls = cases), "Unbalanced2" (2 x cases) or
#            "Unbalanced4" (4 x cases)
# Stops with an error for any other label.
.n_controls <- function(case, design) {
  switch(design,
    Balanced = case,
    Unbalanced2 = case * 2,
    Unbalanced4 = case * 4,
    stop("unknown design '", design, "'; expected \"Balanced\", ",
         "\"Unbalanced2\" or \"Unbalanced4\"", call. = FALSE)
  )
}

# Position of a single `value` inside `grid`; the position feeds the seed
# formulas below, so a value outside the grid is an error rather than a
# zero-length seed.
.grid_index <- function(value, grid, what) {
  idx <- match(value, grid)
  if (length(idx) != 1L || is.na(idx)) {
    stop(what, " must be one of: ", paste(grid, collapse = ", "),
         call. = FALSE)
  }
  idx
}

# Generate one case-control data set of negative-binomial counts and,
# optionally, binary covariates.
#   case, cont : number of cases / controls (cases occupy the first rows)
#   Ngene      : number of count columns ("gcount1", "gcount2", ...)
#   dmu, cmu   : negative-binomial mean in cases / controls
#   disp       : dispersion; passed to rnbinom() as size = 1 / disp
#   Ncovs      : number of binary covariate columns ("covar1", ...)
#   OR         : odds used for the covariates in cases; a case has the
#                covariate with probability OR / (OR + 1), a control with
#                probability 0.5. Required when Ncovs > 0.
# Returns a (case + cont) x (1 + Ngene + Ncovs) numeric matrix whose first
# column "Phen" is 1 for cases and 0 for controls.
NBEXPDATA <- function(case, cont, Ngene, dmu, cmu, disp, Ncovs, OR = NULL) {
  if (Ncovs > 0 && is.null(OR)) {
    stop("OR must be supplied when Ncovs > 0", call. = FALSE)
  }

  cov.names <- if (Ncovs > 0) paste0("covar", seq_len(Ncovs)) else NULL
  data <- matrix(NA, nrow = case + cont, ncol = 1 + Ngene + Ncovs)
  colnames(data) <- c("Phen", paste0("gcount", seq_len(Ngene)), cov.names)
  data[, 1] <- c(rep(1, case), rep(0, cont))

  # Draw gene by gene, cases before controls. The order of the random draws
  # is part of the contract: changing it would change the data produced by a
  # given seed.
  for (g in seq_len(Ngene)) {
    data[, 1 + g] <- c(
      rnbinom(case, mu = dmu, size = 1 / disp),
      rnbinom(cont, mu = cmu, size = 1 / disp)
    )
  }

  if (Ncovs > 0) {
    c.prob <- OR / (OR + 1)
    for (v in seq_len(Ncovs)) {
      data[, 1 + Ngene + v] <- c(
        rbinom(case, size = 1, prob = c.prob),
        rbinom(cont, size = 1, prob = 0.5)
      )
    }
  }

  data
}

# Simulate 10 replicate data sets (1000 genes each, no covariates) for one
# scenario.
#   case       : number of cases
#   design     : "Balanced", "Unbalanced2" or "Unbalanced4"
#   log2fc     : log2 fold-change; one of 0, 0.3, 0.6, 1.2, 2
#   cont.mean  : mean expression in controls
#   dispersion : one of 0.01, 0.1, 0.5, 1
# Returns a list named "sim_1" ... "sim_10" of NBEXPDATA() matrices. Each seed
# is printed as it is set.
simNBdata <- function(case, design, log2fc, cont.mean, dispersion) {
  log2fold.list <- c(0, 0.3, 0.6, 1.2, 2)
  dispersion.list <- c(0.01, 0.1, 0.5, 1)

  cont <- .n_controls(case, design)
  case.mean <- round((2^log2fc) * cont.mean)
  f <- .grid_index(log2fc, log2fold.list, "log2fc")
  i <- .grid_index(dispersion, dispersion.list, "dispersion")

  # Seed layout kept exactly as published: the zero fold-change uses its own
  # grid position, every other fold-change uses position - 1 (so log2fc = 0
  # and log2fc = 0.3 share seeds).
  f.seed <- if (log2fc == 0) f else f - 1

  simdata <- list()
  for (k in seq_len(10)) {
    seed.num <- 10000 * f.seed + 1000 * i + 100 * k
    set.seed(seed.num)
    print(seed.num)
    simdata[[paste0("sim_", k)]] <- NBEXPDATA(
      case = case, cont = cont, Ngene = 1000,
      dmu = case.mean, cmu = cont.mean, disp = dispersion, Ncovs = 0
    )
  }
  simdata
}

# Simulate 10 x 100 replicate data sets (10 genes each) with binary
# covariates for one scenario.
#   case       : number of cases; 5 covariates are generated when case == 10,
#                otherwise 10
#   design     : "Balanced", "Unbalanced2" or "Unbalanced4"
#   log2fc     : log2 fold-change; one of 0, 0.3, 0.6, 1.2, 2
#   cont.mean  : mean expression in controls
#   dispersion : one of 0.01, 0.1, 0.5, 1
#   oddsratio  : passed to NBEXPDATA() as OR for the covariates
# Returns a list named "sim_<k>-<r>" (k = 1..10, r = 1..100); each element is
# a matrix with columns Phen, gcount1..gcount10 and the covariates. Each seed
# is printed as it is set.
simNBdataCov <- function(case, design, log2fc, cont.mean, dispersion,
                         oddsratio) {
  log2fold.list <- c(0, 0.3, 0.6, 1.2, 2)
  dispersion.list <- c(0.01, 0.1, 0.5, 1)
  design.list <- c("Balanced", "Unbalanced2", "Unbalanced4")
  n.covariates <- if (case == 10) 5 else 10

  cont <- .n_controls(case, design)
  case.mean <- round((2^log2fc) * cont.mean)
  f <- .grid_index(log2fc, log2fold.list, "log2fc")
  i <- .grid_index(dispersion, dispersion.list, "dispersion")
  # The seed has a study-design digit `j`. It is derived from `design` here
  # instead of being read from whatever `j` happens to exist in the calling
  # environment.
  j <- .grid_index(design, design.list, "design")
  f.seed <- if (log2fc == 0) f else f - 1

  simdata <- list()
  for (k in seq_len(10)) {
    for (r in seq_len(1000 / 10)) {
      ###### Keep the seed information to regenerate the same data set in future
      seednum <- 100000 * r + f.seed * 10000 + 1000 * i + 100 * k + 10 * j + 1
      print(seednum)
      set.seed(seednum)

      ###### Count data generation
      counts <- NBEXPDATA(
        case = case, cont = cont, Ngene = 10,
        dmu = case.mean, cmu = cont.mean, disp = dispersion, Ncovs = 0
      )

      ###### Covariate generation. The single gene column drawn by this call
      ###### is discarded below, but the draw itself must stay so the random
      ###### stream (and therefore the covariates) is unchanged.
      GenCov <- NBEXPDATA(
        case = case, cont = cont, Ngene = 1,
        dmu = case.mean, cmu = cont.mean, disp = dispersion,
        Ncovs = n.covariates, OR = oddsratio
      )
      data <- cbind(counts, GenCov[, 3:ncol(GenCov), drop = FALSE])

      ####### save in the list
      simdata[[paste0("sim_", k, "-", r)]] <- data
    }
  }
  simdata
}

#####
##### Logistic regression vs.
##### Negative binomial regression simulation study
##### simulation parameters
#####
ncases <- c(10, 25, 75, 500)                                 #### the number of cases
study.design <- c("Balanced", "Unbalanced2", "Unbalanced4")  #### study design
log2foldchange <- c(0, 0.3, 0.6, 1.2, 2)                     #### log2 fold-change
mean.exp.control <- c(50, 100, 1000, 10000)                  #### mean expression value in controls
dispersion.value <- c(0.01, 0.1, 0.5, 1)                     #### dispersion value

##### Scenario selection: replace each NA with the index of the setting to run.
i <- NA_integer_  ### index into ncases (1 to 4)
j <- NA_integer_  ### index into study.design (1 to 3)
k <- NA_integer_  ### index into log2foldchange (5 values; index 1 is the zero
                  ### fold-change that the type I error runs always use)
l <- NA_integer_  ### index into mean.exp.control (1 to 4)
m <- NA_integer_  ### index into dispersion.value (1 to 4)
if (any(is.na(c(i, j, k, l, m)))) {
  stop("set the scenario indices i, j, k, l and m before running",
       call. = FALSE)
}

#####
#####
##### Type I error study (fold-change fixed at log2foldchange[1])
#####
out <- simNBdata(
  case = ncases[i], design = study.design[j], log2fc = log2foldchange[1],
  cont.mean = mean.exp.control[l], dispersion = dispersion.value[m]
)

#####
#####
##### Power study
##### (each call assigns to `out`, replacing the previous result)
#####
out <- simNBdata(
  case = ncases[i], design = study.design[j], log2fc = log2foldchange[k],
  cont.mean = mean.exp.control[l], dispersion = dispersion.value[m]
)

#####
##### Effect of covariate simulation study
##### simulation parameters
#####
ncases <- c(10, 25, 75, 500)                                 #### the number of cases
study.design <- c("Balanced", "Unbalanced2", "Unbalanced4")  #### study design
log2foldchange <- c(0, 0.3, 0.6, 1.2, 2)                     #### log2 fold-change
mean.exp.control <- c(50, 100, 1000, 10000)                  #### mean expression value in controls
dispersion.value <- c(0.01, 0.1, 0.5, 1)                     #### dispersion value
covariate.oddsratio <- c(1, 1.2, 3, 5, 10)                   #### covariate odds ratios

##### Scenario selection: replace each NA with the index of the setting to run.
i <- NA_integer_  ### index into ncases (1 to 4)
j <- NA_integer_  ### index into study.design (1 to 3)
k <- NA_integer_  ### index into log2foldchange (5 values; index 1 is the zero
                  ### fold-change that the type I error runs always use)
l <- NA_integer_  ### index into mean.exp.control (1 to 4)
m <- NA_integer_  ### index into dispersion.value (1 to 4)
n <- NA_integer_  ### index into covariate.oddsratio (1 to 5).
                  ### If the number of cases is 10, use 1 to 4.
if (any(is.na(c(i, j, k, l, m, n)))) {
  stop("set the scenario indices i, j, k, l, m and n before running",
       call. = FALSE)
}

#####
#####
##### Type I error study (fold-change fixed at log2foldchange[1])
#####
out <- simNBdataCov(
  case = ncases[i], design = study.design[j], log2fc = log2foldchange[1],
  cont.mean = mean.exp.control[l], dispersion = dispersion.value[m],
  oddsratio = covariate.oddsratio[n]
)

#####
#####
##### Power study
##### (each call assigns to `out`, replacing the previous result)
#####
out <- simNBdataCov(
  case = ncases[i], design = study.design[j], log2fc = log2foldchange[k],
  cont.mean = mean.exp.control[l], dispersion = dispersion.value[m],
  oddsratio = covariate.oddsratio[n]
)

#####
sessionInfo()
quit("no")