########################################## FUNCTIONS ################################################################### #******************************************************************************************************************* # FUNCTIONS CLASS COMPARISON #******************************************************************************************************************* ### Fold Change for each feature f.FC <- function(X,Y){ X.h <- apply(X[,!is.na(Y) & Y==1],1,mean); X.nh <- apply(X[,!is.na(Y) & Y==-1],1,mean) FC <- 2^(X.h-X.nh) } ### T-TEST p-values for each feature f.T <- function(X,Y) { out <- NULL X.h <- X[,Y==1];X.nh <- X[,Y==-1] # X.h : X hemolyzed; X.nh : X not hemolyzed for (i in 1:dim(X)[1]) { t.p <- t.test(as.numeric(X.h[i,]), as.numeric(X.nh[i,]))$p.value out <- rbind(out,t.p) } out } ### AD TEST p-values for each feature f.AD <- function(X,Y){ out <- NULL X.h <- X[,Y==1];X.nh <- X[,Y==-1] for (i in 1:dim(X)[1]) { ad.p <- adk.test(as.numeric(X.h[i,]), as.numeric(X.nh[i,]))$adk[4] out <- rbind(out,ad.p) } out } #******************************************************************************************************************* # FUNCTIONS CLASS PREDICTION #******************************************************************************************************************* ### STEP 1: BOOTSTRAP MIRNA SELECTION # Chosen methods: "pamr", "rf_boruta", "scad+L2" f.doBS <- function (logX, groupings, fs.methods = c("pamr", "scad+L2", "rf_boruta"), DIR = "bs", seed = 123, seed.boot=123, bstr = 100, saveres = TRUE, jitter = FALSE, maxiter = 1000, maxevals = 500, bounds = NULL, max_allowed_feat = NULL, n.threshold = 50, maxRuns = 300) { print("#=========doBS===========") print(seed.boot) print("#========================") stopifnot(all(fs.methods %in% c("pamr", "rf_boruta", "scad", "scad+L2", "1norm", "DrHSVM"))) if (!file.exists(DIR)) dir.create(DIR) results <- list() for (i in 1:length(fs.methods)) { if (fs.methods[i] == "pamr") { paramsPAMR <- list(seed = seed, bstr = bstr, max_allowed_feat = max_allowed_feat, n.threshold = n.threshold, saveres = saveres, jitter = jitter, seed.boot=seed.boot) results[["pamr"]] <- f.bsPAMR(logX, groupings, DIR, paramsPAMR) } else if (fs.methods[i] == "rf_boruta") { paramsRFBORUTA <- list(seed = seed, bstr = bstr, maxRuns = maxRuns, saveres = saveres, jitter = jitter, seed.boot=seed.boot) results[["rf_boruta"]] <- f.bsRFBORUTA(logX, groupings, DIR, paramsRFBORUTA) } else { paramsSCAD <- list(seed = seed, bstr = bstr, bounds = bounds, maxevals = maxevals, maxiter = maxiter, saveres = saveres, jitter = jitter, fs.method = fs.methods[i], seed.boot=seed.boot) results[[fs.methods[i]]] <- f.bsSCAD(logX, groupings, DIR, paramsSCAD) } } results } f.bsPAMR <- function (logX, groupings, DIR, params = NULL){ if (is.null(params)) { params <- list(seed = 123, bstr = 100, ncv = 5, max_allowed_feat = 500, n.threshold = 30, saveres = TRUE, jitter = FALSE) } if (params$jitter) { logX <- jitter(logX) } fs.method <- "pamr" seed <- params$seed seed.boot <- params$seed.boot print("#=========bsPAMR=========") print(seed.boot) print("#========================") bstr <- params$bstr ncv <- params$ncv max_allowed_feat <- params$max_allowed_feat n.threshold <- params$n.threshold saveres <- params$saveres SUBDIR <- paste(DIR, fs.method, sep = "/") if (!file.exists(SUBDIR)) dir.create(SUBDIR) fnames <- paste(SUBDIR, "/", names(groupings), ".pdf", sep = "") X <- lapply(1:length(groupings), function(i, groupings, fnames) list(groupings[[i]], fnames[i]), groupings = groupings, fnames = fnames) names(X) <- names(groupings) seedo <- seed pam_bstr <- list() for (i in 1:length(X)) { datX <- logX datY <- X[[i]][[1]] nasY <- which(is.na(datY)) nasX <- which(apply(datX, 1, function(x) all(is.na(x)))) nas <- unique(c(nasY, nasX)) if (length(nas) > 0) { datX <- datX[-nas, ] datY <- datY[-nas] } pams <- list() seed <- seedo for (rp in 1:bstr) { print(rp) seed <- seed + 1 seed.boot <- seed.boot+1 dat_bstr <- f.select_bootstrap_data(datX, datY, seed.boot) ypam <- dat_bstr[["datY"]] xpam <- t(dat_bstr[["datX"]]) pamdat <- list(x = xpam, y = ypam) histtr <- pamr.train(pamdat, n.threshold = n.threshold) histcv <- pamr.cv(histtr, pamdat, nfold = ncv) tmin <- select_threshold(histcv, max_allowed_feat = max_allowed_feat) selected <- pamr.predict(histtr, pamdat$x, tmin, type = "nonzero") selected_names <- rownames(pamdat$x)[selected] pams[[rp]] <- list(histtr = histtr, histcv = histcv, tmin = tmin, selected = selected, selected_names = selected_names) } pam_bstr[[names(X)[i]]] <- pams } attr(pam_bstr, "fs.method") <- "pamr" ig <- makeIG(pam_bstr, SUBDIR, prob = 0.975) if (saveres) { save(pam_bstr, ig, params, file = paste(SUBDIR, "PAM_RData.RData", sep = "/")) } invisible(pam_bstr) } f.bsRFBORUTA <- function (logX, groupings, DIR, params = NULL){ if (is.null(params)) { params <- list(seed = 123, bstr = 100, maxRuns = 300, saveres = TRUE, jitter = FALSE) } if (params$jitter) { logX <- jitter(logX) } fs.method <- "rf_boruta" seed <- params$seed seed.boot <- params$seed.boot print("#=======bsRFBORUTA=======") print(seed.boot) print("#========================") bstr <- params$bstr maxRuns <- params$maxRuns saveres <- params$saveres SUBDIR <- paste(DIR, fs.method, sep = "/") if (!file.exists(SUBDIR)) dir.create(SUBDIR) fnames <- paste(SUBDIR, "/", names(groupings), ".pdf", sep = "") X <- lapply(1:length(groupings), function(i, groupings, fnames) list(groupings[[i]], fnames[i]), groupings = groupings, fnames = fnames) names(X) <- names(groupings) useparallel <- length(grep("package:(parallel|multicore)", search()) > 0) if (length(X) > 1 & useparallel) { rfs_bstr <- mclapply(X, f.rf_multi, datX = logX, maxRuns = maxRuns, seed = seed, bstr = bstr, mc.preschedule = TRUE, mc.cores = length(X), seed.boot=seed.boot) } else { rfs_bstr <- lapply(X, f.rf_multi, datX = logX, maxRuns = maxRuns, seed = seed, bstr = bstr,seed.boot=seed.boot) } attr(rfs_bstr, "fs.method") <- "rf_boruta" ig <- makeIG(rfs_bstr, SUBDIR, prob = 0.975) if (saveres) { save(rfs_bstr, ig, params, file = paste(SUBDIR, "RF_RData.RData", sep = "/")) } invisible(rfs_bstr) } f.rf_multi <- function (X, datX, maxRuns = 500, seed = 123, bstr = 100,seed.boot=123){ print("#=======rf_multi=========") print(seed.boot) print("#========================") datY <- X[[1]] nasY <- which(is.na(datY)) nasX <- which(apply(datX, 1, function(x) all(is.na(x)))) nas <- unique(c(nasY, nasX)) if (length(nas) > 0) { datX <- datX[-nas, ] datY <- datY[-nas] } rfs <- list() for (rp in 1:bstr) { print(rp) seed <- seed + 1 seed.boot <- seed.boot + 1 dat_bstr <- f.select_bootstrap_data(datX, datY,seed.boot) bres <- Boruta(x = as.data.frame(dat_bstr$datX), y = factor(dat_bstr$datY), doTrace = 2, maxRuns = maxRuns) selprobes <- gsub("`", "", names(bres$finalDecision[which(bres$finalDecision != "Rejected")])) rfs[[rp]] <- list(bres = bres, selprobes = selprobes) } rfs } f.bsSCAD <- function (logX, groupings, DIR, params = list(seed = 123, bstr = 100, maxiter = 1000, maxevals = 500, bounds = NULL, saveres = TRUE, jitter = FALSE, fs.method = "scad+L2")){ if (params$jitter) { logX <- jitter(logX) } fs.method <- params$fs.method seed <- params$seed seed.boot <- params$seed.boot print("#=======bsSCAD===========") print(seed.boot) print("#========================") bstr <- params$bstr bounds <- params$bounds maxiter <- params$maxiter maxevals <- params$maxevals saveres <- params$saveres SUBDIR <- paste(DIR, fs.method, sep = "/") if (!file.exists(SUBDIR)) dir.create(SUBDIR) fnames <- paste(SUBDIR, "/", names(groupings), ".pdf", sep = "") X <- lapply(1:length(groupings), function(i, groupings, fnames) list(groupings[[i]], fnames[i]), groupings = groupings, fnames = fnames) names(X) <- names(groupings) useparallel <- length(grep("package:(parallel|multicore)", search()) > 0) if (length(X) > 1 & useparallel) { scad_bstr <- mclapply(X, f.bstr_multi, datX = logX, bstr = bstr, seed = seed, fs.method = fs.method, bounds = bounds, maxiter = maxiter, maxevals = maxevals, mc.preschedule = TRUE, mc.cores = length(X), seed.boot=seed.boot) } else { scad_bstr <- lapply(X, f.bstr_multi, datX = logX, bstr = bstr, seed = seed, fs.method = fs.method, bounds = bounds, maxiter = maxiter, maxevals = maxevals,seed.boot=seed.boot) } attr(scad_bstr, "fs.method") <- fs.method ig <- makeIG(scad_bstr, SUBDIR, prob = 0.975) #Quantile in [0,1], defines the cutoff, at which frequency of cooccuring #features an edge will be drawn between the two features in the importance graph if (saveres) { save(scad_bstr, ig, params, file = paste(SUBDIR, "SCAD_RData.RData", sep = "/")) } invisible(scad_bstr) } f.bstr_multi <- function (X, datX, bstr, seed = 123, fs.method = "scad+L2", bounds = NULL, maxiter = 1000, maxevals = 500,seed.boot=123){ print("#=======bstr_multi=======") print(seed.boot) print("#========================") datY <- X[[1]] nasY <- which(is.na(datY)) nasX <- which(apply(datX, 1, function(x) all(is.na(x)))) nas <- unique(c(nasY, nasX)) if (length(nas) > 0) { datX <- datX[-nas, ] datY <- datY[-nas] } grid.search <- "interval" lambda1.scad <- lambda2.scad <- NULL bounds <- NULL scads <- list() for (rp in 1:bstr) { print(rp) seed <- seed + 1 seed.boot <- seed.boot + 1 proceed <- TRUE while (proceed) { dat_bstr <- f.select_bootstrap_data(datX, datY,seed.boot) ttt <- try(scad <- my.svm.fs(dat_bstr[[1]], y = dat_bstr[[2]], fs.method = fs.method, bounds = bounds, lambda1.set = lambda1.scad, lambda2.set = lambda2.scad, cross.outer = 0, grid.search = grid.search, maxIter = maxiter, inner.val.method = "cv", cross.inner = 5, maxevals = maxevals, seed = seed, parms.coding = "log2", show = "none", verbose = FALSE)) if (class(ttt) != "try-error") proceed <- FALSE } scads[[rp]] <- scad } scads } f.select_bootstrap_data <- function (datX, datY,seed.boot){ print("#==select_bootstrap_data==") print(seed.boot) print("#=========================") # bootstrap seed set.seed(seed.boot) tdY <- table(datY) a <- sample(which(datY == names(tdY)[1]), tdY[1], replace = TRUE) b <- sample(which(datY == names(tdY)[2]), tdY[2], replace = TRUE) dbstr <- datX[c(a, b), ] rownames(dbstr) <- paste(rownames(dbstr), 1:nrow(dbstr), sep = ".") dYbstr <- datY[c(a, b)] stopifnot(all(dim(datX) == dim(dbstr))) list(datX = as.matrix(dbstr), datY = as.numeric(dYbstr), id_contr=a, id_diseased=b) } ### EGG-SHAPED PLOT f.egg_graph <- function (phi, main = "", highlight = NULL, layout = "layout.ellipsis", pdf = NULL, pointsize = 12, tk = FALSE, node.color = "grey",node.filter = NULL, vlabel.cex = 0.6, vlabel.cex.min = 0.5, vlabel.cex.max = 4, max_node_cex = 8, edge.width = 1, filter = 10, max_edge_cex = 6, ewprop = 3){ edge.color = "lightseagreen" edge.lty = "solid" weights <- phi phix <- ifelse(phi == 0, 0, 1) dw <- diag(weights) phix[weights <= filter] <- 0 weights[weights <= filter] <- 0 diag(phix) <- 0 diag(weights) <- dw ig <- graph.adjacency(phix) ig.nodes <- as.matrix(print.igraph.vs(V(ig))) if (tk) { tkplot(ig, vertex.label = V(ig)$name) return(ig) } vertex.color <- rep(node.color, length(ig.nodes)) if (!is.null(highlight)) { if (class(highlight) == "list") { highlight <- unique(unlist(highlight)) } if (class(highlight) == "numeric") { snodes <- colnames(phi)[highlight] } else { snodes <- highlight } vertex.color[match(snodes, ig.nodes)] <- "darkblue" } if (!all(phix == 0)) { ig.edges <- gsub(" ", "", as.matrix(print.igraph.es(E(ig)))) edge.width <- rep(edge.width, length(ig.edges)) if (!is.null(weights)) { epairs <- strsplit(ig.edges, "->") for (epi in 1:length(epairs)) { epair <- epairs[[epi]] edge.width[epi] <- weights[epair[1], epair[2]] } } print("Setting graph attributes...") E(ig)$color <- edge.color E(ig)$lty <- edge.lty edge.width <- edge.width^ewprop/max(edge.width^ewprop) * max_edge_cex edge.width[is.na(edge.width)] <- 1 E(ig)$width <- edge.width E(ig)$arrow.size <- 0 } V(ig)$color <- vertex.color V(ig)$label.color <- "darkblue" V(ig)$shape <- rep("circle", length(ig.nodes)) nsize <- diag(weights) if (!is.null(node.filter)) { nsize[nsize <= node.filter] <- 0 } nsize <- (nsize - min(nsize))/max((nsize - min(nsize))) * max_node_cex if (all(is.na(nsize))) vlabel.cex <- 1.5 if (any(is.na(nsize))) { nsize[is.na(nsize)] <- 0 } vlabel.cex <- nsize/max(nsize) * vlabel.cex vlabel.cex[vlabel.cex == 0] <- vlabel.cex.min vlabel.cex <- pmin(vlabel.cex, vlabel.cex.max) V(ig)$label.cex <- vlabel.cex V(ig)$size <- nsize print("..and plot.") if (!is.null(pdf)) pdf(pdf, pointsize = pointsize) if (class(layout) != "function") { if (layout == "layout.ellipsis") { lc <- layout.ellipsis(ig, a = 1, b = 1.5) plot(ig, vertex.label = V(ig)$name, layout = lc, main = main, rescale = FALSE, ylim = range(lc[, 2], xlim = range(lc[, 1]))) } } else { plot(ig, vertex.label = V(ig)$name, layout = layout, main = main, rescale = TRUE) } if (!is.null(pdf)) dev.off() invisible(list(ig = ig, layout = layout)) }