# # R and RCytoscape functions for processing phosphoproteomic data. # Author: Mark Grimes, Division of Biological Sciences, University of Montana # Mark.Grimes@mso.umt.edu # rtklist <- c("ALK", "LTK", "AXL", "MER", "TYRO3", "DDR1" , "DDR2", "EGFR", "ERBB2", "ERBB3", "ERBB4", "EPHA1" , "EPHA2", "EPHA3","EPHA4", "EPHA5", "EPHA6", "EPHA7", "EPHA8" , "EPHB1", "EPHB2", "EPHB3" , "EPHB4", "EPHB6", "EPHX", "FGFR1" , "FGFR2", "FGFR3", "FGFR4", "IGF1R" , "INSR", "INSRR", "MET", "RON", "MUSK", "CSF1R" , "FLT3", "KIT", "PDGFRA", "PDGFRB", "PTK7", "RET", "ROR1", "ROR2", "ROS1", "RYK", "TEK", "TIE", "NTRK1" , "NTRK2" ,"NTRK3" ,"VEGFR1", "VEGFR2", "VEGFR3", "AATYK" , "AATYK2", "AATYK3", "STYK1") sfklist <- c("BLK", "FGR", "FRK", "FYN", "HCK", "LCK", "LYN", "PTK6", "SRC", "YES1" ) fungenes <- c(sfklist, rtklist) # Get the plyr package; see http://had.co.nz/plyr/ >install.packages("plyr") # Then run: library(plyr) printf <- function (thing) print(noquote(sprintf(thing))) # Function to select columns from spreadsheets: get.colnames <- function(colnames) { cat("Enter column numbers to keep, one at a time, followed by :","\n","\t") col.names <- read.table(stdin()) col.names<-as.numeric(col.names$V1) return(col.names) } # # use with sapply: get.gene <- function(cell) { fixgenes = c("CDC2", "2-Sep", "3-Sep", "4-Sep", "5-Sep", "7-Sep", "8-Sep", "9-Sep", "1-Oct", "2-Oct", "3-Oct", "4-Oct", "6-Oct", "7-Oct", "11-Oct", "1-Mar", "2-Mar", "3-Mar", "4-Mar", "5-Mar", "6-Mar", "7-Mar", "8-Mar", "9-Mar", "10-Mar", "11-Mar", "C11orf58") corrects = c("CDK1", "SEPT2", "SEPT3", "SEPT4", "SEPT5", "SEPT7", "SEPT8", "SEPT9", "POU2F1", "POU2F2", "POU5F1", "POU5F1", "POU3F1", "POU3F2", "POU2F3", "MARCH1", "MARCH2", "MARCH3", "MARCH4", "MARCH5", "MARCH6", "MARCH7", "MARCH8", "MARCH9", "MARCH10", "MARCH11", "SMAP") x<-unlist(strsplit(as.character(cell), ";")) for (i in 1:length(fixgenes)) { if (x[1] == fixgenes[i]) { x[1] <- corrects[i] return (x[1]) } } return(x[1]) } # # get.gene.2 <- function(cell) { fixgenes = c("CDC2", "2-Sep", "3-Sep", "4-Sep", "5-Sep", "7-Sep", "8-Sep", "9-Sep", "1-Oct", "2-Oct", "3-Oct", "4-Oct", "6-Oct", "7-Oct", "11-Oct", "1-Mar", "2-Mar", "3-Mar", "4-Mar", "5-Mar", "6-Mar", "7-Mar", "8-Mar", "9-Mar", "10-Mar", "11-Mar", "C11orf58", 'C17orf57', 'C3orf10', 'C7orf51') corrects = c("CDK1", "SEPT2", "SEPT3", "SEPT4", "SEPT5", "SEPT7", "SEPT8", "SEPT9", "POU2F1", "POU2F2", "POU5F1", "POU5F1", "POU3F1", "POU3F2", "POU2F3", "MARCH1", "MARCH2", "MARCH3", "MARCH4", "MARCH5", "MARCH6", "MARCH7", "MARCH8", "MARCH9", "MARCH10", "MARCH11", "SMAP", "EFCAB13", "BRK1", "NYAP1") x<-unlist(strsplit(as.character(cell), ";")) for (i in 1:length(fixgenes)) { if (x[1] == fixgenes[i]) { x[1] <- corrects[i] return (x[1]) } } if ((length(x)>1) & !(x[1] %in% datafile$Gene.Name) & (x[2] %in% datafile$Gene.Name)) return( x[2]) else return(x[1]) } #__________ pick.unique <- function(genes) { gene.v <- unlist(strsplit(as.character(genes), ";")) gene.v <- gsub(" ", "", gene.v, fixed=F) genes <- paste(unique(gene.v), collapse=";") return (genes) } # Functions to use org.Hs.egUNIPROT, then org.Hs.egGENENAME to go from UNIPROT to ENTREZ to GENENAME library('org.Hs.eg.db') sym2geneID <- function (sym) { if (sym %in% (keys(org.Hs.egSYMBOL2EG))) { return(org.Hs.egSYMBOL2EG[[sym]]) } else return (NA) } geneID2sym <- function (geneID) { if (geneID %in% (keys(org.Hs.egSYMBOL))) { return(org.Hs.egSYMBOL[[geneID]]) } else return (NA) } UNI.tab <- toTable(org.Hs.egUNIPROT) uni2geneID <- function (uni) { x<-unlist(strsplit(as.character(uni), ";")) uni <- x[1] if (uni %in% UNI.tab$uniprot_id) { generow <- which(UNI.tab$uniprot_id == as.character(uni)) return(UNI.tab[generow[1], 1]) } else return (NA) } # e.g. uni="P12931" = SRC # strippercent <- function(residue.raw) { residue <- gsub("\\%", "\\", residue.raw, fixed=F) return(residue) } stripbar <- function(cell) { modcell <- gsub("\\|", "\\;", cell, fixed=F) return(modcell) } stripdash <- function(cell) { modcell <- gsub("\\-", "\\", cell, fixed=F) return(modcell) } subsemi <- function(cell) { modcell <- gsub("\\;", "\\,", cell, fixed=F) return(modcell) } strip.t <- function(cell) { modcell <- gsub("\\t", "\\", cell, fixed=F) return(modcell) } strip.sigma <- function(cell) { modcell <- gsub("\\§", "\\", cell, fixed=F) return(modcell) } stripslash <- function(cell) { modcell <- gsub("\\\"", "\\", cell, fixed=F) return(modcell) } # get.sites <- function (cell) { modcell <- strippercent(cell) cell <- modcell modcell <- strip.sigma(cell) cell <- modcell modcell <- strip.t(cell) cell <- modcell modcell <- subsemi(cell) cell <- modcell modcell <- stripslash(cell) cell <- modcell modcell <- stripslash(cell) return(modcell) } ## calc.ratio <- function(control, treated) { if (control==treated) { ratio=1 return(ratio) } else if (control > treated) { if (treated==0) { ratio=-50 return(ratio) } else if (control==0) { ratio=-10 return(ratio) } else if ((control < 0) & (treated < 0)) { ratio <- -(treated/control) return(ratio) } else if (treated < 0) { diff.f <- (treated - control) # signal is the difference; a negative number denom.f <- (treated + control) # positive if treated is more neg than control pos if (denom.f > 0) { ratio=-(diff.f/treated) return(ratio) } else if (denom.f < 0) { ratio=(diff.f/control) return(ratio) } } else ratio <- -1/(treated/control) return(ratio) } if (control < treated) { if (control==0) { ratio=50 return(ratio) } else if (treated==0) { ratio=10 return(ratio) } else if ((control < 0) & (treated < 0)) { ratio <- 1/(treated/control) return(ratio) } else if (control < 0) { diff.f <- (treated - control) # a positive number denom.f <- (treated + control) # positive if control is more neg than treated pos if (denom.f > 0) { ratio=-(diff.f/control) return(ratio) } else if (denom.f < 0) { ratio=(diff.f/treated) return(ratio) } } else ratio <- (treated/control) return(ratio)} } # # The following is for merging data from two runs of the same sample # use: datafile$SY5Y <- mapply(merge2cols, colv1=datafile[,7], colv2=datafile[,9]) # merge2cols <- function (colv1, colv2) { newcolv=NA if (is.na(colv1) & is.na(colv2)) { newcolv=NA return(newcolv)} else if (is.na(colv1) | is.na(colv2)) { newcolv <- sum(colv1, colv2, na.rm=TRUE) return(newcolv) } else if (colv1 == 0 | colv2 == 0) { newcolv <- colv1 + colv2 return(newcolv)} else newcolv <- (colv1 + colv2)/2 return(newcolv) } # # Map HUGO names from PSP names # map.HUGO.name <- function(pspname, species) { PSProw <- which((PSP.map$PSP_NAME == as.character(pspname)) & (PSP.map$ORGANISM == as.character(species))) if (length(PSProw) >= 1) { mapped.name <-as.character(PSP.map[PSProw, "GENE_SYMBOL"]) return(mapped.name[1]) } else return("not ID mapped") } # # NA.zero <- function(data.file) { cf <- data.file[2:ncol(data.file)] zer0 <- which(cf==0, arr.ind = TRUE) cfNA <- as.matrix(cf) # makes "numbers into characters" unless Gene.Name col is removed cfNA <- replace (cfNA, zer0, NA) cf <- cbind(data.file[,1], data.frame(cfNA)) names(cf)[1] <- "Gene.Name" return(cf) } # But for correlations, need a matrix cor.NA <- function(cytoscape.file) { cf <- cytoscape.file[2:ncol(cytoscape.file)] zer0 <- which(cf==0, arr.ind = TRUE) cfNA <- as.matrix(cf) # makes "numbers into characters" unless Gene.Name col is removed cfNA <- replace (cfNA, zer0, NA) cf.cor <- cor(cfNA, use = "pairwise.complete.obs", method = "pearson") # cr.cor <- cor(cfNA, use = "na.or.complete", method = "pearson") # need to understand difference here return(cf.cor) } # zero.NA <- function(cytoscape.file) { cf <- cytoscape.file[2:ncol(cytoscape.file)] cf[is.na(cf)] <- 0 cf <- cbind(cytoscape.file$First.Gene.Name, cf) names(cf)[1] <- "First.Gene.Name" return(cf) } # prep.tbl <- function (cytoscape.file) { # Prune data file for distance matrix / omit Total print(names(cytoscape.file)) cat("\n","\t", "Set data columns for correlation calucations. ","\n","\t","First column number? ") firstcolnumber <- as.numeric(readLines(con = stdin(), n = 1)) cat("\n","\t","Last column number? ") lastcolnumber <- as.numeric(readLines(con = stdin(), n = 1)) datacols=c(firstcolnumber:lastcolnumber) datacol.names <- names(cytoscape.file[datacols]) cat("\t", "data columns", "\n") print(datacol.names) cydata <-cytoscape.file[, c(datacols)] cytbl <- NA.zero(cydata) tbl=data.matrix(cytbl) rownames (tbl) = cytoscape.file$First.Gene.Name return(tbl) } dist.Euclid <- function (tbl) { # Compute Distance Matrix dm = as.matrix (dist (tbl), method = "euclidean") # default method # set NA to two orders of magnitude higher than max distance dmz <- dm dmz[is.na(dmz)] <- 100*max(dmz, na.rm=T) dmz.ord <- cmdscale(dmz, k=3) dev.new() plot(dmz.ord, main="Euclidian Distance", xlab="") return(dmz.ord) } # dist.Pearson <- function (tbl) { # Use 1 - abs(Pearson Correlation) as distance cf.cor <- cor(t(tbl), use = "pairwise.complete.obs", method = "pearson") dissimilarity <- 1 - abs(cf.cor) distance <- as.dist(dissimilarity) dpz <- distance # set NA to two orders of magnitude higher than max distance dpz[is.na(dpz)] <- 100*max(dpz, na.rm=T) dpz.ord <- cmdscale(dpz, k=3) dev.new() plot(dpz.ord, main="Dissimilarity = 1 - Abs(Pearson Correlation)", xlab="") return(dpz.ord) } # dist.Spearman <- function (tbl) { # Use 1 - abs(Spearman Correlation) as distance cf.cor <- cor(t(tbl), use = "pairwise.complete.obs", method = "spearman") dissimilarity <- 1 - abs(cf.cor) distance <- as.dist(dissimilarity) dpz <- distance # set NA to two orders of magnitude higher than max distance dpz[is.na(dpz)] <- 100*max(dpz, na.rm=T) dpz.ord <- cmdscale(dpz, k=3) dev.new() plot(dpz.ord, main="Dissimilarity = 1 - Abs(Spearman Correlation)", xlab="") return(dpz.ord) } # getpdbid <- function(pdbfile) { sel <- read.pdb(pdbfile) sel.df <- data.frame(sel$atom) resno <- sel.df$resno sel.id <- key[key$g.number %in% resno, ] return (sel.id$Gene.Name) } # nmissing <- function(x) sum(is.na(x)) filled <- function (x) {length(x) - nmissing(x)} max.na <- function(x) max(x, na.rm=TRUE) min.na <- function(x) min(x, na.rm=TRUE) sum.na <- function(x) sum(x, na.rm=TRUE) sd.na <- function(x) sd(x, na.rm=TRUE) # clust.eval <- function(clusterlist, tbl.sc) { evaluation <- data.frame(0) names(evaluation)[1] <- "Group" key <- data.frame(1:length(rownames(tbl.sc))) key$Gene.Name <- rownames(tbl.sc) for (i in 1:length(clusterlist)) { cat("Starting Group", i, "\n") evaluation[i,1] <- i # evaluation$no.genes[i] <- length(clusterlist[[i]]$Gene.Name) # at = data.frame(tbl.sc[key$Gene.Name %in% clusterlist[[i]]$Gene.Name, ]) # at <- at[-which(apply(at, 1, filled) == 0),] acol <- names(at[,which(numcolwise(filled)(at) != 0)]) evaluation$no.samples[i] <- length(acol) at <- at[, acol] evaluation$total.signal[i] <- sum(at, na.rm=TRUE) if ((length (acol) == 1) || (length(clusterlist[[i]]$Gene.Name) == 1)) { evaluation$culled.by.slope[i] <- length(clusterlist[[i]]$Gene.Name) evaluation$percent.NA[i] <- 0 evaluation$percent.singlesamplegenes[i] <- 100 evaluation$percent.singlegenesamples[i] <- 100 } else { evaluation$percent.NA[i] <- 100*(sum(numcolwise(nmissing)(at)) / (dim(at)[1]*dim(at)[2])) singlesamplegenes <- at[which(apply(at, 1, filled) == 1),] evaluation$percent.singlesamplegenes[i] <- 100*(nrow(singlesamplegenes) / dim(at)[1]) singlegenesamples <- sum(numcolwise(filled)(at) == 1) evaluation$percent.singlegenesamples[i] <- 100*(singlegenesamples/dim(at)[2]) cluster.mo <- at[order(-as.vector(colwise(sum.na)(data.frame(t(at))))), order(-as.vector(numcolwise(sum.na)(data.frame(at))))] slope <- apply(cluster.mo, 1, get.slope.a) badslope <- c(names(which(is.na(slope))), names(which(slope > 0))) evaluation$culled.by.slope[i] <- length(badslope) # cat("\n", length(badslope), "genes culled by slope", "\n") } } # Total signal scaled to percent NA = intensity cleargenes <- evaluation$no.genes - evaluation$culled.by.slope # may be 0 realsamples <- evaluation$no.samples - (evaluation$no.samples * evaluation$percent.singlegenesamples/100) # may be 0 intensity <- evaluation$total.signal - (evaluation$total.signal * evaluation$percent.NA/100) # calibrate intensity according to real samples and clear genes # - goal is to reward tightly focussed groups, but not too small or too big evaluation$Index <- intensity * (1 + realsamples) * (1 + cleargenes) / (1 + evaluation$percent.NA) eval.sort <- evaluation[order(-evaluation$Index, evaluation$percent.NA), c("Group", "no.genes", "culled.by.slope", "percent.singlesamplegenes","no.samples", "percent.singlegenesamples", "total.signal", "percent.NA", "Index" )] return(eval.sort) } # end clust.eval # singleclust.eval <- function(cluster.df, tbl.sc) { evaluation <- data.frame(0) names(evaluation)[1] <- "Group" key <- data.frame(1:length(rownames(tbl.sc))) key$Gene.Name <- rownames(tbl.sc) # for (i in 1:length(clusterlist)) { # cat("Starting Group", i, "\n") i=1 evaluation[i,1] <- i # evaluation$no.genes[i] <- length(cluster.df$Gene.Name) # at = data.frame(tbl.sc[key$Gene.Name %in% cluster.df$Gene.Name, ]) acol <- names(at[,which(numcolwise(filled)(at) != 0)]) evaluation$no.samples[i] <- length(acol) at <- at[, acol] evaluation$total.signal[i] <- sum(at, na.rm=TRUE) if ((length (acol) == 1) || (length(cluster.df$Gene.Name) == 1)) { evaluation$culled.by.slope[i] <- length(clusterlist[[i]]$Gene.Name) evaluation$percent.NA[i] <- 0 evaluation$percent.singlesamplegenes[i] <- 100 evaluation$percent.singlegenesamples[i] <- 100 } else { evaluation$percent.NA[i] <- 100*(sum(numcolwise(nmissing)(at)) / (dim(at)[1]*dim(at)[2])) singlesamplegenes <- at[which(apply(at, 1, filled) == 1),] evaluation$percent.singlesamplegenes[i] <- 100*(nrow(singlesamplegenes) / dim(at)[1]) singlegenesamples <- sum(numcolwise(filled)(at) == 1) evaluation$percent.singlegenesamples[i] <- 100*(singlegenesamples/dim(at)[2]) cluster.mo <- at[order(-as.vector(colwise(sum.na)(data.frame(t(at))))), order(-as.vector(numcolwise(sum.na)(data.frame(at))))] slope <- apply(cluster.mo, 1, get.slope.a) badslope <- c(names(which(is.na(slope))), names(which(slope > 0))) evaluation$culled.by.slope[i] <- length(badslope) # cat("\n", length(badslope), "genes culled by slope", "\n") } # Total signal scaled to percent NA = intensity cleargenes <- evaluation$no.genes - evaluation$culled.by.slope # may be 0 realsamples <- evaluation$no.samples - (evaluation$no.samples * evaluation$percent.singlegenesamples/100) # may be 0 intensity <- evaluation$total.signal - (evaluation$total.signal * evaluation$percent.NA/100) # calibrate intensity according to real samples and clear genes # - goal is to reward tightly focussed groups, but not too small or too big evaluation$Index <- intensity * (1 + realsamples) * (1 + cleargenes) / (1 + evaluation$percent.NA) eval.sort <- evaluation[order(-evaluation$Index, evaluation$percent.NA), c("Group", "no.genes", "culled.by.slope", "percent.singlesamplegenes","no.samples", "percent.singlegenesamples", "total.signal", "percent.NA", "Index" )] return(eval.sort) } # end singleclust.eval # clust.data <- function(clusterlist, tbl) { clust.list <- as.list(NULL) key <- data.frame(1:length(rownames(tbl))) key$Gene.Name <- rownames(tbl) for (i in 1:length(clusterlist)) { at = data.frame(tbl[key$Gene.Name %in% clusterlist[[i]]$Gene.Name, ]) acol <- names(at[,which(numcolwise(filled)(at) != 0)]) if(length(acol) == 1) { ats <- data.frame(cbind (rownames(at), as.numeric(at[, acol]))) names(ats) <- c("Gene.Name", acol) } if(length(acol) >= 2) { ats <- cbind(rownames(at), at[, acol]) names(ats)[1] <- "Gene.Name" } clust.list[[i]] <- ats } return (clust.list) } # clust.data.sc <- function(clusterlist) { clust.list <- as.list(NULL) for (i in 1:length(clusterlist)) { at = data.frame(tbl.sc[key$Gene.Name %in% clusterlist[[i]]$Gene.Name, ]) acol <- names(at[,which(numcolwise(filled)(at) != 0)]) if(length(acol) == 1) { ats <- data.frame(cbind (rownames(at), as.numeric(at[, acol]))) names(ats) <- c("Gene.Name", acol) } if(length(acol) >= 2) { ats <- cbind(rownames(at), at[, acol]) names(ats)[1] <- "Gene.Name" } clust.list[[i]] <- ats } return (clust.list) } # clust.data.from.vector <- function(vector, tbl) { key <- data.frame(1:length(rownames(tbl))) key$Gene.Name <- rownames(tbl) at = data.frame(tbl[key$Gene.Name %in% vector, ]) acol <- names(at[,which(numcolwise(filled)(at) != 0)]) if(length(acol) == 1) { ats <- data.frame(cbind (rownames(at), as.numeric(at[, acol]))) names(ats) <- c("Gene.Name", acol) } if(length(acol) >= 2) { ats <- cbind(rownames(at), at[, acol]) names(ats)[1] <- "Gene.Name" } clust.data <- ats return (clust.data) } # end clust.data.from.vector # # ## This function takes the argument of a cluster data index, e.g., span3.clusters[26], from clust.data # Graphs heat map and returns ordered data frame for saving in file # requires library ("gplots") graph.cluster <- function(cluster.data.index) { cluster.df <- data.frame(cluster.data.index) if (ncol(cluster.df) <= 2) { cat ("\n","This is a single sample cluster!", "\n") return (cluster.df) } else { cluster.m <- data.matrix(cluster.df[,2:ncol(cluster.df)]) rownames(cluster.m) <- cluster.df$Gene.Name cluster.mo <- cluster.m[order(-as.vector(colwise(sum.na)(data.frame(t(cluster.m))))), order(-as.vector(numcolwise(sum.na)(data.frame(cluster.m))))] cluster.dftemp <- data.frame(cluster.mo) Gene.Name <- as.vector(rownames(cluster.mo)) cluster.dfo <- base::cbind(Gene.Name, cluster.dftemp) cluster.mo[is.na(cluster.mo)] <- 0 rbyheatcolors <- colorRampPalette(colors=c('#3333FF', '#FFFF00'), bias=0.25, space="Lab", interpolate = "linear") # royal blue to yellow palette(c("black", rbyheatcolors(max(cluster.mo)/min(cluster.m, na.rm=TRUE)))) dev.new() # dev.new(width=2+0.1*ncol(cluster.mo), height=2.5+0.125*nrow(cluster.mo)) # can resize manually heatmap.2(cluster.mo, dendrogram="none", trace="none", labRow=cluster.dfo$Gene.Name, labCol=colnames(cluster.mo), Rowv=NA, Colv=NA, hclustfun=NULL, scale="none", col=palette(), colsep=NULL, rowsep=NULL, sepwidth=c(0,0), revC=FALSE, keysize=1.8, density.info='histogram', denscol="green", main="Cluster Heatmap", margins=c(5.2, 6)) return (cluster.dfo) } } # end graph.cluster # make.clusterlist <- function(tsnedata, toolong, tbl.sc) { tsne.span2 <- spantree(dist(tsnedata), toolong=toolong) tsnedata.disc2 <- distconnected(dist(tsnedata), toolong = toolong, trace = TRUE) # test cat ("threshold dissimilarity", toolong, "\n", max(tsnedata.disc2), " groups","\n") ordiplot(tsnedata) lines(tsne.span2, tsnedata) ordihull(tsnedata, tsnedata.disc2, col="red", lwd=2) # Find groups tsnedata.span2.df <- data.frame(rownames(tbl.sc)) names(tsnedata.span2.df) <- "Gene.Name" tsnedata.span2.df$group <- tsnedata.disc2 tsnedata.span2.list <- dlply(tsnedata.span2.df, .(group)) # GROUP LIST ! tsnedata.span2.df [which(tsnedata.span2.df$Gene.Name=="ALK"),] return(tsnedata.span2.list) } # end make.clusterlist # # for use with data frame row get.slope<- function(cluster.row) { linmod <- lm(1:(ncol(cluster.row)-1)~as.numeric(cluster.row[1,2:ncol(cluster.row)])) slope <- coefficients(linmod)[2] return(slope) } # # different version for sapply get.slope.a <- function(cluster.row) { lrow <- length(cluster.row) if(filled(cluster.row)==0) {slope = 1000} else { linmod <- lm(1:lrow~as.numeric(cluster.row)) slope <- coefficients(linmod)[2] } return(slope) } # # These are a group of 'cull' functions # for more information see: ?subset; RSiteSearch("outliers") # chisq.test requires vectors of the same length, so use slope # Note: use scaled data for best results # cullbyslope <- function(cluster.data.index) { cluster.trim=NULL; cluster.df=NULL; cluster.dftemp=NULL; cluster.mo=NULL cluster.df <- data.frame(cluster.data.index) if (ncol(cluster.df) <= 2) { cat ("\n","This is a single sample cluster!", "\n") return (cluster.df) } else { cluster.m <- data.matrix(cluster.df[,2:ncol(cluster.df)]) rownames(cluster.m) <- cluster.df$Gene.Name cluster.mo <- cluster.m[order(-as.vector(colwise(sum.na)(data.frame(t(cluster.m))))), order(-as.vector(numcolwise(sum.na)(data.frame(cluster.m))))] slope <- apply(cluster.mo, 1, get.slope.a) badslope <- c(names(which(is.na(slope))), names(which(slope > 0))) cluster.trim <- cluster.mo[!(rownames(cluster.mo) %in% badslope),] cluster.trim <- cluster.trim[, which(colSums(cluster.trim, na.rm=TRUE) != 0)] cluster.dftemp <- data.frame(cluster.trim) Gene.Name <- as.vector(rownames(cluster.trim)) cluster.trimdf <- base::cbind(Gene.Name, cluster.dftemp) return(cluster.trimdf) } } # end clullbyslope # This makes sense # cullbottom <- function(cluster.data.index) { cluster.trim=NULL; cluster.df=NULL; cluster.dftemp=NULL; cluster.mo=NULL cluster.df <- data.frame(cluster.data.index) if (ncol(cluster.df) <= 2) { cat ("\n","This is a single sample cluster!", "\n") return (cluster.df) } else { cluster.m <- data.matrix(cluster.df[,2:ncol(cluster.df)]) rownames(cluster.m) <- cluster.df$Gene.Name cluster.mo <- cluster.m[order(-as.vector(colwise(sum.na)(data.frame(t(cluster.m))))), order(-as.vector(numcolwise(sum.na)(data.frame(cluster.m))))] genemeans <- rowMeans(cluster.mo, na.rm=TRUE) genesums <- rowSums(cluster.mo, na.rm=TRUE) meancull <- names(genemeans[which(genemeans < (mean(genemeans) - sd(genemeans)))]) sumcull <- names(genesums[which(genesums < (mean(genesums) - sd(genesums)))]) cluster.trim <- cluster.mo[!(rownames(cluster.mo) %in% unique(c(meancull, sumcull))),] cluster.trim <- cluster.trim[, which(colSums(cluster.trim, na.rm=TRUE) != 0)] cluster.dftemp <- data.frame(cluster.trim) Gene.Name <- as.vector(rownames(cluster.trim)) cluster.trimdf <- base::cbind(Gene.Name, cluster.dftemp) return(cluster.trimdf) } } cullhalf <- function(cluster.data.index) { cluster.trim=NULL; cluster.df=NULL; cluster.dftemp=NULL; cluster.mo=NULL cluster.df <- data.frame(cluster.data.index) if (ncol(cluster.df) <= 2) { cat ("\n","This is a single sample cluster!", "\n") return (cluster.df) } else { cluster.m <- data.matrix(cluster.df[,2:ncol(cluster.df)]) rownames(cluster.m) <- cluster.df$Gene.Name cluster.mo <- cluster.m[order(-as.vector(colwise(sum.na)(data.frame(t(cluster.m))))), order(-as.vector(numcolwise(sum.na)(data.frame(cluster.m))))] genemeans <- rowMeans(cluster.mo, na.rm=TRUE) genesums <- rowSums(cluster.mo, na.rm=TRUE) meancull <- names(genemeans[which(genemeans < mean(genemeans))]) sumcull <- names(genesums[which(genesums < mean(genesums))]) cluster.trim <- cluster.mo[!(rownames(cluster.mo) %in% unique(c(meancull, sumcull))),] cluster.trim <- cluster.trim[, which(colSums(cluster.trim, na.rm=TRUE) != 0)] cluster.dftemp <- data.frame(cluster.trim) Gene.Name <- as.vector(rownames(cluster.trim)) cluster.trimdf <- base::cbind(Gene.Name, cluster.dftemp) return(cluster.trimdf) } } # Hypothesis-driven cull: cull by presence of samples containing a particular gene # cullbygene <- function(cluster.data.index, gene.name) { cluster.trim1=NULL; cluster.df=NULL; cluster.dftemp=NULL; cluster.mo=NULL cluster.df <- data.frame(cluster.data.index) if (ncol(cluster.df) <= 2) { cat ("\n","This is a single sample cluster!", "\n") return (cluster.df) } else { cluster.m <- data.matrix(cluster.df[,2:ncol(cluster.df)]) rownames(cluster.m) <- cluster.df$Gene.Name cluster.mo <- cluster.m[order(-as.vector(colwise(sum.na)(data.frame(t(cluster.m))))), order(-as.vector(numcolwise(sum.na)(data.frame(cluster.m))))] cluster.trim1 <- cluster.mo[, !is.na(cluster.mo[gene.name, ])] cluster.trim2 <- cluster.trim1[which(rowSums(cluster.trim1, na.rm=TRUE) != 0),] cluster.dftemp <- data.frame(cluster.trim2) Gene.Name <- as.vector(rownames(cluster.trim2)) cluster.trimdf <- base::cbind(Gene.Name, cluster.dftemp) return(cluster.trimdf) } } # # This version cuts singlegenesampes too cullssgenes <- function(cluster.data.index) { cluster.trim=NULL; cluster.df=NULL; cluster.dftemp=NULL; cluster.mo=NULL cluster.df <- data.frame(cluster.data.index) if (ncol(cluster.df) <= 2) { cat ("\n","This is a single sample cluster!", "\n") return (cluster.df) } else { cluster.m <- data.matrix(cluster.df[,2:ncol(cluster.df)]) rownames(cluster.m) <- cluster.df$Gene.Name cluster.mo <- cluster.m[order(-as.vector(colwise(sum.na)(data.frame(t(cluster.m))))), order(-as.vector(numcolwise(sum.na)(data.frame(cluster.m))))] singlesamplerows <- which(apply(cluster.mo, 1, filled) == 1) singlegenesamples <- which(apply(t(cluster.mo), 1, filled) == 1) cluster.trim <- cluster.mo[-singlesamplerows, -singlegenesamples] cluster.dftemp <- data.frame(cluster.trim) Gene.Name <- as.vector(rownames(cluster.trim)) cluster.trimdf <- base::cbind(Gene.Name, cluster.dftemp) return(cluster.trimdf) } } # # clust.common <- function(clust.list1, clust.list2) { tt.dd <- matrix(data = NA, nrow = length(clust.list1), ncol = length(clust.list2), byrow = FALSE, dimnames = NULL) for (i in 1:length(clust.list1)) { for (j in 1:length(clust.list2)) { a <- clust.list1[[i]]$Gene.Name b <- clust.list2[[j]]$Gene.Name common <- intersect (a, b) if (length (common) >= 1 ) { tt.dd[i,j] <- length (common) cat(i, ",", j, "\t", common, "\n") if(any(fungenes %in% common)) print (" ^-------NOTE!") } } } return(tt.dd) } # use: t.d <- clust.common(clust.list1, clust.list2) # combine.clusters<- function (clust.data.index.1, clust.data.index.2) { cdf1 <- data.frame(clust.data.index.1) cdf2 <- data.frame(clust.data.index.2) combind <- plyr::join(cdf1, cdf2, by='Gene.Name', type="full") return(combind) } # combine.clusters.sc <- function (clust.data.sc.index.1, clust.data.sc.index.2) { cdf1 <- data.frame(clust.data.sc.index.1) cdf2 <- data.frame(clust.data.sc.index.2) combind1 <- plyr::join(cdf1, cdf2, by='Gene.Name', type="full") combind2 <- cullbyslope(combind1) return(combind2) } # findgene <- function(gene, clusterlist) { cl.df <- ldply(clusterlist) return(cl.df[grepl(gene, cl.df$Gene.Name),]) } # addtolist <- function(clusterlist, newgenevector, newname) { cl.df <- ldply(clusterlist) cl.df <- cl.df[,-1] new.df <- data.frame(newgenevector) names(new.df) <- "Gene.Name" new.df$group <- newname cl.df2 <- rbind(cl.df, new.df) cl2.list <- dlply(cl.df2, .(group)) return(cl2.list) } ######################################################################################## # Functions for RCytoscape plots # To UpDATE # source("http://www.bioconductor.org/biocLite.R") # biocLite("RCytoscape") # ## library(RCytoscape) cy = CytoscapeConnection () # # Function to load the primary cytoscape file # # cytofilename <- readLines(con = stdin(), n = 1) # Test: cytofilename="VignetteNodesCy.txt" # cytoscape.file <- read.table(cytofilename, header=TRUE, sep = "\t", na.strings='', fill=TRUE) # # Function to plot the primary cytoscape file # syntax: > mydata<-plot.cy(cytoscape.file) # plot.cy <-function (cytoscape.file) { mydata <- new("graphNEL", edgemode='directed', nodes=as.character(cytoscape.file[, 1])) # Set up and load all the node attributes for (i in 2:ncol(cytoscape.file)) { mydata <- initNodeAttribute (graph=mydata, attribute.name=names(cytoscape.file[i]), attribute.type='numeric', default.value=0.0) nodeData (mydata, n=as.character(cytoscape.file[, 1]), attr=names(cytoscape.file[i])) <- as.numeric(cytoscape.file[,i]) } # load standard edge attributes mydata <- initEdgeAttribute (graph= mydata, attribute.name='edgeType', attribute.type='char', default.value='undefined') mydata <- initEdgeAttribute(mydata, attribute.name = "Weight", attribute.type = "numeric", default.value = 0.0) return(mydata) } # # load.an.edge <- function (Cy_Wind, Network.File) { filename=NULL a = as.character(Network.File$Gene.1) b = as.character(Network.File $Gene.2) nodenames <- unique(c(a,b)) edgeTypes <- levels(as.factor(Network.File$edgeType)) subgraph2 <- new("graphNEL", nodes= nodenames, edgemode='undirected') subgraph2 <- initEdgeAttribute (graph= subgraph2, attribute.name='edgeType', attribute.type='char', default.value='undefined') subgraph2 <- initEdgeAttribute(subgraph2, attribute.name = "Weight", attribute.type = "numeric", default.value = 0.0) subgraph2 = addEdge (as.vector(Network.File$Gene.1, mode="character"), as.vector(Network.File$Gene.2, mode="character"), subgraph2) edgeData (subgraph2, as.vector(Network.File$Gene.1, mode="character"), as.vector(Network.File$Gene.2, mode="character"), attr='edgeType') <- as.character(Network.File$edgeType) edgeData (subgraph2, as.vector(Network.File$Gene.1, mode="character"), as.vector(Network.File$Gene.2, mode="character"), attr='Weight') <- Network.File$Weight addGraphToGraph (Cy_Wind, subgraph2) redraw(Cy_Wind) showGraphicsDetails(Cy_Wind, TRUE) } ### # loadedges <- function (Cy_Wind, Network.File) { filename=NULL a = as.character(Network.File$Gene.1) b = as.character(Network.File $Gene.2) nodenames <- unique(c(a,b)) edgeTypes <- levels(as.factor(Network.File$edgeType)) sepedges <- dlply(Network.File, .(edgeType)) subgraph2 <- new("graphNEL", nodes= nodenames, edgemode='undirected') subgraph2 <- initEdgeAttribute (graph= subgraph2, attribute.name='edgeType', attribute.type='char', default.value='undefined') subgraph2 <- initEdgeAttribute(subgraph2, attribute.name = "Weight", attribute.type = "numeric", default.value = 0.0) # for(i in 1:length(sepedges)) { filename[i] <- paste(edgeTypes[i], noquote("edges"), sep=" ", collapse=NULL) edgefile <- data.frame(sepedges[i]) names(edgefile) <- names(Network.File) subgraph2 = addEdge (as.vector(edgefile$Gene.1, mode="character"), as.vector(edgefile$Gene.2, mode="character"), subgraph2) edgeData (subgraph2, as.vector(edgefile$Gene.1, mode="character"), as.vector(edgefile$Gene.2, mode="character"), attr='edgeType') <- as.character(edgefile$edgeType) edgeData (subgraph2, as.vector(edgefile$Gene.1, mode="character"), as.vector(edgefile$Gene.2, mode="character"), attr='Weight') <-edgefile$Weight addGraphToGraph (Cy_Wind, subgraph2) redraw(Cy_Wind) cat("\n", filename[i], "\n") } cat("\n", filename) showGraphicsDetails(Cy_Wind, TRUE) } # # plot.Ratio <- function (plotcolnumber) { cydata <- new.CytoscapeWindow (names(cytoscape.file[plotcolnumber]), graph=mydata) setVisualStyle (cydata, "default") node.sizes = c (135, 130, 108, 75, 35, 75, 108, 130, 135) setDefaultNodeShape(cydata, "ellipse") setDefaultNodeColor (cydata, '#C9C9C9') # gray 79 setDefaultNodeSize (cydata, 30) # for grey non-data nodes setDefaultNodeFontSize(cydata, 18) setDefaultNodeBorderWidth (cydata, 1.8) setDefaultNodeBorderColor (cydata, '#888888') # gray setDefaultEdgeLineWidth (cydata, 3) setDefaultEdgeColor (cydata, '#FFFFFF') # white # # RATIO is plotted # Blue is negative: Yellow positive, Green in middle # Note - can plot ratio or ratio.N (normalized) # size.control.points = c (-50.0, -15.0, -5.0, 0.0, 5.0, 15.0, 50.0) color.control.points = c (-50.0, -10.0, -5.0, -2.25, 0.0, 2.25, 5.0, 10.0, 50.0) ratio.colors = c ('0099FF', '#007FFF','#00BFFF', '#00CCFF', '#00FFFF', '#00EE00', '#FFFF7E', '#FFFF00', '#FFE600', '#FFD700', '#FFCC00') displayGraph (cydata) layoutNetwork (cydata, 'grid') setNodeColorRule (cydata, names(cytoscape.file[plotcolnumber]), color.control.points, ratio.colors, mode='interpolate') setNodeSizeRule (cydata, names(cytoscape.file[plotcolnumber]), size.control.points, node.sizes, mode='interpolate') setDefaultNodeSelectionColor (cydata, "#CC00FF") new.style.name = 'ratio.heat.map' return(new.style.name) return(cydata) } # plot.Intensity <- function (plotcolnumber) { cydata <- new.CytoscapeWindow (names(cytoscape.file[plotcolnumber]), graph=mydata) setVisualStyle (cydata, "default") node.sizes = c (135, 130, 108, 75, 35, 75, 108, 130, 135) setDefaultNodeShape(cydata, "ellipse") setDefaultNodeColor (cydata, '#C9C9C9') # gray 79 setDefaultNodeSize (cydata, 30) # for grey non-data nodes setDefaultNodeFontSize(cydata, 18) setDefaultNodeBorderWidth (cydata, 1.8) setDefaultNodeBorderColor (cydata, '#888888') # gray setDefaultEdgeLineWidth (cydata, 3) setDefaultEdgeColor (cydata, '#888888') # gray # # INTENSITY is plotted # For intensity, white in middle # and fraction of max value is biggest # Intensity.Values <- cytoscape.file[plotcolnumber] # set to intensity or normalized intensity maxint <- max(Intensity.Values) minint <- min(Intensity.Values) icolors <- c('0099FF', '#007FFF','#00BFFF', '#00CCFF', '00FFFF', '#FFFFFF', '#FFFF7E', '#FFFF00', 'FFE600', 'FFD700', 'FFCC00') displayGraph (cydata) layoutNetwork (cydata, 'grid') # Some rules to set the color and size depending on the values of intensity if (maxint>abs(minint)) { setNodeColorRule (cydata, names(cytoscape.file[plotcolnumber]), c (-(maxint+1), -(maxint/5), -(maxint/10), -(maxint*0.045), 0.0, (maxint*0.045), (maxint/10), (maxint/5), (maxint+1)), icolors, mode='interpolate') icontrol.points = c (-(maxint+1), -(maxint*0.3), -(maxint/10), 0.0, (maxint/10), (maxint*0.3), (maxint+1)) } if (maxint x namewindow <- paste(x[1], noquote(' String combined score network'), sep="", collapse=NULL) strw <<- new.CytoscapeWindow (namewindow, stgraph) displayGraph (strw) layoutNetwork (strw, 'grid') setDefaultEdgeLineWidth (strw, 3) setDefaultBackgroundColor (strw, 'FFFFFF') # white setDefaultEdgeColor (strw, '#000000') # black setDefaultEdgeSelectionColor(strw, "#88FF00") setDefaultNodeSelectionColor(strw, "#FF3388") edgecolors = c('#008000', '#FF0000', '#0000FF', '#000000') # green; red; blue; (black not used) edgeTypes <- c( 'homology', 'knowledge', 'experimental', 'combined_score') # = String interaction types myarrows <- c ('No Arrow', 'No Arrow', 'No Arrow', 'No Arrow') #setEdgeTargetArrowRule(strw, 'edgeType', edgeTypes, myarrows, default='No Arrow') #setEdgeTargetArrowColorRule(strw, "edgeType", edgeTypes, edgecolors, default.color='#000000') #setEdgeSourceArrowColorRule(strw, "edgeType", edgeTypes, myarrows, default.color='#000000') setEdgeColorRule(strw, 'edgeType', colors=edgecolors, control.points=edgeTypes, mode='lookup', default.color='#888888') # minWeight <- min(stn$combined_score) maxWeight <- max(stn$combined_score) edge.attribute.values <- as.vector(stn$combined_score) weight.scale=scale(edge.attribute.values, scale=T, center=T) weight.scale=as.vector(weight.scale) minScale <- min(weight.scale, na.rm=T) maxScale <- max(weight.scale, na.rm=T) scalefactor = maxScale + abs(minScale) line.widths <- 1.8*scalefactor*scale(weight.scale + (abs(minScale) + 0.25), scale=(maxScale + scalefactor), center=FALSE) line.widths <- as.vector(line.widths) setEdgeLineWidthRule (strw, 'combined_score', as.character(edge.attribute.values), as.character(line.widths)) setLayoutProperties (strw, "kamada-kawai-noweight", list (edge_attribute='combined_score', weight_type=3, min_weight=minWeight, max_weight=maxWeight, layout_passes=50, iterations_pernode=50, distance_strength=8008, rest_length=60, disconnected_strength=0.01, disconnected_rest_length=500, anticollisionStrength=10)) displayGraph (strw) RCytoscape::layoutNetwork (strw, "kamada-kawai-noweight") # StringCS.style.name <- paste(namewindow, noquote('style'), sep=" ", collapse=NULL) copyVisualStyle (strw, 'default', StringCS.style.name) setVisualStyle (strw, StringCS.style.name) } # nodeDprops <- function (windowname) { node.sizes = c (135, 130, 108, 75, 35, 75, 108, 130, 135) setDefaultBackgroundColor (windowname, 'FFFFFF') # white setDefaultNodeShape(windowname, "ellipse") setDefaultNodeColor (windowname, '#C9C9C9') # gray 79 setDefaultNodeSize (windowname, 30) # for grey non-data nodes setDefaultNodeFontSize(windowname, 18) setDefaultNodeBorderWidth (windowname, 1.8) setDefaultNodeBorderColor (windowname, '#888888') # gray setDefaultNodeSelectionColor(windowname, "#FF3388") # setNodeLabelRule(windowname, node.attribute.name) floatPanel (cy, 'Data Panel') } # edgeDprops <- function(windowname) { setDefaultEdgeLineWidth (windowname, 3) setDefaultEdgeColor (windowname, '#888888') # gray setDefaultEdgeSelectionColor(windowname, "#FF6600") edgecolors = c('#FF0000', '#33FFCC', '#008000', '#EE00EE', '#0000FF', '#CC00FF', '#008000', '#006666', '#000000', '#888888', '#00C78C') # red; turquois; green; magenta; blue; violet; green; bluegreen; black; gray; turquoiseblue edgeTypes <- c("pp", 'homology', 'knowledge', 'experimental', 'combined_score', "Physical interactions","Pathway", "Predicted", "Genetic interactions", "merged" , "Shared protein domains") myarrows <- c ('Arrow', 'No Arrow', 'No Arrow', 'No Arrow', 'No Arrow', 'No Arrow', 'No Arrow', 'No Arrow', 'No Arrow', 'No Arrow', 'No Arrow') setEdgeTargetArrowRule(windowname, 'edgeType', edgeTypes, myarrows, default='No Arrow') setEdgeTargetArrowColorRule(windowname, "edgeType", edgeTypes, edgecolors, default.color='#FF0000') setEdgeSourceArrowColorRule(windowname, "edgeType", edgeTypes, myarrows, default.color='#FF0000') setEdgeColorRule(windowname, 'edgeType', edgeTypes, edgecolors, mode='lookup', default.color='#888888') # } # intensityprops <- function (windowname, cytoscape.file) { setVisualStyle (windowname, "default") print (getNodeAttributeNames (windowname)) cat("\n","\n","\t", "Which attribute will set node size and color?") plotcol <- as.character(readLines(con = stdin(), n = 1)) node.sizes = c (135, 130, 108, 75, 35, 75, 108, 130, 135) Intensity.Values <- cytoscape.file[, plotcol] # set to intensity or normalized intensity maxint <- max(Intensity.Values, na.rm=TRUE) minint <- min(Intensity.Values, na.rm=TRUE) icolors <- c('0099FF', '#007FFF','#00BFFF', '#00CCFF', '00FFFF', '#FFFFFF', '#FFFF7E', '#FFFF00', 'FFE600', 'FFD700', 'FFCC00') displayGraph (windowname) # Some rules to set the color and size depending on the values of intensity if (maxint>abs(minint)) { setNodeColorRule (windowname, names(cytoscape.file[plotcol]), c (-(maxint+1), -(maxint/5), -(maxint/10), -(maxint*0.045), 0.0, (maxint*0.045), (maxint/10), (maxint/5), (maxint+1)), icolors, mode='interpolate') icontrol.points = c (-(maxint+1), -(maxint*0.3), -(maxint/10), 0.0, (maxint/10), (maxint*0.3), (maxint+1)) } if (maxint x childname <- paste(x[1], noquote("_PSP Child.."), j, sep="", collapse=NULL) } pspW <- createWindowFromSelection (cywind, childname, TRUE) layoutNetwork (pspW, "kamada-kawai") redraw (pspW) deleteWindow(cy, window.title="PSP kinase-substrate") # if done with this now ########################### keep this as one function for now # get.PSP.child < function (pspsub) { # Get PSP Child network cat("Cytoscape Windows", "\n") getWindowList(cy) pspsub <- existing.CytoscapeWindow(childname , copy=TRUE) # 18 nodes 22 edges pspG <- pspsub@graph slotNames(pspG) eda.names(pspG) psp.edges <- eda(pspG, 'interaction') psp.df <-data.frame(cbind(names(psp.edges), as.vector(psp.edges))) names(psp.df)[1:2] <- c("Edge", "edgeType") setDefaultBackgroundColor (cy, 'FFFFFF') # Now need to split the edge GENE1 | GENE2 format into two columns like GM and String # "|" is a special character that causes problems # p0 <-sapply(psp.df$Edge, stripbar) p1 <-sapply(p0, get.gene.name) p2 <-sapply(p0, get.gene.name.2) psp.df$Gene.1 <-as.vector(p1) psp.df$Gene.2 <-as.vector(p2) # give pp a quanitative value psp.df$Weight <- 0.25 #### an arbitrary value psp.df <- psp.df[,c(3,4,5,2)] return (psp.df) } # sub.network.1 <- function (cy.window, selection) { cat("Cytoscape Windows", "\n") print ( getWindowList(cy)) cat ("\n", "\t", "Which Cytoscape Window contains the selected nodes?", "\n", "\t") cy.window <- readLines(con = stdin(), n = 1) w2 <- existing.CytoscapeWindow (cy.window, copy.graph.from.cytoscape.to.R=TRUE) selection <- getSelectedNodes(w2) coords <- getNodePosition(w2, selection) new.window.title <- paste(cy.window, noquote("Sub-Network"), sep="_", collapse=NULL) c2 <- createWindowFromSelection (w2, new.window.title, TRUE) g2 <- getGraph (c2) x = as.integer (sapply (coords, function (node.loc) return (node.loc$x))) y = as.integer (sapply (coords, function (node.loc) return (node.loc$y))) setNodePosition (c2, selection, x, y) fitContent (c2) showGraphicsDetails (c2, TRUE) } # sub.Network <- function(nodes) { cat("Cytoscape Windows", "\n", "_________________", "\n") cywinds <- getWindowList(cy) cat(cywinds, sep = "\n") cat("\n","\t", "Which Cytoscape window contains the selected nodes? ","\n","\t") cywindname <- as.character(readLines(con = stdin(), n = 1)) w2 <- existing.CytoscapeWindow (cywindname, copy.graph.from.cytoscape.to.R=TRUE) unlist(strsplit(cywindname, "..", fixed = TRUE)) -> x # create an index so that this command can be run multiple times, creating a sub network graph with a new name each time j = 1 childname <- paste(x[1], noquote("_subnetwork..1"), sep="", collapse=NULL) while (childname %in% cywinds) { j <- j + 1 x[2] <- j childname <- paste(x[1], noquote("_subnetwork.."), j, sep="", collapse=NULL) next } # selection <- getSelectedNodes(w2) cat (selection, sep = "\n") cat("\n", "Copy and paste nodes above to retrieve networks from STRING and GeneMANIA as .txt files") cat("\n","\t", "Write a file of node names for website network retrieval?","\n", "(T or F)", "\t") writeresponse <- as.logical(readLines(con = stdin(), n = 1)) if (writeresponse == TRUE) { unlist(strsplit(childname, "_", fixed = TRUE)) -> z namefile <- paste(z[1], j, noquote("_subgenes.txt"), sep="", collapse=NULL) cat("\n", " Use this file:", "\n", "\t", namefile, "\n", "to retrieve networks from STRING and GeneMANIA as .txt files") write.table(sort(selection), file= namefile, sep="\t", eol = "\n", quote = FALSE, row.names=FALSE, col.names=FALSE) } seledges <- getSelectedEdges (w2) coords <- getNodePosition(w2, selection) subW <- createWindowFromSelection (w2, childname, return.graph=TRUE) g2 <- getGraph (subW) x1 = as.integer (sapply (coords, function (node.loc) return (node.loc$x))) y1 = as.integer (sapply (coords, function (node.loc) return (node.loc$y))) # set visual style to the last one created # NOTE: request getVisualStyle = the active style in that window #viz.styles <- getVisualStyleNames(w2) #setVisualStyle (subW, viz.styles[length(viz.styles)]) setNodePosition (subW, selection, x1, y1) fitContent (subW) showGraphicsDetails (subW, TRUE) # subW2 <- existing.CytoscapeWindow (childname, copy.graph.from.cytoscape.to.R=TRUE) return (subW2) } subclone.net <- function(nodes, CytoscapeWindowTitle) { CytoscapeWindow <- existing.CytoscapeWindow (CytoscapeWindowTitle, copy.graph.from.cytoscape.to.R=F) cywinds <- getWindowList(cy) unlist(strsplit(CytoscapeWindowTitle, "..", fixed = TRUE)) -> x # create an index so that this command can be run multiple times, creating a sub network graph with a new name each time j = 1 childname <- paste(x[1], noquote("_subnetwork..1"), sep="", collapse=NULL) while (childname %in% cywinds) { j <- j + 1 x[2] <- j childname <- paste(x[1], noquote("_subnetwork.."), j, sep="", collapse=NULL) next } selection <- as.character(nodes) cat (selection, sep = "\n") cat("\n", "in") cat("\n","\t", CytoscapeWindowTitle, "\n") selectNodes(CytoscapeWindow, selection, preserve.current.selection=FALSE) coords <- getNodePosition(CytoscapeWindow, selection) subW <- createWindowFromSelection (CytoscapeWindow, childname, return.graph=TRUE) g2 <- getGraph (subW) x1 = as.integer (sapply (coords, function (node.loc) return (node.loc$x))) y1 = as.integer (sapply (coords, function (node.loc) return (node.loc$y))) setNodePosition (subW, selection, x1, y1) fitContent (subW) showGraphicsDetails (subW, TRUE) subW2 <- existing.CytoscapeWindow (childname, copy.graph.from.cytoscape.to.R=TRUE) return (subW2) } # ## get.String.edges <- function (stnet) { # use stn <- get.String.edges(stringfile) # String file input and cleanup # - We want the same node IDs cat("Cytoscape Windows", "\n", "_________________", "\n") cywinds <- getWindowList(cy) cat(cywinds, sep = "\n") cat("\n","\t", "Which Cytoscape window contains nodes to plot String edges? ","\n","\t") cywindname <- as.character(readLines(con = stdin(), n = 1)) cw2 <- existing.CytoscapeWindow (cywindname, copy.graph.from.cytoscape.to.R=TRUE) winnodes <- getAllNodes(cw2) cat("\n","\n","\t", "STRING file name?","\n","\t") stringfilename <- readLines(con = stdin(), n = 1) # stnet <- read.table(stringfilename, header=TRUE, sep = "\t", comment.char = "", na.strings='', fill=TRUE) names(stnet)[1]="node1" # to edit out the 'X.' or "#" # fix up the gene names stnet$node1 <- sapply(stnet$node1, get.gene) stnet$node2 <- sapply(stnet$node2, get.gene) # must make these factors to use levels() # stnet$node1=as.factor(stnet$node1) # stnet$node2=as.factor(stnet$node2) #stnodes <- unique(c(levels(stnet[1,1]), levels(stnet[1,2]))) # try this without factors stnodes <- unique(c(as.character(stnet$node1), as.character(stnet$node2))) flub <- setdiff(stnodes, winnodes) # setdiff( winnodes, stnodes) is the non-string node set # if there is a String flub if (length(flub) >= 1) { cat("\n","\t", "The following String names do not match ", cywindname, " nodes:","\n","\t", flub) # Just get rid of the offending nodes if (any(stnet$node1 %in% flub)) stnet <- stnet[-which(stnet$node1 %in% flub), ] if (any(stnet$node2 %in% flub)) stnet <- stnet[-which(stnet$node2 %in% flub), ] } # Rearrange data file stn <- stnet[,c(1:2)] names(stn) <- c('Gene.1', 'Gene.2') stne <- stn stne$Weight <- stnet$experimental stne$edgeType <- "experimental" stnk <- stn stnk$Weight <- stnet$knowledge stnk$edgeType <- "knowledge" stnh <- stn stnh$Weight <- stnet$homology stnh$edgeType <- "homology" stncs <- stn stncs$Weight <- stnet$combined_score stncs$edgeType <- "combined_score" stn <- rbind (stne, stnk, stnh, stncs) nones <- which (stn$Weight==0) stn <- stn[-nones, ] return (stn) } # String.edge.width.props <- function (Cy_Wind, Network.File) { minWeight <- min(Network.File$Weight) maxWeight <- max(Network.File$Weight) edge.attribute.values <- as.vector(Network.File$Weight) weight.scale=scale(edge.attribute.values, scale=T, center=T) weight.scale=as.vector(weight.scale) minScale <- min(weight.scale, na.rm=T) maxScale <- max(weight.scale, na.rm=T) scalefactor = maxScale + abs(minScale) line.widths <- scalefactor*scale(weight.scale + (abs(minScale) + 0.25), scale=(maxScale + scalefactor), center=FALSE) line.widths <- as.vector(line.widths) setVisualStyle(Cy_Wind, "default") # don't want to do this but see below setEdgeLineWidthRule (Cy_Wind, 'Weight', as.character(edge.attribute.values), as.character(line.widths)) # this creates a problem: node size and color are lost; style is set back to 'default' viz.styles <- getVisualStyleNames(Cy_Wind) string.style.name <- paste(viz.styles[length(viz.styles)], noquote(" String"), sep="", collapse=NULL) # this should be the most recently created visual style displayGraph (Cy_Wind) fitContent (Cy_Wind) showGraphicsDetails(Cy_Wind, TRUE) copyVisualStyle (Cy_Wind, 'default', string.style.name) setVisualStyle (Cy_Wind, string.style.name) } # get.GM.edges <- function (gmnet) { # text file from http://www.genemania.org cat("Cytoscape Windows", "\n", "_________________", "\n") cywinds <- getWindowList(cy) cat(cywinds, sep = "\n") cat("\n","\t", "Which Cytoscape window contains nodes to plot GeneMANIA edges? ","\n","\t") cywindname <- as.character(readLines(con = stdin(), n = 1)) cw2 <- existing.CytoscapeWindow (cywindname, copy.graph.from.cytoscape.to.R=TRUE) winnodes <- getAllNodes(cw2) cat("\t", "GeneMANIA file name?","\n","\t") gmfilename <- readLines(con = stdin(), n = 1) # Test gmfilename <- "_tst_genemania_network.txt" gmnet <- read.table(gmfilename, header=TRUE, sep = "\t", comment.char = "#", na.strings='', fill=TRUE) names(gmnet)[1]="Gene.1" # to make sure it's correct gmnet$Gene.1 <- sapply(gmnet$Gene.1, get.gene) gmnet$Gene.2 <- sapply(gmnet$Gene.2, get.gene) # gmnet$Gene.1=as.factor(gmnet$Gene.1) # gmnet$Gene.2=as.factor(gmnet$Gene.2) a = as.character(gmnet$Gene.1) b = as.character(gmnet$Gene.2) gmnodes <- unique(c(a,b)) flub <- setdiff(gmnodes, winnodes) # setdiff( winnodes, stnodes) is the non-GM node set # if there is an ID flub if (length(flub) >= 1) { cat("\n","\t", "The following GM names do not match ", cywindname, " nodes:","\n","\t", flub) # Just get rid of the offending nodes if (any(gmnet$Gene.1 %in% flub)) gmnet <- gmnet[-which(gmnet$Gene.1 %in% flub), ] if (any(gmnet$Gene.2 %in% flub)) gmnet <- gmnet[-which(gmnet$Gene.2 %in% flub), ] } # Prune data file gmn <- gmnet[,c(1:4)] names(gmn)[4] <- "edgeType" return(gmn) } # # GM.edge.width.props <- function (Cy_Wind, Network.File) { minWeight <- min(Network.File$Weight) maxWeight <- max(Network.File$Weight) edge.attribute.values <- as.vector(Network.File$Weight) weight.scale=scale(edge.attribute.values, scale=T, center=T) weight.scale=as.vector(weight.scale) minScale <- min(weight.scale, na.rm=T) maxScale <- max(weight.scale, na.rm=T) scalefactor = maxScale + abs(minScale) line.widths <- 1.8*scalefactor*scale(weight.scale + (abs(minScale) + 0.25), scale=(maxScale + scalefactor), center=FALSE) line.widths <- as.vector(line.widths) setVisualStyle(Cy_Wind, "default") # don't want to do this but see below setEdgeLineWidthRule (Cy_Wind, 'Weight', as.character(edge.attribute.values), as.character(line.widths)) # this creates a problem: node size and color are lost; style is set back to 'default' viz.styles <- getVisualStyleNames(Cy_Wind) GM.style.name <- paste(viz.styles[length(viz.styles)], noquote("_GM"), sep="", collapse=NULL) # this should be the most recently created visual style displayGraph (Cy_Wind) fitContent (Cy_Wind) showGraphicsDetails(Cy_Wind, TRUE) copyVisualStyle (Cy_Wind, 'default', GM.style.name) setVisualStyle (Cy_Wind, GM.style.name) } # # EDGE.MERGE <- function(stn, gmn, PSP.df) { # # We want to sum the evidence for interactions, but not duplicate reversed node edges # st <- stn[stn$edgeType=="combined_score",] # this is all we reallly need for the merge stnodes <- unique(c(as.character(st$Gene.1), as.character(st$Gene.2))) gm <- gmn gmnodes <- unique(c(as.character(gm$Gene.1), as.character(gm$Gene.2))) sgnodes <- unique (c(stnodes, gmnodes)) psp <- PSP.df a = as.character(psp$Gene.1) b = as.character(psp$Gene.2) pnodes <- unique(c(a,b)) comnodes <- sgnodes[sgnodes %in% pnodes] extras <- pnodes[!(pnodes %in% sgnodes)] # = setdiff(pnodes, sgnodes) if (length(extras) >= 1) { # Get rid of the extra nodes psp <- psp[psp$Gene.1 %in% sgnodes, ] psp <- psp[psp$Gene.2 %in% sgnodes, ] # extra1 <- which(psp$Gene.1 %in% extras) # extra2 <- which(psp$Gene.2 %in% extras) #psp <- psp[-extra1,] #psp <- psp[-extra2,] } sgp.c <- rbind(stn, gmn, psp) # can plot this, but it's busy # sgpnodes <- unique(c(levels(sgp.c[1,1]), levels(sgp.c[1,2]))) # # String and GM may have edges in common that are reversed # Therefore, test string edges in reverse # This is easist to do in a single column # st$nodes.combined <- noquote(paste(st$Gene.1, st$Gene.2)) st$nodes.reversed <- noquote(paste(st$Gene.2, st$Gene.1)) gm$nodes.combined <- noquote(paste(gm$Gene.1, gm$Gene.2)) gm$nodes.reversed <- noquote(paste(gm$Gene.2, gm$Gene.1)) psp$nodes.combined <- noquote(paste(psp$Gene.1, psp$Gene.2)) # # Which edges in stn are the reverse of those in gmn? = stn[sg.reverse,] sg.reverse <- st$nodes.combined %in% gm$nodes.reversed # Reverse these # Note that the matrix conversion is reqired because of factor level collision st.fix <- noquote(as.matrix(st[sg.reverse, c(2,1)])) if (nrow(st.fix) >0) { names(st.fix) <- c("Gene.1", "Gene.2") stnt=noquote(as.matrix(st)) stnt[sg.reverse, c(1:2)] <- st.fix st[,c(1:2)]<-stnt[,c(1:2)] } # Revise the test columns st$nodes.combined <- noquote(paste(st$Gene.1, st$Gene.2)) st$nodes.reversed <- noquote(paste(st$Gene.2, st$Gene.1)) # # So, the merge should be : sg <- rbind(st, gm) # # now, PSP edges # kinases are directional, so we only want psp forward edges # # which psp edges match the reverse of sg? psf <- sg$nodes.reversed %in% psp$nodes.combined # a few do = sg[psf,] # Now reverse these sg.pspr <- sg[psf, c(2,1)] names(sg.pspr) <- c("Gene.1", "Gene.2") sg[psf, c(1:2)] <- sg.pspr sg.ord <- order(as.character(sg$Gene.1), as.character(sg$Gene.2)) sg <-sg[sg.ord,] # This sets all kinases as Gene.1 and substrates as Gene.2 # Now bind them all together sgp <- rbind(sg[,c(1:4)], psp[,c(1:4)]) sgp.ord <- order(as.character(sgp$Gene.1), as.character(sgp$Gene.2)) sgp <-sgp[sgp.ord,] # Now merge edges # sgp.merged <- ddply(sgp, .(Gene.1, Gene.2), numcolwise(sum), na.rm=TRUE) # just the gene names and numerical values # test sgp.c.merged <- ddply(sgp.c, .(Gene.1, Gene.2), numcolwise(sum), na.rm=TRUE) # sgp.c.merged is bigger, which means that edges are alligned in sgp sgp.merged$edgeType <- 'merged' # Remove auto-phosphorylation loops auto <- which (as.character(sgp.merged$Gene.1) == as.character(sgp.merged$Gene.2)) if (length(auto) > 0) { sgp.mt <- sgp.merged[-auto,] } else sgp.mt <- sgp.merged return (sgp.mt) } # # From Paul Shannon: selectNodesConnectedBySelectedEdges = function (cw) { selectedEdges = getSelectedEdges (cw) if (length (selectedEdges) == 1 && is.na (selectedEdges)) return () tokens = unlist (strsplit (selectedEdges, ' ')) node.names = intersect (tokens, nodes (cw@graph)) if (length (node.names) > 0) selectNodes (cw, node.names) } # get.adjacent.edge.names = function (cw, geneIDs) { all.edge.names = cy2.edge.names (cw@graph) all.edge.names.cyStyle = as.character (all.edge.names) indices.of.edges.with.geneIDs = c () for (geneID in geneIDs) { geneID.regex.nodeA = sprintf ('^%s ', geneID) geneID.regex.nodeB = sprintf (' %s$ ', geneID) indices.A = grep (geneID.regex.nodeA, all.edge.names.cyStyle) indices.B = grep (geneID.regex.nodeB, all.edge.names.cyStyle) indices.of.edges.with.geneIDs = c (indices.of.edges.with.geneIDs, indices.A, indices.B) } # for geneID return (as.character (all.edge.names) [indices.of.edges.with.geneIDs]) } # get.edge.names # # get.String.edgefile <- function (stringedgefile, nodenames) { nodenames <- as.character(nodenames) stnet <- read.table(stringedgefile, header=TRUE, sep = "\t", comment.char = "", na.strings='', fill=TRUE) names(stnet)[1]="node1" # to edit out the 'X.' or "#" # fix up the gene names stnet$node1 <- sapply(stnet$node1, get.gene) stnet$node2 <- sapply(stnet$node2, get.gene) stnodes <- unique(c(as.character(stnet$node1), as.character(stnet$node2))) flub <- setdiff(stnodes, nodenames) # setdiff( nodenames, stnodes) is the non-string node set # if there is a String flub if (length(flub) >= 1) { cat("\n","\t", "The following String names do not match ","\n","\t", flub) # Just get rid of the offending nodes if (any(stnet$node1 %in% flub)) stnet <- stnet[-which(stnet$node1 %in% flub), ] if (any(stnet$node2 %in% flub)) stnet <- stnet[-which(stnet$node2 %in% flub), ] } # Rearrange data file stn <- stnet[,c(1:2)] names(stn) <- c('Gene.1', 'Gene.2') stne <- stn stne$Weight <- stnet$experimental stne$edgeType <- "experimental" stnk <- stn stnk$Weight <- stnet$knowledge stnk$edgeType <- "knowledge" stnh <- stn stnh$Weight <- stnet$homology stnh$edgeType <- "homology" stncs <- stn stncs$Weight <- stnet$combined_score stncs$edgeType <- "combined_score" stn <- rbind (stne, stnk, stnh, stncs) nones <- which (stn$Weight==0) stn <- stn[-nones, ] return (stn) } get.GM.edgefile <- function (gmfilename, nodenames) { # text file from http://www.genemania.org nodenames <- as.character(nodenames) gmnet <- read.table(gmfilename, header=TRUE, sep = "\t", comment.char = "#", na.strings='', fill=TRUE) names(gmnet)[1]="Gene.1" # to make sure it's correct gmnet$Gene.1 <- sapply(gmnet$Gene.1, get.gene) gmnet$Gene.2 <- sapply(gmnet$Gene.2, get.gene) # gmnet$Gene.1=as.factor(gmnet$Gene.1) # gmnet$Gene.2=as.factor(gmnet$Gene.2) a = as.character(gmnet$Gene.1) b = as.character(gmnet$Gene.2) gmnodes <- unique(c(a,b)) flub <- setdiff(gmnodes, nodenames) # setdiff( nodenames, stnodes) is the non-GM node set # if there is an ID flub if (length(flub) >= 1) { cat("\n","\t", "The following GM names do not match ","\n","\t", flub) # Just get rid of the offending nodes if (any(gmnet$Gene.1 %in% flub)) gmnet <- gmnet[-which(gmnet$Gene.1 %in% flub), ] if (any(gmnet$Gene.2 %in% flub)) gmnet <- gmnet[-which(gmnet$Gene.2 %in% flub), ] } # Prune data file gmn <- gmnet[,c(1:4)] names(gmn)[4] <- "edgeType" return(gmn) } # combine.edges <- function(edgefile1, edgefile2) { # Two network files may have edges in common that are reversed # Therefore, test edges in reverse # This is easist to do in a single column edgefile1$nodes.combined <- noquote(paste(edgefile1$Gene.1, edgefile1$Gene.2)) edgefile1$nodes.reversed <- noquote(paste(edgefile1$Gene.2, edgefile1$Gene.1)) edgefile2$nodes.combined <- noquote(paste(edgefile2$Gene.1, edgefile2$Gene.2)) edgefile2$nodes.reversed <- noquote(paste(edgefile2$Gene.2, edgefile2$Gene.1)) # Which edges are the reverse? = stn[sg.reverse,] reversed <- edgefile2$nodes.combined %in% edgefile1$nodes.reversed # Reverse these # Note that the matrix conversion is reqired because of factor level collision fix2 <- as.matrix(noquote(as.matrix(edgefile2[reversed, c(2,1)]))) if (nrow(fix2) >0) { names(fix2) <- c("Gene.1", "Gene.2") fixededge2=noquote(as.matrix(edgefile2)) fixededge2[reversed, c(1:2)] <- fix2 edgefile2[,c(1:2)] <- fixededge2[,c(1:2)] } # Revise the test columns edgefile2$nodes.combined <- noquote(paste(edgefile2$Gene.1, edgefile2$Gene.2)) edgefile2$nodes.reversed <- noquote(paste(edgefile2$Gene.2, edgefile2$Gene.1)) # # merge : combined <- rbind(edgefile1, edgefile2) combined <- combined[order(combined$Weight, decreasing=TRUE),] combined <- combined[!duplicated(combined[,c(1,2,4)]), ] return(combined[,c(1:4)]) } # END