Analysis Sections

Viewing is better if Code is hidden (Top Right drop down list)

sink(file="RsessionInfoDESeq2.txt")
library('DESeq2')
library("ggplot2")
library(reshape2)
####library(tidyverse)
####library(splitstackshape)
####library(data.table)
library("RColorBrewer")
library("gplots")
####library('ggdendro')
library('ggrepel')
library("dplyr")
library("ComplexHeatmap")
library("clusterProfiler")
library(VennDiagram) ######
library(UpSetR)
library(gridExtra)
library(cluster)
library(circlize)
library(factoextra)
library(NbClust)
library("biomaRt")
library("org.Hs.eg.db")####human
library("org.Mm.eg.db")####mouse
library(venn)
####library(org.At.tair.db)####arabidopsis
sessionInfo()
sink()
#########################################
####multiplot
#########################################
#### Multiple plot function
####
#### Draws several ggplot objects on one page using a grid layout.
#### ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
#### - file:   accepted for compatibility; not used by this function
#### - cols:   Number of columns in layout
#### - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#### If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
#### then plot 1 will go in the upper left, 2 will go in the upper right, and
#### 3 will go all the way across the bottom.
#### With no plots at all the function returns invisible(NULL) without drawing.
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  #### Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  numPlots <- length(plots)

  #### Nothing to draw: 1:numPlots would otherwise iterate over c(1, 0)
  #### and fail on plots[[1]]
  if (numPlots == 0) {
    return(invisible(NULL))
  }

  #### If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    #### ncol: Number of columns of plots
    #### nrow: Number of rows needed, calculated from number of cols
    nRows <- ceiling(numPlots / cols)
    layout <- matrix(seq(1, cols * nRows), ncol = cols, nrow = nRows)
  }

  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    #### Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    #### Make each plot, in the correct location
    for (i in seq_len(numPlots)) {
      #### Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
####function my code edit of plotPCA
####################################
#### PCA scatter plot of the samples in x for any pair of principal components.
#### - x:          object providing assay() and colData()
####               (presumably a DESeq2 transformed object -- confirm with caller)
#### - intgroup:   colData column name(s) used to colour the points
#### - ntop:       number of highest-variance rows used for the PCA
#### - returnData: if TRUE, return the plotting data frame instead of a plot;
####               attribute "percentVar" holds the variance fractions of the
####               two requested components
#### - PCx, PCy:   indices of the components drawn on the x and y axes
plotPCALeo<-function (x, intgroup = "Treatment", ntop = 500, returnData = FALSE, PCx=1, PCy=2)
{
  #### Fail early, before the PCA is computed
  if (!all(intgroup %in% names(colData(x)))) {
    stop("the argument 'intgroup' should specify columns of colData(dds)")
  }
  exprs <- assay(x)
  ####rv <- rowVars(assay(x))
  rv <- apply(exprs, 1, var)
  select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
  pca <- prcomp(t(exprs[select, ]))
  percentVar <- pca$sdev^2/sum(pca$sdev^2)
  intgroup.df <- as.data.frame(colData(x)[, intgroup, drop = FALSE])
  group <- factor(apply(intgroup.df, 1, paste, collapse = " : "))
  d <- data.frame(PCX = pca$x[, PCx], PCY = pca$x[, PCy], group = group,
                  intgroup.df, names = colnames(x))
  if (returnData) {
    #### Exactly the two plotted components (PCx:PCy would give a whole range)
    attr(d, "percentVar") <- percentVar[c(PCx, PCy)]
    return(d)
  }
  #### Axis labels report the variance of the components actually plotted
  ggplot(data = d, aes(x = PCX, y = PCY, color = group)) +
    geom_point(size = 3) +
    xlab(paste0("PC", PCx, ": ", round(percentVar[PCx] * 100), "% variance")) +
    ylab(paste0("PC", PCy, ": ", round(percentVar[PCy] * 100), "% variance"))
}
#### Heatmap colour scales: each maps the listed break values to the listed colours
col_fun <- colorRamp2(breaks = c(-1, -0.2, 0, 0.2, 1),
                      colors = c("blue", "cyan", "grey90", "orange", "red"))
col_funGR <- colorRamp2(breaks = c(-1.5, 0, 1.5),
                        colors = c("green", "black", "red"))
col_funGR2 <- colorRamp2(breaks = c(-2, 0, 2),
                         colors = c("green", "black", "red"))
#### Venn diagram fill colours, one palette per number of sets
colorsV2 <- c("mediumorchid1", "chartreuse3")
colorsV3 <- c("cornflowerblue", "brown1", "orange2")
colorsV4 <- c("cornflowerblue", "orange2", "green3", "red")
colorsV5 <- c("cornflowerblue", "orange2", "green3", "purple", "red")
#col_fun(seq(-3, 3))

R3 VAR14 vs RBC no TNF k-means q0.05

1. Genelist Selection

groupsName <- "R3_VAR14_kmeans_q0.05"
countsTable <- read.delim("RNAseq2019July_5.txt", header = TRUE, sep = "\t",
                          check.names = FALSE, row.names = 1)
head(countsTable)
AllGeneNames <- countsTable$Gene_Symbol
#head(AllGeneNames)
tempA <- countsTable

#### Time points of the R3noTNF var14-vs-RBC contrasts and their Venn labels
var14Times <- c("0h", "2h", "6h", "20h")
var14Labels <- paste0("Var14noTNF_", var14Times)

#### Row indexes passing `stat` < cutoff (NA excluded) at each time point,
#### optionally also requiring |log2FoldChange| > minAbsLfc.
#### Returns an unnamed list with one index vector per entry of var14Times.
var14Hits <- function(tbl, stat, cutoff, minAbsLfc = NULL) {
  lapply(var14Times, function(tp) {
    suffix <- paste0("_R3noTNF_var14_vs_RBC_", tp)
    statVals <- tbl[[paste0(stat, suffix)]]
    keep <- statVals < cutoff & !is.na(statVals)
    if (!is.null(minAbsLfc)) {
      keep <- keep & abs(tbl[[paste0("log2FoldChange", suffix)]]) > minAbsLfc
    }
    which(keep)
  })
}

#### Gene symbols for each index vector in idxList
var14Symbols <- function(tbl, idxList) {
  lapply(idxList, function(idx) tbl$Gene_Symbol[idx])
}

#### Sorted row indexes that pass at one or more time points
var14AnyTime <- function(idxList) {
  sort(unique(unlist(idxList)))
}

#### Four-set Venn (one set per time point), returned as grobs (filename = NULL)
var14Venn <- function(sets, title) {
  venn.diagram(x = sets,
               category.names = var14Labels,
               main = title,
               filename = NULL, scaled = FALSE, fill = colorsV4, cat.col = colorsV4,
               cat.cex = 1, cat.dist = 0.3, margin = 0.3)
}

#### Per-time-point selections under the three criteria
padjIdx   <- var14Hits(tempA, "padj", 0.05)
pvalFcIdx <- var14Hits(tempA, "pvalue", 0.05, minAbsLfc = 1)
padjFcIdx <- var14Hits(tempA, "padj", 0.1, minAbsLfc = 1)

vennq  <- var14Venn(var14Symbols(tempA, padjIdx), "padj<0.05")
vennp  <- var14Venn(var14Symbols(tempA, pvalFcIdx), "pvalue<0.05&fold change>2")
vennq2 <- var14Venn(var14Symbols(tempA, padjFcIdx), "padj0.1&fold change>2")

#### Genes passing each criterion at any time point, compared across criteria
listD <- var14Symbols(tempA, padjFcIdx)[[4]]
listA <- tempA$Gene_Symbol[var14AnyTime(padjIdx)]
listC <- tempA$Gene_Symbol[var14AnyTime(pvalFcIdx)]
topDEgenes <- var14AnyTime(padjFcIdx)####find indexes
listB <- tempA$Gene_Symbol[topDEgenes]
vennpq <- venn.diagram(x = list(listA, listB, listC),
            category.names = c("padj<0.05", "padj<0.1&fc>2", "p<0.05&fc>2"),
            main = "padj compared to pvalue",
            filename = NULL, scaled = FALSE, fill = colorsV3, cat.col = colorsV3,
            cat.cex = 1, cat.dist = 0.1, margin = 0.15)
#### Title corrected from "R2" to "R3": this section and its columns are R3
grid.arrange(gTree(children = vennq), gTree(children = vennpq), ncol = 2,
             top = "R3 Var14 no TNF")

#tempA<-resAll[-c(10:30) ]
#### Work on a copy of the table with default (numeric) row names
tempA <- countsTable
rownames(tempA) <- NULL
#### TRUE where the adjusted p-value is present and below 0.05 (never NA)
padjBelow05 <- function(padj) {
  padj < 0.05 & !is.na(padj)
}
#### A gene is "in" when it passes at any of the four time points
tempA <- mutate(
  tempA,
  Include = ifelse(
    padjBelow05(padj_R3noTNF_var14_vs_RBC_0h) |
      padjBelow05(padj_R3noTNF_var14_vs_RBC_2h) |
      padjBelow05(padj_R3noTNF_var14_vs_RBC_6h) |
      padjBelow05(padj_R3noTNF_var14_vs_RBC_20h),
    "in", "out"))
#### Number of genes in each Include category
tally(group_by(tempA, Include))
topDEgenes <- which(tempA$Include == "in")####find indexes
head(countsTable)

NB Please check columns used and renamed for plots

#baseMeansHm <-countsTable[,c(60:63)]
#### Group-mean columns selected by position (see the NB: check these indexes)
baseMeansHm <- countsTable[, c(60:63, 79:82)]
head(baseMeansHm)
baseMeansHm2 <- log2(baseMeansHm + 1)
#### Var14 minus RBC difference of log2 means, one new column per time point
for (timeTag in c("0h", "2h", "6h", "20h")) {
  baseMeansHm2[[paste0("Var14_RBC_", timeTag)]] <-
    baseMeansHm2[[paste0("Var14noTNF_", timeTag, "_mean")]] -
    baseMeansHm2[[paste0("RBCnoTNF_", timeTag, "_mean")]]
}
#### Columns 9-12 are the four differences, columns 1-8 the log2 means
baseMeansHm <- baseMeansHm2[, 9:12]
head(baseMeansHm)
baseMeansHmM <- baseMeansHm2[, 1:8]
head(baseMeansHmM)
topDEgenes <- which(tempA$Include == "in")####find indexes

2. Hierarchical clustering of means (individual samples added for inspection)

####mean logfc
#### Selected genes, z-scored per gene (row) across the four time points
dataHMm <- t(scale(t(as.matrix(baseMeansHm[topDEgenes, ]))))
#dataHMm <- log2(dataHMm+1)
#### Labels and title font shared by the three heatmaps
geneRowLabels <- paste0(rownames(dataHMm), " ", tempA$Gene_Symbol[topDEgenes])
hmTitleFont <- gpar(fontsize = 16, fontface = "bold")
hmap_hier_factors4 <- Heatmap(
  dataHMm,
  name = "logfc",
  col = col_funGR,
  row_labels = geneRowLabels,
  show_row_names = FALSE,
  cluster_columns = FALSE,
  width = unit(50, "mm"),
  column_title = paste0("Means logfc"),
  column_title_gp = hmTitleFont)
#### Same values re-expressed relative to the 0h column (0h itself becomes 0)
dataHMmPlot <- as.data.frame(dataHMm)
baseline0h <- dataHMmPlot$Var14_RBC_0h
for (hmCol in c("Var14_RBC_2h", "Var14_RBC_6h", "Var14_RBC_20h", "Var14_RBC_0h")) {
  dataHMmPlot[[hmCol]] <- dataHMmPlot[[hmCol]] - baseline0h
}
dataHMmPlot <- as.matrix(dataHMmPlot)

hmap_hier_factors6 <- Heatmap(
  dataHMmPlot,
  name = "Normalised logfc",
  col = col_funGR2,
  row_labels = geneRowLabels,
  show_row_names = FALSE,
  cluster_columns = FALSE,
  width = unit(50, "mm"),
  column_title = paste0("0h Normalised logfc"),
  column_title_gp = hmTitleFont)
####means
#### Row-scaled log2 group means; columns reordered to alternate 5,1,6,2,7,3,8,4
dataHMm3 <- t(scale(t(as.matrix(baseMeansHmM[topDEgenes, ]))))
dataHMm3 <- dataHMm3[, c(5, 1, 6, 2, 7, 3, 8, 4)]
hmap_hier_factors5 <- Heatmap(
  dataHMm3,
  name = "Expression",
  col = col_fun,
  row_labels = geneRowLabels,
  show_row_names = FALSE,
  cluster_columns = FALSE,
  width = unit(100, "mm"),
  column_title = paste0("Means"),
  column_title_gp = hmTitleFont)

#### Draw the three heatmaps side by side
hmap_hier_factors4 + hmap_hier_factors6 + hmap_hier_factors5

#### NOTE: par(mfrow) only applies to base graphics (the silhouette plot below);
#### the fviz_nbclust() results are combined with ggplot2 '+' and are not affected
par(mfrow=c(1,2))
#### Silhouette method
fviz_nbclust(dataHMm, kmeans, method = "silhouette",k.max = 16)+
  labs(subtitle = "Silhouette method")

#### Elbow method
fviz_nbclust(dataHMm, kmeans, method = "wss",k.max = 16) +
  labs(subtitle = "Elbow method")

####gap stat slow!!!
####set.seed(123)
####fviz_nbclust(dataHMm, kmeans, nstart = 25,  method = "gap_stat", nboot = 100,k.max = 16)+
####  labs(subtitle = "Gap statistic method")
#### kmeans starts from random centres: fix the seed so that cluster numbering
#### (used for all downstream files and plots) is the same on every run
set.seed(123)
kclust3 <- kmeans(dataHMm, 7)
#silhouette plot
distK <- daisy(dataHMm)
plot(silhouette(kclust3$cluster, distK), col=1:7, border=NA)

3. K-means clustering of means

#### One "Cluster\n<k>" label per gene; used to slice the heatmap rows
split <- paste0("Cluster\n", kclust3$cluster)
#split <- factor(paste0("Cluster\n", kclust3$cluster), levels=c("Cluster\n3","Cluster\n1","Cluster\n4","Cluster\n5","Cluster\n2","Cluster\n6"))
hmap_k <- Heatmap(
  dataHMm,
  name = "logfc",
  col = col_funGR,
  split = split,
  cluster_row_slices = FALSE,
  cluster_columns = FALSE,
  show_row_names = FALSE,
  width = unit(50, "mm"),
  column_title = "means logfc",
  column_title_gp = gpar(fontsize = 16, fontface = "bold"))
#### Cluster-split heatmap next to the two heatmaps built earlier
hmap_k + hmap_hier_factors6 + hmap_hier_factors5

Mean profiles of clusters

#### Cluster sizes and centres from the k-means fit
clustercount <- data.frame(kclust3$cluster)
clustersizes <- table(clustercount$kclust3.cluster)
clusterMeans <- data.frame(kclust3$centers)
#### Transpose so that each cluster is a column (X1, X2, ...) and each row a sample
clusterMeans1 <- data.frame(t(clusterMeans))
clusterMeans1 <- cbind(rownames(clusterMeans1), clusterMeans1)
orderN <- c("Var14_RBC_0h","Var14_RBC_2h","Var14_RBC_6h","Var14_RBC_20h")#### manual
rownames(clusterMeans1) <- NULL
names(clusterMeans1)[names(clusterMeans1)=="rownames(clusterMeans1)"] <- "Sample"
####clusterMeans1
#### Layers shared by every profile plot
profileLayers <- list(
  geom_line(),
  geom_point(),
  scale_x_discrete(limits = orderN),
  theme(axis.title.x = element_blank(), axis.title.y = element_blank()))
#### Title for cluster k, always reporting the size of that same cluster
#### (the X7 panel previously showed the size of cluster 5)
profileTitle <- function(k) {
  ggtitle(paste(paste0("Cluster X", k, " Profile "), clustersizes[k], " genes"))
}
pX1 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X1, group = 1)) + profileLayers + profileTitle(1)
pX2 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X2, group = 1)) + profileLayers + profileTitle(2)
pX3 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X3, group = 1)) + profileLayers + profileTitle(3)
pX4 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X4, group = 1)) + profileLayers + profileTitle(4)
pX5 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X5, group = 1)) + profileLayers + profileTitle(5)
pX6 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X6, group = 1)) + profileLayers + profileTitle(6)
pX7 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X7, group = 1)) + profileLayers + profileTitle(7)
#### For an extra cluster add e.g. pX8 in the same way (y = X8, profileTitle(8))
#plot
multiplot(pX1, pX2, pX3, pX4, pX5, pX6, pX7, cols = 2)

#### Gene symbols of the clustered ("in") genes, in the same order as dataHMm
topDEgenes <- which(tempA$Include=="in")####find indexes
tempAkm <- tempA[ topDEgenes, ]
SymbolsKm <- dplyr::pull(tempAkm, Gene_Symbol)
#### export the gene expression data for the clusters
write.table(clusterMeans,paste0("ClusterMeansKm_",groupsName,".txt"),  sep = "\t")
ClusteredGenes <- data.frame(kclust3$cluster,SymbolsKm,dataHMm)
write.table(ClusteredGenes,paste0("ScaledDataInClustersKm_",groupsName,".txt"),  sep = "\t")
#head(ClusteredGenes)
#### Symbols of the genes that were not selected
bottomDEgenes <- which(tempA$Include=="out")####find indexes
bottomG <- tempA[ bottomDEgenes, ]
bottomG <- dplyr::pull(bottomG, Gene_Symbol)
write.table(bottomG,paste0("ipaBottomKmeans_",groupsName,".txt"),  sep = "\t")

#### Cluster membership table: cluster id, symbol, row name and one
#### "1"/"0" indicator column per cluster
ipaKmeans <- ClusteredGenes
#countsTable <-countsTable[,c(1:15)]####if samples need removing
ipaKmeans <- ipaKmeans[,c(1:2)]
ipaKmeans$name2 <- rownames(ipaKmeans)
ipaKmeans <- mutate(ipaKmeans, x1= ifelse(ipaKmeans$kclust3.cluster==1, "1", "0"))
ipaKmeans <- mutate(ipaKmeans, x2= ifelse(ipaKmeans$kclust3.cluster==2, "1", "0"))
ipaKmeans <- mutate(ipaKmeans, x3= ifelse(ipaKmeans$kclust3.cluster==3, "1", "0"))
ipaKmeans <- mutate(ipaKmeans, x4= ifelse(ipaKmeans$kclust3.cluster==4, "1", "0"))
ipaKmeans <- mutate(ipaKmeans, x5= ifelse(ipaKmeans$kclust3.cluster==5, "1", "0"))
ipaKmeans <- mutate(ipaKmeans, x6= ifelse(ipaKmeans$kclust3.cluster==6, "1", "0"))
ipaKmeans <- mutate(ipaKmeans, x7= ifelse(ipaKmeans$kclust3.cluster==7, "1", "0"))
write.table(ipaKmeans,paste0("ipaKmeans_",groupsName,".txt"),  sep = "\t")
#head(ipaKmeans)
ClusteredGenes2 <- ClusteredGenes[c(1)]
#### Gene symbols per cluster. Select on the cluster column only: comparing
#### the whole data frame with i gives a logical matrix longer than the
#### symbol vector, which only works by accident.
listAll <- lapply(1:7, function(i) {
  ClusteredGenes$SymbolsKm[ClusteredGenes$kclust3.cluster == i]
})
#### one name per cluster (7 clusters here)
names(listAll) <- c("X1", "X2", "X3", "X4","X5", "X6", "X7")
#if you want to rearrange the order
#listAll<-listAll[c("x3", "x7", "x8", "x2", "x6", "x5", "x4", "x1")]
#lapply(listAll, head)

4. Annotation of K-means clusters

  • CC cellular component
  • BP biological process
  • MF molecular function

The simplify function has been used to cut down on GO redundancy

#str(AllGeneNames)
####CC
#### GO enrichment (ontology "CC") of every cluster in listAll, with all
#### gene symbols of the table as the universe
cgoCC <- compareCluster(
  geneCluster = listAll,
  fun = "enrichGO",
  universe = AllGeneNames,
  OrgDb = org.Hs.eg.db,
  ####OrgDb=org.Mm.eg.db,
  keyType = "SYMBOL",
  ont = "CC",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.10)
#### Reduce redundant GO terms
cgoCC2 <- simplify(cgoCC, cutoff = 0.7, by = "p.adjust", select_fun = min)
####write as spreadsheet
write.csv(as.data.frame(cgoCC2), paste0("GO_CC_", groupsName, ".csv"))
#### Dot plot with vertical x-axis labels
dotplot(cgoCC2,
        showCategory = 30,
        title = paste0("GO Cellular Compartment ", groupsName)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Plots and GO data were written to files

#### Write the GO CC dot plot to a PNG file.
#### print() is explicit so the plot is also drawn when this code is run via
#### source() or inside a function, where ggplot objects are not auto-printed
#### and the PNG would otherwise be empty.
png(paste0("GO_CC_",groupsName,".png"), width = 1224, height = 824)
print(dotplot(cgoCC2,showCategory = 30,
        title = paste0("GO Cellular Compartment ",groupsName))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)))
dev.off()
null device 
          1 

GO BP

####BP
#### GO enrichment (ontology "BP") of every cluster in listAll, with all
#### gene symbols of the table as the universe
cgoBP <- compareCluster(
  geneCluster = listAll,
  fun = "enrichGO",
  universe = AllGeneNames,
  OrgDb = org.Hs.eg.db,
  keyType = "SYMBOL",
  ont = "BP",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.10)
#### Reduce redundant GO terms
cgoBP2 <- simplify(cgoBP, cutoff = 0.7, by = "p.adjust", select_fun = min)
####write as spreadsheet
write.csv(as.data.frame(cgoBP2), paste0("GO_BP_", groupsName, ".csv"))
#### Dot plot with vertical x-axis labels
dotplot(cgoBP2,
        showCategory = 30,
        title = paste0("GO Biological Process ", groupsName)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

#### Write the GO BP dot plot to a PNG file.
#### print() is explicit so the plot is also drawn when this code is run via
#### source() or inside a function, where ggplot objects are not auto-printed
#### and the PNG would otherwise be empty.
png(paste0("GO_BP_",groupsName,".png"), width = 1024, height = 1224)
print(dotplot(cgoBP2,showCategory = 30,
        title = paste0("GO Biological Process ",groupsName))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)))
dev.off()
null device 
          1 

GO MF

####MF
#### GO enrichment (ontology "MF") of every cluster in listAll, with all
#### gene symbols of the table as the universe
cgoMF <- compareCluster(
  geneCluster = listAll,
  fun = "enrichGO",
  universe = AllGeneNames,
  OrgDb = org.Hs.eg.db,
  keyType = "SYMBOL",
  ont = "MF",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.10)
#### Reduce redundant GO terms
cgoMF2 <- simplify(cgoMF, cutoff = 0.7, by = "p.adjust", select_fun = min)
####write as spreadsheet
write.csv(as.data.frame(cgoMF2), paste0("GO_MF_", groupsName, ".csv"))
#### Dot plot with vertical x-axis labels
dotplot(cgoMF2,
        showCategory = 30,
        title = paste0("GO Molecular Function  ", groupsName)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

#### Write the GO MF dot plot to a PNG file.
#### print() is explicit so the plot is also drawn when this code is run via
#### source() or inside a function, where ggplot objects are not auto-printed
#### and the PNG would otherwise be empty.
png(paste0("GO_MF_",groupsName,".png"), width = 1424, height = 824)
print(dotplot(cgoMF2,showCategory = 30,
        title = paste0("GO Molecular Function  ",groupsName))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)))
dev.off()
null device 
          1 

R3 VAR14 vs RBC no TNF k-means p0.05fc2

1. Genelist Selection

#### Tag used in every output file name of this section
groupsName <- "R3_VAR14_kmeans_p0.05fc2"
#### Tab-separated results table; first column becomes the row names and
#### column names are kept exactly as written in the file
countsTable <- read.delim(
  "RNAseq2019July_5.txt",
  header = TRUE,
  sep = "\t",
  check.names = FALSE,
  row.names = 1)
head(countsTable)
#### All gene symbols in the table
AllGeneNames <- countsTable$Gene_Symbol
#head(AllGeneNames)
#### Show the two previously built Venn diagrams side by side
grid.arrange(
  gTree(children = vennp),
  gTree(children = vennpq),
  ncol = 2,
  top = "R3 Var14 no TNF")

#tempA<-resAll[-c(10:30) ]
#### Work on a copy of the table with default (numeric) row names
tempA <- countsTable
rownames(tempA) <- NULL
#### p-value present and below 0.05 together with |log2 fold change| above 1
pvalFcHit <- function(pval, lfc) {
  pval < 0.05 & abs(lfc) > 1 & !is.na(pval)
}
#### Time points are tested in order 0h, 2h, 6h, 20h; the first hit gives "in".
#### The ifelse() nesting is kept as-is because an NA test result yields NA
#### at that level, which a plain OR of the four tests would not reproduce.
tempA <- mutate(
  tempA,
  Include = ifelse(
    pvalFcHit(pvalue_R3noTNF_var14_vs_RBC_0h, log2FoldChange_R3noTNF_var14_vs_RBC_0h), "in",
    ifelse(
      pvalFcHit(pvalue_R3noTNF_var14_vs_RBC_2h, log2FoldChange_R3noTNF_var14_vs_RBC_2h), "in",
      ifelse(
        pvalFcHit(pvalue_R3noTNF_var14_vs_RBC_6h, log2FoldChange_R3noTNF_var14_vs_RBC_6h), "in",
        ifelse(
          pvalFcHit(pvalue_R3noTNF_var14_vs_RBC_20h, log2FoldChange_R3noTNF_var14_vs_RBC_20h), "in",
          "out")))))
#tempA
#### Number of genes in each Include category
tally(group_by(tempA, Include))
topDEgenes <- which(tempA$Include == "in")####find indexes

NB Please check columns used and renamed for plots

#### Group-mean columns selected by position (see the NB: check these indexes)
baseMeansHm <-countsTable[,c(60:63,79:82)]
head(baseMeansHm)
baseMeansHm2 <- log2(baseMeansHm+1)
#### Var14 minus RBC difference of log2 means, one new column per time point
baseMeansHm2$Var14_RBC_0h<-baseMeansHm2$Var14noTNF_0h_mean-baseMeansHm2$RBCnoTNF_0h_mean
baseMeansHm2$Var14_RBC_2h<-baseMeansHm2$Var14noTNF_2h_mean-baseMeansHm2$RBCnoTNF_2h_mean
baseMeansHm2$Var14_RBC_6h<-baseMeansHm2$Var14noTNF_6h_mean-baseMeansHm2$RBCnoTNF_6h_mean
baseMeansHm2$Var14_RBC_20h<-baseMeansHm2$Var14noTNF_20h_mean-baseMeansHm2$RBCnoTNF_20h_mean
#### Columns 9-12 are the four differences
baseMeansHm <-baseMeansHm2[,c(9:12)]
head(baseMeansHm)
#### Columns 1-8 are the log2 group means. The "Means" heatmap of this section
#### reads baseMeansHmM, so define it here instead of relying on the object
#### left behind by the previous section.
baseMeansHmM <-baseMeansHm2[,c(1:8)]
topDEgenes <- which(tempA$Include=="in")####find indexes

2. Hierarchical clustering of means (individual samples added for inspection)

####mean logfc
#### Selected genes, z-scored per gene (row) across the four time points
dataHMm <- t(scale(t(as.matrix(baseMeansHm[topDEgenes, ]))))
#dataHMm <- log2(dataHMm+1)
#### Labels and title font shared by the three heatmaps
geneRowLabels <- paste0(rownames(dataHMm), " ", tempA$Gene_Symbol[topDEgenes])
hmTitleFont <- gpar(fontsize = 16, fontface = "bold")
hmap_hier_factors4 <- Heatmap(
  dataHMm,
  name = "logfc",
  col = col_funGR,
  row_labels = geneRowLabels,
  show_row_names = FALSE,
  cluster_columns = FALSE,
  width = unit(50, "mm"),
  column_title = paste0("Means logfc"),
  column_title_gp = hmTitleFont)
#### Same values re-expressed relative to the 0h column (0h itself becomes 0)
dataHMmPlot <- as.data.frame(dataHMm)
baseline0h <- dataHMmPlot$Var14_RBC_0h
for (hmCol in c("Var14_RBC_2h", "Var14_RBC_6h", "Var14_RBC_20h", "Var14_RBC_0h")) {
  dataHMmPlot[[hmCol]] <- dataHMmPlot[[hmCol]] - baseline0h
}
dataHMmPlot <- as.matrix(dataHMmPlot)

hmap_hier_factors6 <- Heatmap(
  dataHMmPlot,
  name = "Normalised logfc",
  col = col_funGR2,
  row_labels = geneRowLabels,
  show_row_names = FALSE,
  cluster_columns = FALSE,
  width = unit(50, "mm"),
  column_title = paste0("0h Normalised logfc"),
  column_title_gp = hmTitleFont)
####means
#### Row-scaled log2 group means; columns reordered to alternate 5,1,6,2,7,3,8,4
dataHMm3 <- t(scale(t(as.matrix(baseMeansHmM[topDEgenes, ]))))
dataHMm3 <- dataHMm3[, c(5, 1, 6, 2, 7, 3, 8, 4)]
hmap_hier_factors5 <- Heatmap(
  dataHMm3,
  name = "Expression",
  col = col_fun,
  row_labels = geneRowLabels,
  show_row_names = FALSE,
  cluster_columns = FALSE,
  width = unit(100, "mm"),
  column_title = paste0("Means"),
  column_title_gp = hmTitleFont)

#### Draw the three heatmaps side by side
hmap_hier_factors4 + hmap_hier_factors6 + hmap_hier_factors5

#### NOTE: par(mfrow) only applies to base graphics (the silhouette plot below);
#### the fviz_nbclust() results are combined with ggplot2 '+' and are not affected
par(mfrow=c(1,2))
#### Silhouette method
fviz_nbclust(dataHMm, kmeans, method = "silhouette",k.max = 16)+
  labs(subtitle = "Silhouette method")

#### Elbow method
fviz_nbclust(dataHMm, kmeans, method = "wss",k.max = 16) +
  labs(subtitle = "Elbow method")

####gap stat slow!!!
####set.seed(123)
####fviz_nbclust(dataHMm, kmeans, nstart = 25,  method = "gap_stat", nboot = 100,k.max = 16)+
####  labs(subtitle = "Gap statistic method")
#### kclust4 is used by everything below, so the fit must be computed here (the
#### call had been commented out, leaving kclust4 undefined in a fresh session).
#### The seed keeps cluster numbering stable between runs.
#### NOTE(review): a seeded refit will not necessarily reproduce the partition
#### of any earlier unseeded run.
set.seed(123)
kclust4 <- kmeans(dataHMm, 5)
#silhouette plot
distK <- daisy(dataHMm)
plot(silhouette(kclust4$cluster, distK), col=1:5, border=NA)

3. K-means clustering of means

#### One "Cluster\n<k>" label per gene; used to slice the heatmap rows
split <- paste0("Cluster\n", kclust4$cluster)
#split <- factor(paste0("Cluster\n", kclust3$cluster), levels=c("Cluster\n3","Cluster\n1","Cluster\n4","Cluster\n5","Cluster\n2","Cluster\n6"))
hmap_k <- Heatmap(
  dataHMm,
  name = "logfc",
  col = col_funGR,
  split = split,
  cluster_row_slices = FALSE,
  cluster_columns = FALSE,
  show_row_names = FALSE,
  width = unit(50, "mm"),
  column_title = "means logfc",
  column_title_gp = gpar(fontsize = 16, fontface = "bold"))
#### Cluster-split heatmap next to the two heatmaps built earlier
hmap_k + hmap_hier_factors6 + hmap_hier_factors5

Mean profiles of clusters

#### Cluster sizes and centres from the k-means fit
clustercount <- data.frame(kclust4$cluster)
clustersizes <- table(clustercount$kclust4.cluster)
clusterMeans <- data.frame(kclust4$centers)
#### Transpose so that each cluster is a column (X1, X2, ...) and each row a sample
clusterMeans1 <- data.frame(t(clusterMeans))
clusterMeans1 <- cbind(rownames(clusterMeans1), clusterMeans1)
orderN <- c("Var14_RBC_0h", "Var14_RBC_2h", "Var14_RBC_6h", "Var14_RBC_20h")#### manual
rownames(clusterMeans1) <- NULL
names(clusterMeans1)[names(clusterMeans1) == "rownames(clusterMeans1)"] <- "Sample"
####clusterMeans1
#### Layers shared by every profile plot
profileLayers <- list(
  geom_line(),
  geom_point(),
  scale_x_discrete(limits = orderN),
  theme(axis.title.x = element_blank(), axis.title.y = element_blank()))
#### Title for cluster k including the number of genes in that cluster
profileTitle <- function(k) {
  ggtitle(paste(paste0("Cluster X", k, " Profile "), clustersizes[k], " genes"))
}
pX1 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X1, group = 1)) + profileLayers + profileTitle(1)
pX2 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X2, group = 1)) + profileLayers + profileTitle(2)
pX3 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X3, group = 1)) + profileLayers + profileTitle(3)
pX4 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X4, group = 1)) + profileLayers + profileTitle(4)
pX5 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X5, group = 1)) + profileLayers + profileTitle(5)
#### For an extra cluster add e.g. pX6 in the same way (y = X6, profileTitle(6))
#plot
multiplot(pX1, pX2, pX3, pX4, pX5, cols = 2)

#### Gene symbols of the clustered ("in") genes, in the same order as dataHMm
topDEgenes <- which(tempA$Include=="in")####find indexes
tempAkm <- tempA[ topDEgenes, ]
SymbolsKm <- dplyr::pull(tempAkm, Gene_Symbol)
#### export the gene expression data for the clusters
write.table(clusterMeans,paste0("ClusterMeansKm_",groupsName,".txt"),  sep = "\t")
ClusteredGenes <- data.frame(kclust4$cluster,SymbolsKm,dataHMm)
write.table(ClusteredGenes,paste0("ScaledDataInClustersKm_",groupsName,".txt"),  sep = "\t")
#head(ClusteredGenes)
#### Symbols of the genes that were not selected
bottomDEgenes <- which(tempA$Include=="out")####find indexes
bottomG <- tempA[ bottomDEgenes, ]
bottomG <- dplyr::pull(bottomG, Gene_Symbol)
write.table(bottomG,paste0("ipaBottomKmeans_",groupsName,".txt"),  sep = "\t")

#### Cluster membership table: cluster id, symbol, row name and one
#### "1"/"0" indicator column per cluster
ipaKmeans <- ClusteredGenes
#countsTable <-countsTable[,c(1:15)]####if samples need removing
ipaKmeans <- ipaKmeans[,c(1:2)]
ipaKmeans$name2 <- rownames(ipaKmeans)
ipaKmeans <- mutate(ipaKmeans, x1= ifelse(ipaKmeans$kclust4.cluster==1, "1", "0"))
ipaKmeans <- mutate(ipaKmeans, x2= ifelse(ipaKmeans$kclust4.cluster==2, "1", "0"))
ipaKmeans <- mutate(ipaKmeans, x3= ifelse(ipaKmeans$kclust4.cluster==3, "1", "0"))
ipaKmeans <- mutate(ipaKmeans, x4= ifelse(ipaKmeans$kclust4.cluster==4, "1", "0"))
ipaKmeans <- mutate(ipaKmeans, x5= ifelse(ipaKmeans$kclust4.cluster==5, "1", "0"))
#ipaKmeans = mutate(ipaKmeans, x6= ifelse(ipaKmeans$kclust4.cluster==6, "1", "0"))
write.table(ipaKmeans,paste0("ipaKmeans_",groupsName,".txt"),  sep = "\t")
#head(ipaKmeans)
ClusteredGenes2 <- ClusteredGenes[c(1)]
#### Gene symbols per cluster. Select on the cluster column only: comparing
#### the whole data frame with i gives a logical matrix longer than the
#### symbol vector, which only works by accident.
listAll <- lapply(1:5, function(i) {
  ClusteredGenes$SymbolsKm[ClusteredGenes$kclust4.cluster == i]
})
#### one name per cluster (5 clusters here)
names(listAll) <- c("X1", "X2", "X3","X4", "X5")
#if you want to rearrange the order
#listAll<-listAll[c("x3", "x7", "x8", "x2", "x6", "x5", "x4", "x1")]
lapply(listAll, head)
$X1
[1] "CDH11"  "KCTD12" "DNAJB4" "NFKBIZ" "SAMD9"  "PMAIP1"

$X2
[1] "ABCB1"     "CLEC10A"   "PCDH17"    "RPS7P11"   "FTH1P23"   "EEF1A1P12"

$X3
[1] "PSMD6-AS2" "RGS7BP"    "SNHG26"    "NCR3LG1"   "ANGPTL4"   "KRT7"     

$X4
[1] "CYP1A1" "TXNIP"  "TTC39A" "CCDC68" "KLF4"   "FOSB"  

$X5
[1] "F2RL3"      "FCN3"       "B9D2"       "RAB11FIP1"  "AC139530.1" "PRR29"     

4. Annotation of K-means clusters

  • CC cellular component
  • BP biological process
  • MF molecular function

The simplify function has been used to cut down on GO redundancy

#str(AllGeneNames)
#### GO cellular compartment (CC) enrichment: enrichGO is run on every
#### cluster in listAll, with AllGeneNames as the background and gene
#### symbols as identifiers
cgoCC <- compareCluster(
  geneCluster = listAll,
  fun = "enrichGO",
  universe = AllGeneNames,
  OrgDb = org.Hs.eg.db, #### human; org.Mm.eg.db is the mouse database
  keyType = "SYMBOL",
  ont = "CC",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.10
)
#### Cut down redundant GO terms (cutoff 0.7, selection by minimum p.adjust)
cgoCC2 <- simplify(cgoCC, cutoff = 0.7, by = "p.adjust", select_fun = min)
#### Save the simplified result table as a spreadsheet
write.csv(as.data.frame(cgoCC2), paste0("GO_CC_", groupsName, ".csv"))

#### Dot plot of up to 30 categories, x-axis labels rotated to vertical
dotplot(
  cgoCC2,
  showCategory = 30,
  title = paste0("GO Cellular Compartment ", groupsName)
) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Plots and GO data were written to files

#### Write the GO CC dot plot to a PNG file.
#### The plot is printed explicitly: a ggplot object is only drawn when it is
#### printed, and auto-printing does not happen when the script is run with
#### source() or from inside a function/loop, which would leave an empty file.
png(paste0("GO_CC_", groupsName, ".png"), width = 1224, height = 824)
print(
  dotplot(cgoCC2, showCategory = 30,
          title = paste0("GO Cellular Compartment ", groupsName)) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
)
dev.off()

GO BP

####BP
#### GO biological process enrichment: enrichGO is run on every cluster in
#### listAll, with AllGeneNames as the background and gene symbols as
#### identifiers
cgoBP <- compareCluster(geneCluster = listAll,
                      universe = AllGeneNames,
                      fun = "enrichGO",
                      OrgDb = org.Hs.eg.db,
                      keyType = "SYMBOL",
                      ont = "BP",
                      pvalueCutoff = 0.05,
                      qvalueCutoff = 0.10)
#### Cut down redundant GO terms (cutoff 0.7, selection by minimum p.adjust)
cgoBP2 <- simplify(cgoBP, cutoff = 0.7, by = "p.adjust", select_fun = min)
####write as spreadsheet
write.csv(as.data.frame(cgoBP2), paste0("GO_BP_", groupsName, ".csv"))

#### Build the dot plot once so the on-screen and the PNG version are identical
goPlotBP <- dotplot(cgoBP2, showCategory = 30,
                    title = paste0("GO Biological Process ", groupsName)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
print(goPlotBP)

#### Explicit print(): a ggplot is only drawn when printed, and auto-printing
#### does not happen under source() or inside a function/loop, which would
#### leave an empty PNG
png(paste0("GO_BP_", groupsName, ".png"), width = 1024, height = 1224)
print(goPlotBP)
dev.off()
null device 
          1 

GO MF

####MF
#### GO molecular function enrichment: enrichGO is run on every cluster in
#### listAll, with AllGeneNames as the background and gene symbols as
#### identifiers
cgoMF <- compareCluster(geneCluster = listAll,
                      universe = AllGeneNames,
                      fun = "enrichGO",
                      OrgDb = org.Hs.eg.db,
                      keyType = "SYMBOL",
                      ont = "MF",
                      pvalueCutoff = 0.05,
                      qvalueCutoff = 0.10)
#### Cut down redundant GO terms (cutoff 0.7, selection by minimum p.adjust)
cgoMF2 <- simplify(cgoMF, cutoff = 0.7, by = "p.adjust", select_fun = min)
####write as spreadsheet
write.csv(as.data.frame(cgoMF2), paste0("GO_MF_", groupsName, ".csv"))

#### Build the dot plot once so the on-screen and the PNG version are identical
goPlotMF <- dotplot(cgoMF2, showCategory = 30,
                    title = paste0("GO Molecular Function  ", groupsName)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
print(goPlotMF)

#### Explicit print(): a ggplot is only drawn when printed, and auto-printing
#### does not happen under source() or inside a function/loop, which would
#### leave an empty PNG
png(paste0("GO_MF_", groupsName, ".png"), width = 1424, height = 824)
print(goPlotMF)
dev.off()

R3 VAR14 vs RBC no TNF k-means padj0.1fc2

1. Genelist Selection

#### Tag used in the name of every file written for this comparison
groupsName <- "R3_VAR14_kmeans_padj0.1fc2"
#### Tab-delimited table with a header row; the first column becomes the row
#### names and the column names are kept exactly as written in the file
countsTable <- read.delim(
  "RNAseq2019July_5.txt",
  check.names = FALSE,
  row.names = 1
)
head(countsTable)
#### Background gene list for the GO enrichment further down
AllGeneNames <- countsTable$Gene_Symbol
#### NOTE(review): vennp and vennpq are not built in this section, so they
#### must already exist from an earlier section - confirm they are the
#### intended diagrams for this comparison
grid.arrange(
  gTree(children = vennp),
  gTree(children = vennpq),
  ncol = 2,
  top = "R3 Var14 no TNF"
)

#### Working copy of the table without row names; rows are addressed by
#### position from here on
tempA <- countsTable
rownames(tempA) <- NULL
#### Flag a gene "in" when, at any of the four timepoints, padj < 0.1 and
#### |log2FoldChange| > 1; all other genes are "out". The timepoints are
#### tested in the order 0h, 2h, 6h, 20h.
tempA$Include <- local({
  #### Per-gene test for one timepoint; a missing padj never passes
  passes <- function(tp) {
    padj <- tempA[[paste0("padj_R3noTNF_var14_vs_RBC_", tp)]]
    lfc <- tempA[[paste0("log2FoldChange_R3noTNF_var14_vs_RBC_", tp)]]
    padj < 0.1 & abs(lfc) > 1 & !is.na(padj)
  }
  ifelse(passes("0h"), "in",
    ifelse(passes("2h"), "in",
      ifelse(passes("6h"), "in",
        ifelse(passes("20h"), "in", "out"))))
})
#### Number of genes per flag
tempA %>%
  group_by(Include) %>%
  tally()
#### Row indexes of the selected genes
topDEgenes <- which(tempA$Include == "in")

NB Please check columns used and renamed for plots

#### Eight columns picked by position; the code below expects them to be
#### Var14noTNF_<time>_mean and RBCnoTNF_<time>_mean for 0h, 2h, 6h and 20h -
#### check the positions whenever the input table changes
baseMeansHm <- countsTable[, c(60:63, 79:82)]
baseMeansHm2 <- log2(baseMeansHm + 1)
#### Var14 vs RBC log fold change per timepoint = difference of the log2 means
for (tp in c("0h", "2h", "6h", "20h")) {
  baseMeansHm2[[paste0("Var14_RBC_", tp)]] <-
    baseMeansHm2[[paste0("Var14noTNF_", tp, "_mean")]] -
    baseMeansHm2[[paste0("RBCnoTNF_", tp, "_mean")]]
}
#### Columns 9-12: the four log fold changes; columns 1-8: the log2 means
baseMeansHm <- baseMeansHm2[, 9:12]
head(baseMeansHm)
baseMeansHmM <- baseMeansHm2[, 1:8]
head(baseMeansHmM)
#### Row indexes of the selected genes
topDEgenes <- which(tempA$Include == "in")

2. Hierarchical clustering of means (individual samples added for inspection)

#### Heatmap 1 - mean log fold changes of the selected genes, scaled per gene
#### (scale() works on columns, hence the transpose before and after)
dataHMm <- t(scale(t(as.matrix(baseMeansHm[topDEgenes, ]))))
#### Row labels (row name + gene symbol) and title font shared by all heatmaps
heatmapRowLabels <- paste0(rownames(dataHMm), " ", tempA[topDEgenes, ]$Gene_Symbol)
heatmapTitleFont <- gpar(fontsize = 16, fontface = "bold")
hmap_hier_factors4 <- Heatmap(
  dataHMm,
  name = "logfc",
  column_title = "Means logfc",
  column_title_gp = heatmapTitleFont,
  row_labels = heatmapRowLabels,
  col = col_funGR,
  width = unit(50, "mm"),
  cluster_columns = FALSE,
  show_row_names = FALSE
)

#### Heatmap 2 - the same scaled values relative to the 0h column, so that
#### the 0h column itself becomes 0
dataHMmPlot <- as.matrix(as.data.frame(dataHMm))
dataHMmPlot <- sweep(dataHMmPlot, 1, dataHMmPlot[, "Var14_RBC_0h"])
hmap_hier_factors6 <- Heatmap(
  dataHMmPlot,
  name = "Normalised logfc",
  column_title = "0h Normalised logfc",
  column_title_gp = heatmapTitleFont,
  row_labels = heatmapRowLabels,
  col = col_funGR2,
  width = unit(50, "mm"),
  cluster_columns = FALSE,
  show_row_names = FALSE
)

#### Heatmap 3 - log2 group means of the same genes, scaled per gene; the
#### columns are reordered so that columns 5-8 alternate with columns 1-4
dataHMm3 <- t(scale(t(as.matrix(baseMeansHmM[topDEgenes, ]))))
dataHMm3 <- dataHMm3[, c(5, 1, 6, 2, 7, 3, 8, 4)]
hmap_hier_factors5 <- Heatmap(
  dataHMm3,
  name = "Expression",
  column_title = "Means",
  column_title_gp = heatmapTitleFont,
  row_labels = heatmapRowLabels,
  col = col_fun,
  width = unit(100, "mm"),
  cluster_columns = FALSE,
  show_row_names = FALSE
)

#### Draw the three heatmaps side by side
hmap_hier_factors4 + hmap_hier_factors6 + hmap_hier_factors5

#### NOTE(review): par() only affects base graphics; the two fviz_nbclust
#### plots below are ggplot objects, so this setting presumably only reaches
#### the silhouette plot at the end - confirm it is still wanted
par(mfrow = c(1, 2))
#### Silhouette method
fviz_nbclust(dataHMm, kmeans, method = "silhouette", k.max = 16) +
  labs(subtitle = "Silhouette method")

#### Elbow method
fviz_nbclust(dataHMm, kmeans, method = "wss", k.max = 16) +
  labs(subtitle = "Elbow method")

####gap stat slow!!!
####set.seed(123)
####fviz_nbclust(dataHMm, kmeans, nstart = 25,  method = "gap_stat", nboot = 100,k.max = 16)+
####  labs(subtitle = "Gap statistic method")

#### k-means with 3 clusters. kmeans() starts from randomly chosen centres,
#### so the seed is fixed: otherwise cluster numbers and membership can
#### change between runs and the exported tables, GO results and printed
#### cluster lists would no longer agree with each other.
set.seed(123)
kclust4b <- kmeans(dataHMm, 3)
#silhouette plot
distK <- daisy(dataHMm)
plot(silhouette(kclust4b$cluster, distK), col = 1:3, border = NA)

3. K-means clustering of means

#### One heatmap slice per k-means cluster, labelled "Cluster\n<number>".
#### To control the slice order, turn `split` into a factor with the levels
#### in the wanted order.
split <- paste0("Cluster\n", kclust4b$cluster)
hmap_k <- Heatmap(
  dataHMm,
  name = "logfc",
  col = col_funGR,
  split = split,
  cluster_row_slices = FALSE,
  cluster_columns = FALSE,
  show_row_names = FALSE,
  width = unit(50, "mm"),
  column_title = "means logfc",
  column_title_gp = gpar(fontsize = 16, fontface = "bold")
)
#### Clustered heatmap next to the 0h-normalised and group-mean heatmaps
hmap_k + hmap_hier_factors6 + hmap_hier_factors5

Mean profiles of clusters

#### Number of genes per cluster
clustercount <- data.frame(kclust4b$cluster)
clustersizes <- table(clustercount$kclust4b.cluster)
#### Cluster centres, transposed so that each cluster is a column (X1, X2,
#### ...) and each row a timepoint, with the timepoint name in column "Sample"
clusterMeans <- data.frame(kclust4b$centers)
clusterMeans1 <- data.frame(t(clusterMeans))
clusterMeans1 <- cbind(Sample = rownames(clusterMeans1), clusterMeans1)
rownames(clusterMeans1) <- NULL
#### Order of the timepoints on the x axis (set manually)
orderN <- c("Var14_RBC_0h", "Var14_RBC_2h", "Var14_RBC_6h", "Var14_RBC_20h")

#### Line + point profile of the centre of cluster k, titled with the
#### cluster name and its number of genes
plotClusterProfile <- function(k) {
  clusterCol <- paste0("X", k)
  ggplot(data = clusterMeans1, aes(x = Sample, y = .data[[clusterCol]], group = 1)) +
    geom_line() +
    geom_point() +
    ggtitle(paste(paste0("Cluster ", clusterCol, " Profile "), clustersizes[k], " genes")) +
    scale_x_discrete(limits = orderN) +
    theme(axis.title.x = element_blank(), axis.title.y = element_blank())
}
#### One plot per cluster; add pX4, pX5, ... when more clusters are used
pX1 <- plotClusterProfile(1)
pX2 <- plotClusterProfile(2)
pX3 <- plotClusterProfile(3)
#plot
multiplot(pX1, pX2, pX3, cols = 2)

#### Export the k-means results of this comparison (cluster object kclust4b).
#### Row indexes and gene symbols of the genes flagged "in"
topDEgenes <- which(tempA$Include == "in")
tempAkm <- tempA[topDEgenes, ]
SymbolsKm <- tempAkm[["Gene_Symbol"]]

#### Cluster centres, then cluster id + symbol + scaled values for every gene
write.table(clusterMeans, paste0("ClusterMeansKm_", groupsName, ".txt"), sep = "\t")
ClusteredGenes <- data.frame(kclust4b$cluster, SymbolsKm, dataHMm)
write.table(ClusteredGenes, paste0("ScaledDataInClustersKm_", groupsName, ".txt"), sep = "\t")

#### Symbols of the genes flagged "out" (not clustered)
bottomDEgenes <- which(tempA$Include == "out")
bottomG <- tempA[bottomDEgenes, ][["Gene_Symbol"]]
write.table(bottomG, paste0("ipaBottomKmeans_", groupsName, ".txt"), sep = "\t")

#### Membership table: cluster id, symbol, row name and one "1"/"0" flag
#### column per cluster (add or remove x-columns to match the number of clusters)
ipaKmeans <- ClusteredGenes[, 1:2]
ipaKmeans$name2 <- rownames(ipaKmeans)
ipaKmeans <- mutate(
  ipaKmeans,
  x1 = ifelse(kclust4b.cluster == 1, "1", "0"),
  x2 = ifelse(kclust4b.cluster == 2, "1", "0"),
  x3 = ifelse(kclust4b.cluster == 3, "1", "0")
)
write.table(ipaKmeans, paste0("ipaKmeans_", groupsName, ".txt"), sep = "\t")
#### Cluster-assignment column on its own (kept for later inspection)
ClusteredGenes2 <- ClusteredGenes[c(1)]
#### Build one vector of gene symbols per k-means cluster for compareCluster().
#### Membership is read from the cluster column only (column 1 of
#### ClusteredGenes). Comparing the whole data frame with the cluster number
#### would also test the scaled expression columns, and any value there that
#### happened to equal the cluster number would append NA entries to the list.
clusterIds <- sort(unique(ClusteredGenes[[1]]))
listAll <- lapply(clusterIds, function(id) {
  ClusteredGenes$SymbolsKm[ClusteredGenes[[1]] == id]
})
#### Name the vectors X1, X2, ... after their cluster number
names(listAll) <- paste0("X", clusterIds)
#### To rearrange the order, index by name, e.g.
#listAll<-listAll[c("X3", "X1", "X2")]
lapply(listAll, head)
$X1
[1] "CYP1A1" "KCTD12" "TXNIP"  "DNAJB4" "PMAIP1" "USP53" 

$X2
[1] "PRR29"    "CLDN5"    "AMH"      "TRMT61A"  "DUS3L"    "EEF1A1P4"

$X3
[1] "ANGPTL4"    "KRT7"       "STARD4-AS1" "ZNF770"     "CEP295"    

4. Annotation of K-means clusters

  • CC cellular component
  • BP biological process
  • MF molecular function

The simplify function has been used to cut down on GO redundancy

#str(AllGeneNames)
#### GO cellular compartment (CC) enrichment: enrichGO is run on every
#### cluster in listAll, with AllGeneNames as the background and gene
#### symbols as identifiers
cgoCC <- compareCluster(
  geneCluster = listAll,
  fun = "enrichGO",
  universe = AllGeneNames,
  OrgDb = org.Hs.eg.db, #### human; org.Mm.eg.db is the mouse database
  keyType = "SYMBOL",
  ont = "CC",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.10
)
#### Cut down redundant GO terms (cutoff 0.7, selection by minimum p.adjust)
cgoCC2 <- simplify(cgoCC, cutoff = 0.7, by = "p.adjust", select_fun = min)
#### Save the simplified result table as a spreadsheet
write.csv(as.data.frame(cgoCC2), paste0("GO_CC_", groupsName, ".csv"))
#### Dot plot of up to 30 categories, x-axis labels rotated to vertical
dotplot(
  cgoCC2,
  showCategory = 30,
  title = paste0("GO Cellular Compartment ", groupsName)
) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Plots and GO data were written to files

#### Write the GO CC dot plot to a PNG file.
#### The plot is printed explicitly: a ggplot object is only drawn when it is
#### printed, and auto-printing does not happen when the script is run with
#### source() or from inside a function/loop, which would leave an empty file.
png(paste0("GO_CC_", groupsName, ".png"), width = 1224, height = 824)
print(
  dotplot(cgoCC2, showCategory = 30,
          title = paste0("GO Cellular Compartment ", groupsName)) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
)
dev.off()
null device 
          1 

GO BP

####BP
#### GO biological process enrichment: enrichGO is run on every cluster in
#### listAll, with AllGeneNames as the background and gene symbols as
#### identifiers
cgoBP <- compareCluster(geneCluster = listAll,
                      universe = AllGeneNames,
                      fun = "enrichGO",
                      OrgDb = org.Hs.eg.db,
                      keyType = "SYMBOL",
                      ont = "BP",
                      pvalueCutoff = 0.05,
                      qvalueCutoff = 0.10)
#### Cut down redundant GO terms (cutoff 0.7, selection by minimum p.adjust)
cgoBP2 <- simplify(cgoBP, cutoff = 0.7, by = "p.adjust", select_fun = min)
####write as spreadsheet
write.csv(as.data.frame(cgoBP2), paste0("GO_BP_", groupsName, ".csv"))

#### Build the dot plot once so the on-screen and the PNG version are identical
goPlotBP <- dotplot(cgoBP2, showCategory = 30,
                    title = paste0("GO Biological Process ", groupsName)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
print(goPlotBP)

#### Explicit print(): a ggplot is only drawn when printed, and auto-printing
#### does not happen under source() or inside a function/loop, which would
#### leave an empty PNG
png(paste0("GO_BP_", groupsName, ".png"), width = 1024, height = 1224)
print(goPlotBP)
dev.off()
null device 
          1 

GO MF

####MF
#### GO molecular function enrichment: enrichGO is run on every cluster in
#### listAll, with AllGeneNames as the background and gene symbols as
#### identifiers
cgoMF <- compareCluster(geneCluster = listAll,
                      universe = AllGeneNames,
                      fun = "enrichGO",
                      OrgDb = org.Hs.eg.db,
                      keyType = "SYMBOL",
                      ont = "MF",
                      pvalueCutoff = 0.05,
                      qvalueCutoff = 0.10)
#### Cut down redundant GO terms (cutoff 0.7, selection by minimum p.adjust)
cgoMF2 <- simplify(cgoMF, cutoff = 0.7, by = "p.adjust", select_fun = min)
####write as spreadsheet
write.csv(as.data.frame(cgoMF2), paste0("GO_MF_", groupsName, ".csv"))

#### Build the dot plot once so the on-screen and the PNG version are identical
goPlotMF <- dotplot(cgoMF2, showCategory = 30,
                    title = paste0("GO Molecular Function  ", groupsName)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
print(goPlotMF)

#### Explicit print(): a ggplot is only drawn when printed, and auto-printing
#### does not happen under source() or inside a function/loop, which would
#### leave an empty PNG
png(paste0("GO_MF_", groupsName, ".png"), width = 1424, height = 824)
print(goPlotMF)
dev.off()
null device 
          1 

R5 VAR14 vs RBC TNF k-means q0.05

1. Genelist Selection

#### Tag used in the name of every file written for this comparison
groupsName <- "R5_VAR14_RBC_TNF_kmeans_q0.05"
countsTable <- read.delim("RNAseq2019July_5.txt", header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)
head(countsTable)
#### Background gene list for the GO enrichment further down
AllGeneNames <- countsTable$Gene_Symbol
tempA <- countsTable

#### ---- Venn diagrams comparing gene lists obtained with different cut-offs
timepointsR5 <- c("0h", "2h", "6h", "20h")
vennLabelsR5 <- paste0("Var14TNF_", timepointsR5)

#### Column of tempA holding one statistic ("padj", "pvalue" or
#### "log2FoldChange") of the R5 TNF Var14 vs RBC contrast at one timepoint
r5Stat <- function(stat, tp) {
  tempA[[paste0(stat, "_R5_TNF_var14_vs_RBC_", tp)]]
}
#### Per-gene filters for one timepoint; a missing p-value never passes
passPadj05 <- function(tp) {
  padj <- r5Stat("padj", tp)
  padj < 0.05 & !is.na(padj)
}
passPvalFc <- function(tp) {
  pval <- r5Stat("pvalue", tp)
  pval < 0.05 & abs(r5Stat("log2FoldChange", tp)) > 1 & !is.na(pval)
}
passPadj10Fc <- function(tp) {
  padj <- r5Stat("padj", tp)
  padj < 0.1 & abs(r5Stat("log2FoldChange", tp)) > 1 & !is.na(padj)
}
#### Gene symbols of the rows where a filter is TRUE
symbolsWhere <- function(mask) {
  tempA$Gene_Symbol[which(mask)]
}
#### One symbol list per timepoint / one list of genes passing at any timepoint
perTimepoint <- function(passFun) {
  lapply(timepointsR5, function(tp) symbolsWhere(passFun(tp)))
}
anyTimepoint <- function(passFun) {
  symbolsWhere(Reduce(`|`, lapply(timepointsR5, passFun)))
}
#### Four-set Venn diagram over the timepoints, returned as grobs (no file)
vennR5 <- function(geneLists, main) {
  venn.diagram(x = geneLists,
               category.names = vennLabelsR5,
               main = main,
               filename = NULL, scaled = FALSE, fill = colorsV4, cat.col = colorsV4,
               cat.cex = 1, cat.dist = 0.3, margin = 0.3)
}

#### Overlap between timepoints for each cut-off. The first label of the
#### padj<0.05 diagram used to read "Var14nNF_0h"; it now matches the others.
vennq <- vennR5(perTimepoint(passPadj05), "padj<0.05")
vennp <- vennR5(perTimepoint(passPvalFc), "pvalue<0.05&fold change>2")
vennq2 <- vennR5(perTimepoint(passPadj10Fc), "padj0.1&fold change>2")

#### Overlap between the three cut-offs, each taken over all timepoints
listA <- anyTimepoint(passPadj05)
listB <- anyTimepoint(passPadj10Fc)
listC <- anyTimepoint(passPvalFc)
vennpq <- venn.diagram(x = list(listA, listB, listC),
            category.names = c("padj<0.05", "padj<0.1&fc>2", "p<0.05&fc>2"),
            main = "padj compared to pvalue",
            filename = NULL, scaled = FALSE, fill = colorsV3, cat.col = colorsV3,
            cat.cex = 1, cat.dist = 0.1, margin = 0.15)
#### Panel title names this section's comparison (R5); it previously said "R2"
grid.arrange(gTree(children = vennq), gTree(children = vennpq), ncol = 2, top = "R5 Var14 TNF")

#### Working copy of the table without row names; rows are addressed by
#### position from here on
tempA <- countsTable
rownames(tempA) <- NULL
#### A gene is "in" when padj < 0.05 at any of the four timepoints (a
#### missing padj counts as not significant); all other genes are "out"
sigAnyTimepoint <- Reduce(`|`, lapply(c("0h", "2h", "6h", "20h"), function(tp) {
  padj <- tempA[[paste0("padj_R5_TNF_var14_vs_RBC_", tp)]]
  padj < 0.05 & !is.na(padj)
}))
tempA <- mutate(tempA, Include = ifelse(sigAnyTimepoint, "in", "out"))
#### Number of genes per flag
tempA %>%
  group_by(Include) %>%
  tally()
#### Row indexes of the selected genes
topDEgenes <- which(tempA$Include == "in")
head(countsTable)

NB Please check columns used and renamed for plots

#### Eight columns picked by position; the code below expects them to be
#### Var14TNF_<time>_mean and RBC_TNF_<time>_mean for 0h, 2h, 6h and 20h -
#### check the positions whenever the input table changes
baseMeansHm <- countsTable[, c(110:113, 129:132)]
head(baseMeansHm)
baseMeansHm2 <- log2(baseMeansHm + 1)
#### Var14 vs RBC log fold change per timepoint = difference of the log2 means
for (tp in c("0h", "2h", "6h", "20h")) {
  baseMeansHm2[[paste0("Var14_RBC_", tp)]] <-
    baseMeansHm2[[paste0("Var14TNF_", tp, "_mean")]] -
    baseMeansHm2[[paste0("RBC_TNF_", tp, "_mean")]]
}
#### Columns 9-12: the four log fold changes; columns 1-8: the log2 means
baseMeansHm <- baseMeansHm2[, 9:12]
head(baseMeansHm)
baseMeansHmM <- baseMeansHm2[, 1:8]
head(baseMeansHmM)
#### Row indexes of the selected genes
topDEgenes <- which(tempA$Include == "in")

2. Hierarchical clustering of means (individual samples added for inspection)

#### Heatmap 1 - mean log fold changes of the selected genes, scaled per gene
#### (scale() works on columns, hence the transpose before and after)
dataHMm <- t(scale(t(as.matrix(baseMeansHm[topDEgenes, ]))))
#### Row labels (row name + gene symbol) and title font shared by all heatmaps
heatmapRowLabels <- paste0(rownames(dataHMm), " ", tempA[topDEgenes, ]$Gene_Symbol)
heatmapTitleFont <- gpar(fontsize = 16, fontface = "bold")
hmap_hier_factors4 <- Heatmap(
  dataHMm,
  name = "logfc",
  column_title = "Means logfc",
  column_title_gp = heatmapTitleFont,
  row_labels = heatmapRowLabels,
  col = col_funGR,
  width = unit(50, "mm"),
  cluster_columns = FALSE,
  show_row_names = FALSE
)

#### Heatmap 2 - the same scaled values relative to the 0h column, so that
#### the 0h column itself becomes 0
dataHMmPlot <- as.matrix(as.data.frame(dataHMm))
dataHMmPlot <- sweep(dataHMmPlot, 1, dataHMmPlot[, "Var14_RBC_0h"])
hmap_hier_factors6 <- Heatmap(
  dataHMmPlot,
  name = "Normalised logfc",
  column_title = "0h Normalised logfc",
  column_title_gp = heatmapTitleFont,
  row_labels = heatmapRowLabels,
  col = col_funGR2,
  width = unit(50, "mm"),
  cluster_columns = FALSE,
  show_row_names = FALSE
)

#### Heatmap 3 - log2 group means of the same genes, scaled per gene; the
#### columns are reordered so that columns 5-8 alternate with columns 1-4
dataHMm3 <- t(scale(t(as.matrix(baseMeansHmM[topDEgenes, ]))))
dataHMm3 <- dataHMm3[, c(5, 1, 6, 2, 7, 3, 8, 4)]
hmap_hier_factors5 <- Heatmap(
  dataHMm3,
  name = "Expression",
  column_title = "Means",
  column_title_gp = heatmapTitleFont,
  row_labels = heatmapRowLabels,
  col = col_fun,
  width = unit(100, "mm"),
  cluster_columns = FALSE,
  show_row_names = FALSE
)

#### Draw the three heatmaps side by side
hmap_hier_factors4 + hmap_hier_factors6 + hmap_hier_factors5

#### NOTE(review): par() only affects base graphics; the two fviz_nbclust
#### plots below are ggplot objects, so this setting presumably only reaches
#### the silhouette plot at the end - confirm it is still wanted
par(mfrow = c(1, 2))
#### Silhouette method
fviz_nbclust(dataHMm, kmeans, method = "silhouette", k.max = 16) +
  labs(subtitle = "Silhouette method")

#### Elbow method
fviz_nbclust(dataHMm, kmeans, method = "wss", k.max = 16) +
  labs(subtitle = "Elbow method")

####gap stat slow!!!
####set.seed(123)
####fviz_nbclust(dataHMm, kmeans, nstart = 25,  method = "gap_stat", nboot = 100,k.max = 16)+
####  labs(subtitle = "Gap statistic method")

#### k-means with 6 clusters. kmeans() starts from randomly chosen centres,
#### so the seed is fixed: otherwise cluster numbers and membership can
#### change between runs and the exported tables, GO results and printed
#### cluster lists would no longer agree with each other.
set.seed(123)
kclust7 <- kmeans(dataHMm, 6)
#silhouette plot
distK <- daisy(dataHMm)
plot(silhouette(kclust7$cluster, distK), col = 1:6, border = NA)

3. K-means clustering of means

#### One heatmap slice per k-means cluster, labelled "Cluster\n<number>".
#### To control the slice order, turn `split` into a factor with the levels
#### in the wanted order.
split <- paste0("Cluster\n", kclust7$cluster)
hmap_k <- Heatmap(
  dataHMm,
  name = "logfc",
  col = col_funGR,
  split = split,
  cluster_row_slices = FALSE,
  cluster_columns = FALSE,
  show_row_names = FALSE,
  width = unit(50, "mm"),
  column_title = "means logfc",
  column_title_gp = gpar(fontsize = 16, fontface = "bold")
)
#### Clustered heatmap next to the 0h-normalised and group-mean heatmaps
hmap_k + hmap_hier_factors6 + hmap_hier_factors5

Mean profiles of clusters

#### Number of genes per cluster
clustercount <- data.frame(kclust7$cluster)
clustersizes <- table(clustercount$kclust7.cluster)
#### Cluster centres, transposed so that each cluster is a column (X1, X2,
#### ...) and each row a timepoint, with the timepoint name in column "Sample"
clusterMeans <- data.frame(kclust7$centers)
clusterMeans1 <- data.frame(t(clusterMeans))
clusterMeans1 <- cbind(Sample = rownames(clusterMeans1), clusterMeans1)
rownames(clusterMeans1) <- NULL
#### Order of the timepoints on the x axis (set manually)
orderN <- c("Var14_RBC_0h", "Var14_RBC_2h", "Var14_RBC_6h", "Var14_RBC_20h")

#### Line + point profile of the centre of cluster k, titled with the
#### cluster name and its number of genes
plotClusterProfile <- function(k) {
  clusterCol <- paste0("X", k)
  ggplot(data = clusterMeans1, aes(x = Sample, y = .data[[clusterCol]], group = 1)) +
    geom_line() +
    geom_point() +
    ggtitle(paste(paste0("Cluster ", clusterCol, " Profile "), clustersizes[k], " genes")) +
    scale_x_discrete(limits = orderN) +
    theme(axis.title.x = element_blank(), axis.title.y = element_blank())
}
#### One plot per cluster
pX1 <- plotClusterProfile(1)
pX2 <- plotClusterProfile(2)
pX3 <- plotClusterProfile(3)
pX4 <- plotClusterProfile(4)
pX5 <- plotClusterProfile(5)
pX6 <- plotClusterProfile(6)
#plot
multiplot(pX1, pX2, pX3, pX4, pX5, pX6, cols = 2)

#### Export the k-means results of this comparison (cluster object kclust7).
#### Row indexes and gene symbols of the genes flagged "in"
topDEgenes <- which(tempA$Include == "in")
tempAkm <- tempA[topDEgenes, ]
SymbolsKm <- tempAkm[["Gene_Symbol"]]

#### Cluster centres, then cluster id + symbol + scaled values for every gene
write.table(clusterMeans, paste0("ClusterMeansKm_", groupsName, ".txt"), sep = "\t")
ClusteredGenes <- data.frame(kclust7$cluster, SymbolsKm, dataHMm)
write.table(ClusteredGenes, paste0("ScaledDataInClustersKm_", groupsName, ".txt"), sep = "\t")

#### Symbols of the genes flagged "out" (not clustered)
bottomDEgenes <- which(tempA$Include == "out")
bottomG <- tempA[bottomDEgenes, ][["Gene_Symbol"]]
write.table(bottomG, paste0("ipaBottomKmeans_", groupsName, ".txt"), sep = "\t")

#### Membership table: cluster id, symbol, row name and one "1"/"0" flag
#### column per cluster (add or remove x-columns to match the number of clusters)
ipaKmeans <- ClusteredGenes[, 1:2]
ipaKmeans$name2 <- rownames(ipaKmeans)
ipaKmeans <- mutate(
  ipaKmeans,
  x1 = ifelse(kclust7.cluster == 1, "1", "0"),
  x2 = ifelse(kclust7.cluster == 2, "1", "0"),
  x3 = ifelse(kclust7.cluster == 3, "1", "0"),
  x4 = ifelse(kclust7.cluster == 4, "1", "0"),
  x5 = ifelse(kclust7.cluster == 5, "1", "0"),
  x6 = ifelse(kclust7.cluster == 6, "1", "0")
)
write.table(ipaKmeans, paste0("ipaKmeans_", groupsName, ".txt"), sep = "\t")
#### Cluster-assignment column on its own (kept for later inspection)
ClusteredGenes2 <- ClusteredGenes[c(1)]
#### Build one vector of gene symbols per k-means cluster for compareCluster().
#### Membership is read from the cluster column only (column 1 of
#### ClusteredGenes). Comparing the whole data frame with the cluster number
#### would also test the scaled expression columns, and any value there that
#### happened to equal the cluster number would append NA entries to the list.
clusterIds <- sort(unique(ClusteredGenes[[1]]))
listAll <- lapply(clusterIds, function(id) {
  ClusteredGenes$SymbolsKm[ClusteredGenes[[1]] == id]
})
#### Name the vectors X1, X2, ... after their cluster number
names(listAll) <- paste0("X", clusterIds)
#### To rearrange the order, index by name, e.g.
#listAll<-listAll[c("X3", "X2", "X6", "X5", "X4", "X1")]
#lapply(listAll, head)

4. Annotation of K-means clusters

  • CC cellular component
  • BP biological process
  • MF molecular function

The simplify function has been used to cut down on GO redundancy

#str(AllGeneNames)
#### GO "CC" enrichment for every cluster in listAll, with AllGeneNames as
#### the background (universe)
cgoCC <- compareCluster(
  geneCluster = listAll,
  fun = "enrichGO",
  universe = AllGeneNames,
  OrgDb = org.Hs.eg.db, #### human; swap for org.Mm.eg.db for mouse
  keyType = "SYMBOL",
  ont = "CC",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.10
)
#### reduce redundant GO terms
cgoCC2 <- simplify(cgoCC, cutoff = 0.7, by = "p.adjust", select_fun = min)
#### spreadsheet of the simplified result
write.csv(
  as.data.frame(cgoCC2),
  file = paste0("GO_CC_", groupsName, ".csv")
)
#### dot plot shown in the notebook
dotplot(
  cgoCC2,
  showCategory = 30,
  title = paste0("GO Cellular Compartment ", groupsName)
) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Plots and GO data were written to files

#### Write the GO CC dot plot to a PNG file.
#### print() is explicit so the file is not left blank when this code is run
#### through source() or from inside a function/loop, where ggplot objects
#### are not printed automatically.
png(paste0("GO_CC_",groupsName,".png"), width = 1224, height = 824)
print(
  dotplot(cgoCC2,showCategory = 30,
          title = paste0("GO Cellular Compartment ",groupsName))+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
)
dev.off()
null device 
          1 

GO BP

####BP
#### GO "BP" enrichment for every cluster in listAll, with AllGeneNames as
#### the background (universe)
cgoBP <- compareCluster(geneCluster = listAll, 
                      universe = AllGeneNames,
                      fun = "enrichGO",
                      OrgDb=org.Hs.eg.db,
                      keyType="SYMBOL",
                      ont = "BP", 
                      pvalueCutoff=0.05,
                      qvalueCutoff = 0.10)
#### reduce redundant GO terms
cgoBP2 <- simplify(cgoBP, cutoff=0.7, by="p.adjust", select_fun=min)
####write as spreadsheet
write.csv(as.data.frame(cgoBP2),paste0("GO_BP_",groupsName,".csv"))
dotplot(cgoBP2,showCategory = 30,
        title = paste0("GO Biological Process ",groupsName))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

#### Same plot written to a PNG file. print() is explicit so the file is not
#### left blank when this code is run through source() or from inside a
#### function/loop, where ggplot objects are not printed automatically.
png(paste0("GO_BP_",groupsName,".png"), width = 1024, height = 1224)
print(
  dotplot(cgoBP2,showCategory = 30,
          title = paste0("GO Biological Process ",groupsName))+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
)
dev.off()
null device 
          1 

GO MF

####MF
#### GO "MF" enrichment for every cluster in listAll, with AllGeneNames as
#### the background (universe)
cgoMF <- compareCluster(geneCluster = listAll, 
                      universe = AllGeneNames,
                      fun = "enrichGO",
                      OrgDb=org.Hs.eg.db, 
                      keyType="SYMBOL",
                      ont = "MF", 
                      pvalueCutoff=0.05,
                      qvalueCutoff = 0.10)
#### reduce redundant GO terms
cgoMF2 <- simplify(cgoMF, cutoff=0.7, by="p.adjust", select_fun=min)
####write as spreadsheet
write.csv(as.data.frame(cgoMF2),paste0("GO_MF_",groupsName,".csv"))
dotplot(cgoMF2,showCategory = 30,
        title = paste0("GO Molecular Function  ",groupsName))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

#### Same plot written to a PNG file. print() is explicit so the file is not
#### left blank when this code is run through source() or from inside a
#### function/loop, where ggplot objects are not printed automatically.
png(paste0("GO_MF_",groupsName,".png"), width = 1424, height = 624)
print(
  dotplot(cgoMF2,showCategory = 30,
          title = paste0("GO Molecular Function  ",groupsName))+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
)
dev.off()
null device 
          1 

R5 VAR14 vs RBC TNF k-means p0.05fc2

1. Genelist Selection

#### Label used in every output file name and plot title of this section
groupsName <- "R5_Var14vRBC_TNF_kmeans_p0.05fc2"
#### Tab-delimited results table; the first column supplies the row names and
#### column names are kept exactly as written in the file
countsTable <- read.delim(
  "RNAseq2019July_5.txt",
  header = TRUE,
  sep = "\t",
  row.names = 1,
  check.names = FALSE
)
head(countsTable)
#### All gene symbols in the table: the background (universe) for GO analysis
AllGeneNames <- countsTable[["Gene_Symbol"]]
#head(AllGeneNames)
#### Two previously built Venn diagrams side by side
grid.arrange(
  gTree(children = vennp),
  gTree(children = vennpq),
  ncol = 2,
  top = "R5 Var14 TNF"
)

#tempA<-resAll[-c(10:30) ]
#### Flag every gene as "in" (selected) or "out".
#### A gene is selected when, at any of the four time points, its p-value is
#### below pCutoff and its absolute log2 fold change is above lfcCutoff
#### (1 on the log2 scale = two-fold).
#### A missing p-value or fold change counts as "not significant at that time
#### point". Previously a missing fold change next to a significant p-value
#### turned Include into NA, which hid hits at later time points and dropped
#### the gene from both the "in" and the "out" list.
tempA <- countsTable
rownames(tempA) <- NULL
pCutoff <- 0.05
lfcCutoff <- 1
timePoints <- c("0h", "2h", "6h", "20h")
pCols <- paste0("pvalue_R5_TNF_var14_vs_RBC_", timePoints)
lfcCols <- paste0("log2FoldChange_R5_TNF_var14_vs_RBC_", timePoints)
#### stop early with a clear error if the table lacks an expected column
stopifnot(all(c(pCols, lfcCols) %in% names(tempA)))
passesCutoffs <- function(pCol, lfcCol) {
  p <- tempA[[pCol]]
  lfc <- tempA[[lfcCol]]
  !is.na(p) & !is.na(lfc) & p < pCutoff & abs(lfc) > lfcCutoff
}
#### TRUE when the gene passes at one or more time points
selected <- Reduce(`|`, Map(passesCutoffs, pCols, lfcCols))
tempA$Include <- ifelse(selected, "in", "out")
#tempA
####library(dplyr)
#### number of genes per Include value
tempA %>%
     group_by(Include) %>% 
     tally()
topDEgenes <- which(tempA$Include=="in")####find indexes 

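NB: Please check that the columns selected by position below, and the new column names derived from them, match your counts table before using the plots.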

#baseMeansHm <-countsTable[,c(60:63)]
#### Eight mean-expression columns, picked by position
baseMeansHm <- countsTable[, c(110:113, 129:132)]
head(baseMeansHm)
#### log2(x + 1) transform, then the Var14 minus RBC difference at each time
#### point is appended as columns 9 to 12
baseMeansHm2 <- log2(baseMeansHm + 1)
baseMeansHm2$Var14_RBC_0h <-
  baseMeansHm2[["Var14TNF_0h_mean"]] - baseMeansHm2[["RBC_TNF_0h_mean"]]
baseMeansHm2$Var14_RBC_2h <-
  baseMeansHm2[["Var14TNF_2h_mean"]] - baseMeansHm2[["RBC_TNF_2h_mean"]]
baseMeansHm2$Var14_RBC_6h <-
  baseMeansHm2[["Var14TNF_6h_mean"]] - baseMeansHm2[["RBC_TNF_6h_mean"]]
baseMeansHm2$Var14_RBC_20h <-
  baseMeansHm2[["Var14TNF_20h_mean"]] - baseMeansHm2[["RBC_TNF_20h_mean"]]
#### the four difference columns
baseMeansHm <- baseMeansHm2[, 9:12]
head(baseMeansHm)
#### the eight log2 mean columns
baseMeansHmM <- baseMeansHm2[, 1:8]
head(baseMeansHmM)
#### row indexes of the selected genes
topDEgenes <- which(tempA$Include == "in")

2. Hierarchical clustering of means

####mean logfc
#### Selected genes only; each gene (row) is centred and scaled across the
#### four Var14-vs-RBC difference columns
dataHMm <- t(scale(t(as.matrix(baseMeansHm[topDEgenes, ]))))
hmap_hier_factors4 <- Heatmap(
  dataHMm,
  name = "logfc",
  col = col_funGR,
  column_title = "Means logfc",
  column_title_gp = gpar(fontsize = 16, fontface = "bold"),
  row_labels = paste0(rownames(dataHMm), " ", tempA$Gene_Symbol[topDEgenes]),
  show_row_names = FALSE,
  cluster_columns = FALSE,
  width = unit(50, "mm")
)

#### Same values with the 0h column subtracted from every column of each
#### gene, so the 0h column becomes zero
dataHMmPlot <- sweep(dataHMm, 1, dataHMm[, "Var14_RBC_0h"])
hmap_hier_factors6 <- Heatmap(
  dataHMmPlot,
  name = "Normalised logfc",
  col = col_funGR2,
  column_title = "0h Normalised logfc",
  column_title_gp = gpar(fontsize = 16, fontface = "bold"),
  row_labels = paste0(rownames(dataHMm), " ", tempA$Gene_Symbol[topDEgenes]),
  show_row_names = FALSE,
  cluster_columns = FALSE,
  width = unit(50, "mm")
)

####means
#### Row-scaled log2 means of the selected genes; the column order interleaves
#### columns 5-8 with columns 1-4
dataHMm3 <- t(scale(t(as.matrix(baseMeansHmM[topDEgenes, ]))))
dataHMm3 <- dataHMm3[, c(5, 1, 6, 2, 7, 3, 8, 4)]
hmap_hier_factors5 <- Heatmap(
  dataHMm3,
  name = "Expression",
  col = col_fun,
  column_title = "Means",
  column_title_gp = gpar(fontsize = 16, fontface = "bold"),
  row_labels = paste0(rownames(dataHMm), " ", tempA$Gene_Symbol[topDEgenes]),
  show_row_names = FALSE,
  cluster_columns = FALSE,
  width = unit(100, "mm")
)

#### draw the three heatmaps next to each other
hmap_hier_factors4 + hmap_hier_factors6 + hmap_hier_factors5

#### NOTE(review): par(mfrow) applies to base graphics only; the two
#### fviz_nbclust() plots below are ggplot objects and are not arranged by it.
par(mfrow=c(1,2))
#### Silhouette method
fviz_nbclust(dataHMm, kmeans, method = "silhouette",k.max = 16)+
  labs(subtitle = "Silhouette method")

#### Elbow method
fviz_nbclust(dataHMm, kmeans, method = "wss",k.max = 16) +
  labs(subtitle = "Elbow method")

####gap stat slow!!!
####set.seed(123)
####fviz_nbclust(dataHMm, kmeans, nstart = 25,  method = "gap_stat", nboot = 100,k.max = 16)+
####  labs(subtitle = "Gap statistic method")
#### k-means with 6 clusters on the scaled data.
#### kmeans() picks random start centres, so the seed is fixed here: without
#### it the cluster numbering, and every table, plot and GO result derived
#### from it below, changes each time the notebook is run.
set.seed(123)
kclust8 <- kmeans(dataHMm, 6)
#silhouette plot
distK<-daisy(dataHMm)
plot(silhouette(kclust8$cluster, distK), col=1:6, border=NA)

3. K-means clustering of means

#### One row-slice label per gene, built from its k-means cluster number
split <- paste0("Cluster\n", kclust8$cluster)
#split <- factor(paste0("Cluster\n", kclust3$cluster), levels=c("Cluster\n3","Cluster\n1","Cluster\n4","Cluster\n5","Cluster\n2","Cluster\n6"))
#### Heatmap of the scaled data, rows split by cluster, slices left unclustered
hmap_k <- Heatmap(
  dataHMm,
  name = "logfc",
  col = col_funGR,
  split = split,
  cluster_row_slices = FALSE,
  cluster_columns = FALSE,
  show_row_names = FALSE,
  column_title = "means logfc",
  column_title_gp = gpar(fontsize = 16, fontface = "bold"),
  width = unit(50, "mm")
)
#### drawn next to the two heatmaps built in the hierarchical section
hmap_k + hmap_hier_factors6 + hmap_hier_factors5

Mean profiles of clusters

#### Cluster sizes and cluster centres from the k-means fit
clustercount <- data.frame(kclust8$cluster)
clustersizes <- table(clustercount$kclust8.cluster)
clusterMeans <- data.frame(kclust8$centers)
#### Transposed centres: one column per cluster (X1..X6), one row per time
#### point, with the time point label in the Sample column
clusterMeans1 <- data.frame(t(clusterMeans))
clusterMeans1 <- cbind(Sample = rownames(clusterMeans1), clusterMeans1)
rownames(clusterMeans1) <- NULL
orderN <- c("Var14_RBC_0h", "Var14_RBC_2h", "Var14_RBC_6h", "Var14_RBC_20h") #### manual
####clusterMeans1
#### Layers shared by all six profile plots: line + points, x axis in orderN
#### order, no axis titles
profileLayers <- list(
  geom_line(),
  geom_point(),
  scale_x_discrete(limits = orderN),
  theme(axis.title.x = element_blank(), axis.title.y = element_blank())
)
pX1 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X1, group = 1)) +
  profileLayers +
  ggtitle(paste("Cluster X1 Profile ", clustersizes[1], " genes"))
pX2 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X2, group = 1)) +
  profileLayers +
  ggtitle(paste("Cluster X2 Profile ", clustersizes[2], " genes"))
pX3 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X3, group = 1)) +
  profileLayers +
  ggtitle(paste("Cluster X3 Profile ", clustersizes[3], " genes"))
pX4 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X4, group = 1)) +
  profileLayers +
  ggtitle(paste("Cluster X4 Profile ", clustersizes[4], " genes"))
pX5 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X5, group = 1)) +
  profileLayers +
  ggtitle(paste("Cluster X5 Profile ", clustersizes[5], " genes"))
pX6 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X6, group = 1)) +
  profileLayers +
  ggtitle(paste("Cluster X6 Profile ", clustersizes[6], " genes"))
#### all six profiles on one page, two columns
multiplot(pX1, pX2, pX3, pX4, pX5, pX6, cols = 2)

#### Symbols of the selected genes (Include == "in")
topDEgenes <- which(tempA$Include == "in")
tempAkm <- tempA[topDEgenes, ]
SymbolsKm <- dplyr::pull(tempAkm, Gene_Symbol)

#### export the gene expression data for the clusters
write.table(
  clusterMeans,
  file = paste0("ClusterMeansKm_", groupsName, ".txt"),
  sep = "\t"
)
#### Cluster id, gene symbol and scaled values per gene; data.frame() names
#### the first column kclust8.cluster after the expression passed in
ClusteredGenes <- data.frame(kclust8$cluster, SymbolsKm, dataHMm)
write.table(
  ClusteredGenes,
  file = paste0("ScaledDataInClustersKm_", groupsName, ".txt"),
  sep = "\t"
)

#### Symbols of the genes that were not selected (Include == "out")
bottomDEgenes <- which(tempA$Include == "out")
bottomG <- dplyr::pull(tempA[bottomDEgenes, ], Gene_Symbol)
write.table(
  bottomG,
  file = paste0("ipaBottomKmeans_", groupsName, ".txt"),
  sep = "\t"
)

#### Membership table: cluster id and symbol, the row names copied into
#### name2, and one "1"/"0" indicator column per cluster
ipaKmeans <- ClusteredGenes[, 1:2]
ipaKmeans$name2 <- rownames(ipaKmeans)
ipaKmeans <- mutate(
  ipaKmeans,
  x1 = ifelse(kclust8.cluster == 1, "1", "0"),
  x2 = ifelse(kclust8.cluster == 2, "1", "0"),
  x3 = ifelse(kclust8.cluster == 3, "1", "0"),
  x4 = ifelse(kclust8.cluster == 4, "1", "0"),
  x5 = ifelse(kclust8.cluster == 5, "1", "0"),
  x6 = ifelse(kclust8.cluster == 6, "1", "0")
)
write.table(
  ipaKmeans,
  file = paste0("ipaKmeans_", groupsName, ".txt"),
  sep = "\t"
)
#### Cluster id column kept as a one-column data frame
ClusteredGenes2 <- ClusteredGenes[1]
#### Build one vector of gene symbols per k-means cluster; the named list is
#### the geneCluster input of compareCluster() further down.
#### Membership is read from the first column of ClusteredGenes (the cluster
#### id) only. The previous subset(..., ClusteredGenes==i) compared every
#### column of the data frame with i, so any match outside the cluster column
#### (e.g. a scaled value exactly equal to i) appended NA entries to the list.
listAll<-list()
for(i in 1:6) {
  inCluster <- !is.na(ClusteredGenes[[1]]) & ClusteredGenes[[1]] == i
  clusterName <- ClusteredGenes$SymbolsKm[inCluster]
  listAll[[i]]<-clusterName
}
#### name the vectors in the list: one name per cluster (6 clusters here)
names(listAll)<-c("X1", "X2", "X3", "X4","X5", "X6")
#if you want to rearrange the order
#listAll<-listAll[c("X3", "X1", "X4", "X5", "X2", "X6")]
#lapply(listAll, head)

4. Annotation of K-means clusters

  • CC cellular component
  • BP biological process
  • MF molecular function

The simplify function has been used to cut down on GO redundancy

#str(AllGeneNames)
#### GO "CC" enrichment for every cluster in listAll, with AllGeneNames as
#### the background (universe)
cgoCC <- compareCluster(
  geneCluster = listAll,
  fun = "enrichGO",
  universe = AllGeneNames,
  OrgDb = org.Hs.eg.db, #### human; swap for org.Mm.eg.db for mouse
  keyType = "SYMBOL",
  ont = "CC",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.10
)
#### reduce redundant GO terms
cgoCC2 <- simplify(cgoCC, cutoff = 0.7, by = "p.adjust", select_fun = min)
#### spreadsheet of the simplified result
write.csv(
  as.data.frame(cgoCC2),
  file = paste0("GO_CC_", groupsName, ".csv")
)
#### dot plot shown in the notebook
dotplot(
  cgoCC2,
  showCategory = 30,
  title = paste0("GO Cellular Compartment ", groupsName)
) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Plots and GO data were written to files

#### Write the GO CC dot plot to a PNG file.
#### print() is explicit so the file is not left blank when this code is run
#### through source() or from inside a function/loop, where ggplot objects
#### are not printed automatically.
png(paste0("GO_CC_",groupsName,".png"), width = 1224, height = 824)
print(
  dotplot(cgoCC2,showCategory = 30,
          title = paste0("GO Cellular Compartment ",groupsName))+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
)
dev.off()
null device 
          1 

GO BP

####BP
#### GO "BP" enrichment for every cluster in listAll, with AllGeneNames as
#### the background (universe)
cgoBP <- compareCluster(geneCluster = listAll, 
                      universe = AllGeneNames,
                      fun = "enrichGO",
                      OrgDb=org.Hs.eg.db,
                      keyType="SYMBOL",
                      ont = "BP", 
                      pvalueCutoff=0.05,
                      qvalueCutoff = 0.10)
#### reduce redundant GO terms
cgoBP2 <- simplify(cgoBP, cutoff=0.7, by="p.adjust", select_fun=min)
####write as spreadsheet
write.csv(as.data.frame(cgoBP2),paste0("GO_BP_",groupsName,".csv"))
dotplot(cgoBP2,showCategory = 30,
        title = paste0("GO Biological Process ",groupsName))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

#### Same plot written to a PNG file. print() is explicit so the file is not
#### left blank when this code is run through source() or from inside a
#### function/loop, where ggplot objects are not printed automatically.
png(paste0("GO_BP_",groupsName,".png"), width = 1024, height = 1224)
print(
  dotplot(cgoBP2,showCategory = 30,
          title = paste0("GO Biological Process ",groupsName))+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
)
dev.off()
null device 
          1 

GO MF

####MF
#### GO "MF" enrichment for every cluster in listAll, with AllGeneNames as
#### the background (universe)
cgoMF <- compareCluster(geneCluster = listAll, 
                      universe = AllGeneNames,
                      fun = "enrichGO",
                      OrgDb=org.Hs.eg.db, 
                      keyType="SYMBOL",
                      ont = "MF", 
                      pvalueCutoff=0.05,
                      qvalueCutoff = 0.10)
#### reduce redundant GO terms
cgoMF2 <- simplify(cgoMF, cutoff=0.7, by="p.adjust", select_fun=min)
####write as spreadsheet
write.csv(as.data.frame(cgoMF2),paste0("GO_MF_",groupsName,".csv"))
dotplot(cgoMF2,showCategory = 30,
        title = paste0("GO Molecular Function  ",groupsName))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

#### Same plot written to a PNG file. print() is explicit so the file is not
#### left blank when this code is run through source() or from inside a
#### function/loop, where ggplot objects are not printed automatically.
png(paste0("GO_MF_",groupsName,".png"), width = 1424, height = 824)
print(
  dotplot(cgoMF2,showCategory = 30,
          title = paste0("GO Molecular Function  ",groupsName))+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
)
dev.off()
null device 
          1 

R5 VAR14 vs RBC TNF k-means padj0.1fc2

1. Genelist Selection

#### Label used in every output file name and plot title of this section
groupsName <- "R5_Var14vRBC_TNF_kmeans_padj0.1fc2"
#### Tab-delimited results table; the first column supplies the row names and
#### column names are kept exactly as written in the file
countsTable <- read.delim(
  "RNAseq2019July_5.txt",
  header = TRUE,
  sep = "\t",
  row.names = 1,
  check.names = FALSE
)
head(countsTable)
#### All gene symbols in the table: the background (universe) for GO analysis
AllGeneNames <- countsTable[["Gene_Symbol"]]
#head(AllGeneNames)
#### Two previously built Venn diagrams side by side
grid.arrange(
  gTree(children = vennq2),
  gTree(children = vennpq),
  ncol = 2,
  top = "R5 Var14 TNF"
)

#tempA<-resAll[-c(10:30) ]
#### Flag every gene as "in" (selected) or "out".
#### A gene is selected when, at any of the four time points, its adjusted
#### p-value is below padjCutoff and its absolute log2 fold change is above
#### lfcCutoff (1 on the log2 scale = two-fold).
#### A missing adjusted p-value or fold change counts as "not significant at
#### that time point". Previously a missing fold change next to a significant
#### adjusted p-value turned Include into NA, which hid hits at later time
#### points and dropped the gene from both the "in" and the "out" list.
tempA <- countsTable
rownames(tempA) <- NULL
padjCutoff <- 0.1
lfcCutoff <- 1
timePoints <- c("0h", "2h", "6h", "20h")
padjCols <- paste0("padj_R5_TNF_var14_vs_RBC_", timePoints)
lfcCols <- paste0("log2FoldChange_R5_TNF_var14_vs_RBC_", timePoints)
#### stop early with a clear error if the table lacks an expected column
stopifnot(all(c(padjCols, lfcCols) %in% names(tempA)))
passesCutoffs <- function(padjCol, lfcCol) {
  padj <- tempA[[padjCol]]
  lfc <- tempA[[lfcCol]]
  !is.na(padj) & !is.na(lfc) & padj < padjCutoff & abs(lfc) > lfcCutoff
}
#### TRUE when the gene passes at one or more time points
selected <- Reduce(`|`, Map(passesCutoffs, padjCols, lfcCols))
tempA$Include <- ifelse(selected, "in", "out")
#tempA
####library(dplyr)
#### number of genes per Include value
tempA %>%
     group_by(Include) %>% 
     tally()
topDEgenes <- which(tempA$Include=="in")####find indexes 

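NB: Please check that the columns selected by position below, and the new column names derived from them, match your counts table before using the plots.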

#baseMeansHm <-countsTable[,c(60:63)]
#### Eight mean-expression columns, picked by position
baseMeansHm <- countsTable[, c(110:113, 129:132)]
head(baseMeansHm)
#### log2(x + 1) transform, then the Var14 minus RBC difference at each time
#### point is appended as columns 9 to 12
baseMeansHm2 <- log2(baseMeansHm + 1)
baseMeansHm2$Var14_RBC_0h <-
  baseMeansHm2[["Var14TNF_0h_mean"]] - baseMeansHm2[["RBC_TNF_0h_mean"]]
baseMeansHm2$Var14_RBC_2h <-
  baseMeansHm2[["Var14TNF_2h_mean"]] - baseMeansHm2[["RBC_TNF_2h_mean"]]
baseMeansHm2$Var14_RBC_6h <-
  baseMeansHm2[["Var14TNF_6h_mean"]] - baseMeansHm2[["RBC_TNF_6h_mean"]]
baseMeansHm2$Var14_RBC_20h <-
  baseMeansHm2[["Var14TNF_20h_mean"]] - baseMeansHm2[["RBC_TNF_20h_mean"]]
#### the four difference columns
baseMeansHm <- baseMeansHm2[, 9:12]
head(baseMeansHm)
#### the eight log2 mean columns
baseMeansHmM <- baseMeansHm2[, 1:8]
head(baseMeansHmM)
#### row indexes of the selected genes
topDEgenes <- which(tempA$Include == "in")

2. Hierarchical clustering of means

####mean logfc
#### Selected genes only; each gene (row) is centred and scaled across the
#### four Var14-vs-RBC difference columns
dataHMm <- t(scale(t(as.matrix(baseMeansHm[topDEgenes, ]))))
hmap_hier_factors4 <- Heatmap(
  dataHMm,
  name = "logfc",
  col = col_funGR,
  column_title = "Means logfc",
  column_title_gp = gpar(fontsize = 16, fontface = "bold"),
  row_labels = paste0(rownames(dataHMm), " ", tempA$Gene_Symbol[topDEgenes]),
  show_row_names = FALSE,
  cluster_columns = FALSE,
  width = unit(50, "mm")
)

#### Same values with the 0h column subtracted from every column of each
#### gene, so the 0h column becomes zero
dataHMmPlot <- sweep(dataHMm, 1, dataHMm[, "Var14_RBC_0h"])
hmap_hier_factors6 <- Heatmap(
  dataHMmPlot,
  name = "Normalised logfc",
  col = col_funGR2,
  column_title = "0h Normalised logfc",
  column_title_gp = gpar(fontsize = 16, fontface = "bold"),
  row_labels = paste0(rownames(dataHMm), " ", tempA$Gene_Symbol[topDEgenes]),
  show_row_names = FALSE,
  cluster_columns = FALSE,
  width = unit(50, "mm")
)

####means
#### Row-scaled log2 means of the selected genes; the column order interleaves
#### columns 5-8 with columns 1-4
dataHMm3 <- t(scale(t(as.matrix(baseMeansHmM[topDEgenes, ]))))
dataHMm3 <- dataHMm3[, c(5, 1, 6, 2, 7, 3, 8, 4)]
hmap_hier_factors5 <- Heatmap(
  dataHMm3,
  name = "Expression",
  col = col_fun,
  column_title = "Means",
  column_title_gp = gpar(fontsize = 16, fontface = "bold"),
  row_labels = paste0(rownames(dataHMm), " ", tempA$Gene_Symbol[topDEgenes]),
  show_row_names = FALSE,
  cluster_columns = FALSE,
  width = unit(100, "mm")
)

#### draw the three heatmaps next to each other
hmap_hier_factors4 + hmap_hier_factors6 + hmap_hier_factors5

#### NOTE(review): par(mfrow) applies to base graphics only; the two
#### fviz_nbclust() plots below are ggplot objects and are not arranged by it.
par(mfrow=c(1,2))
#### Silhouette method
fviz_nbclust(dataHMm, kmeans, method = "silhouette",k.max = 16)+
  labs(subtitle = "Silhouette method")

#### Elbow method
fviz_nbclust(dataHMm, kmeans, method = "wss",k.max = 16) +
  labs(subtitle = "Elbow method")

####gap stat slow!!!
####set.seed(123)
####fviz_nbclust(dataHMm, kmeans, nstart = 25,  method = "gap_stat", nboot = 100,k.max = 16)+
####  labs(subtitle = "Gap statistic method")
#### k-means with 6 clusters on the scaled data.
#### kmeans() picks random start centres, so the seed is fixed here: without
#### it the cluster numbering, and every table, plot and GO result derived
#### from it below, changes each time the notebook is run.
set.seed(123)
kclust8b <- kmeans(dataHMm, 6)
#silhouette plot
distK<-daisy(dataHMm)
plot(silhouette(kclust8b$cluster, distK), col=1:6, border=NA)

3. K-means clustering of means

#### One row-slice label per gene, built from its k-means cluster number
split <- paste0("Cluster\n", kclust8b$cluster)
#split <- factor(paste0("Cluster\n", kclust3$cluster), levels=c("Cluster\n3","Cluster\n1","Cluster\n4","Cluster\n5","Cluster\n2","Cluster\n6"))
#### Heatmap of the scaled data, rows split by cluster, slices left unclustered
hmap_k <- Heatmap(
  dataHMm,
  name = "logfc",
  col = col_funGR,
  split = split,
  cluster_row_slices = FALSE,
  cluster_columns = FALSE,
  show_row_names = FALSE,
  column_title = "means logfc",
  column_title_gp = gpar(fontsize = 16, fontface = "bold"),
  width = unit(50, "mm")
)
#### drawn next to the two heatmaps built in the hierarchical section
hmap_k + hmap_hier_factors6 + hmap_hier_factors5

Mean profiles of clusters

#### Cluster sizes and cluster centres from the k-means fit
clustercount <- data.frame(kclust8b$cluster)
clustersizes <- table(clustercount$kclust8b.cluster)
clusterMeans <- data.frame(kclust8b$centers)
#### Transposed centres: one column per cluster (X1..X6), one row per time
#### point, with the time point label in the Sample column
clusterMeans1 <- data.frame(t(clusterMeans))
clusterMeans1 <- cbind(Sample = rownames(clusterMeans1), clusterMeans1)
rownames(clusterMeans1) <- NULL
orderN <- c("Var14_RBC_0h", "Var14_RBC_2h", "Var14_RBC_6h", "Var14_RBC_20h") #### manual
####clusterMeans1
#### Layers shared by all six profile plots: line + points, x axis in orderN
#### order, no axis titles
profileLayers <- list(
  geom_line(),
  geom_point(),
  scale_x_discrete(limits = orderN),
  theme(axis.title.x = element_blank(), axis.title.y = element_blank())
)
pX1 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X1, group = 1)) +
  profileLayers +
  ggtitle(paste("Cluster X1 Profile ", clustersizes[1], " genes"))
pX2 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X2, group = 1)) +
  profileLayers +
  ggtitle(paste("Cluster X2 Profile ", clustersizes[2], " genes"))
pX3 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X3, group = 1)) +
  profileLayers +
  ggtitle(paste("Cluster X3 Profile ", clustersizes[3], " genes"))
pX4 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X4, group = 1)) +
  profileLayers +
  ggtitle(paste("Cluster X4 Profile ", clustersizes[4], " genes"))
pX5 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X5, group = 1)) +
  profileLayers +
  ggtitle(paste("Cluster X5 Profile ", clustersizes[5], " genes"))
pX6 <- ggplot(data = clusterMeans1, aes(x = Sample, y = X6, group = 1)) +
  profileLayers +
  ggtitle(paste("Cluster X6 Profile ", clustersizes[6], " genes"))
#### all six profiles on one page, two columns
multiplot(pX1, pX2, pX3, pX4, pX5, pX6, cols = 2)

#### Symbols of the selected genes (Include == "in")
topDEgenes <- which(tempA$Include == "in")
tempAkm <- tempA[topDEgenes, ]
SymbolsKm <- dplyr::pull(tempAkm, Gene_Symbol)

#### export the gene expression data for the clusters
write.table(
  clusterMeans,
  file = paste0("ClusterMeansKm_", groupsName, ".txt"),
  sep = "\t"
)
#### Cluster id, gene symbol and scaled values per gene; data.frame() names
#### the first column kclust8b.cluster after the expression passed in
ClusteredGenes <- data.frame(kclust8b$cluster, SymbolsKm, dataHMm)
write.table(
  ClusteredGenes,
  file = paste0("ScaledDataInClustersKm_", groupsName, ".txt"),
  sep = "\t"
)

#### Symbols of the genes that were not selected (Include == "out")
bottomDEgenes <- which(tempA$Include == "out")
bottomG <- dplyr::pull(tempA[bottomDEgenes, ], Gene_Symbol)
write.table(
  bottomG,
  file = paste0("ipaBottomKmeans_", groupsName, ".txt"),
  sep = "\t"
)

#### Membership table: cluster id and symbol, the row names copied into
#### name2, and one "1"/"0" indicator column per cluster
ipaKmeans <- ClusteredGenes[, 1:2]
ipaKmeans$name2 <- rownames(ipaKmeans)
ipaKmeans <- mutate(
  ipaKmeans,
  x1 = ifelse(kclust8b.cluster == 1, "1", "0"),
  x2 = ifelse(kclust8b.cluster == 2, "1", "0"),
  x3 = ifelse(kclust8b.cluster == 3, "1", "0"),
  x4 = ifelse(kclust8b.cluster == 4, "1", "0"),
  x5 = ifelse(kclust8b.cluster == 5, "1", "0"),
  x6 = ifelse(kclust8b.cluster == 6, "1", "0")
)
write.table(
  ipaKmeans,
  file = paste0("ipaKmeans_", groupsName, ".txt"),
  sep = "\t"
)
#### Cluster id column kept as a one-column data frame
ClusteredGenes2 <- ClusteredGenes[1]
#### Build one vector of gene symbols per k-means cluster; the named list is
#### the geneCluster input of compareCluster() further down.
#### Membership is read from the first column of ClusteredGenes (the cluster
#### id) only. The previous subset(..., ClusteredGenes==i) compared every
#### column of the data frame with i, so any match outside the cluster column
#### (e.g. a scaled value exactly equal to i) appended NA entries to the list.
listAll<-list()
for(i in 1:6) {
  inCluster <- !is.na(ClusteredGenes[[1]]) & ClusteredGenes[[1]] == i
  clusterName <- ClusteredGenes$SymbolsKm[inCluster]
  listAll[[i]]<-clusterName
}
#### name the vectors in the list: one name per cluster (6 clusters here)
names(listAll)<-c("X1", "X2", "X3", "X4","X5", "X6")
#if you want to rearrange the order
#listAll<-listAll[c("X3", "X1", "X4", "X5", "X2", "X6")]
#lapply(listAll, head)

4. Annotation of K-means clusters

  • CC cellular component
  • BP biological process
  • MF molecular function

The simplify function has been used to cut down on GO redundancy

#str(AllGeneNames)
#### GO "CC" enrichment for every cluster in listAll, with AllGeneNames as
#### the background (universe)
cgoCC <- compareCluster(
  geneCluster = listAll,
  fun = "enrichGO",
  universe = AllGeneNames,
  OrgDb = org.Hs.eg.db, #### human; swap for org.Mm.eg.db for mouse
  keyType = "SYMBOL",
  ont = "CC",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.10
)
#### reduce redundant GO terms
cgoCC2 <- simplify(cgoCC, cutoff = 0.7, by = "p.adjust", select_fun = min)
#### spreadsheet of the simplified result
write.csv(
  as.data.frame(cgoCC2),
  file = paste0("GO_CC_", groupsName, ".csv")
)
#### dot plot shown in the notebook
dotplot(
  cgoCC2,
  showCategory = 30,
  title = paste0("GO Cellular Compartment ", groupsName)
) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Plots and GO data were written to files

#### Write the GO CC dot plot to a PNG file.
#### print() is explicit so the file is not left blank when this code is run
#### through source() or from inside a function/loop, where ggplot objects
#### are not printed automatically.
png(paste0("GO_CC_",groupsName,".png"), width = 1224, height = 824)
print(
  dotplot(cgoCC2,showCategory = 30,
          title = paste0("GO Cellular Compartment ",groupsName))+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
)
dev.off()
null device 
          1 

GO BP

####BP
#### GO "BP" enrichment for every cluster in listAll, with AllGeneNames as
#### the background (universe)
cgoBP <- compareCluster(geneCluster = listAll, 
                      universe = AllGeneNames,
                      fun = "enrichGO",
                      OrgDb=org.Hs.eg.db,
                      keyType="SYMBOL",
                      ont = "BP", 
                      pvalueCutoff=0.05,
                      qvalueCutoff = 0.10)
#### reduce redundant GO terms
cgoBP2 <- simplify(cgoBP, cutoff=0.7, by="p.adjust", select_fun=min)
####write as spreadsheet
write.csv(as.data.frame(cgoBP2),paste0("GO_BP_",groupsName,".csv"))
dotplot(cgoBP2,showCategory = 30,
        title = paste0("GO Biological Process ",groupsName))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

#### Same plot written to a PNG file. print() is explicit so the file is not
#### left blank when this code is run through source() or from inside a
#### function/loop, where ggplot objects are not printed automatically.
png(paste0("GO_BP_",groupsName,".png"), width = 1024, height = 1224)
print(
  dotplot(cgoBP2,showCategory = 30,
          title = paste0("GO Biological Process ",groupsName))+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
)
dev.off()
null device 
          1 

GO MF

####MF
#### GO "MF" enrichment for every cluster in listAll, with AllGeneNames as
#### the background (universe)
cgoMF <- compareCluster(geneCluster = listAll, 
                      universe = AllGeneNames,
                      fun = "enrichGO",
                      OrgDb=org.Hs.eg.db, 
                      keyType="SYMBOL",
                      ont = "MF", 
                      pvalueCutoff=0.05,
                      qvalueCutoff = 0.10)
#### reduce redundant GO terms
cgoMF2 <- simplify(cgoMF, cutoff=0.7, by="p.adjust", select_fun=min)
####write as spreadsheet
write.csv(as.data.frame(cgoMF2),paste0("GO_MF_",groupsName,".csv"))
dotplot(cgoMF2,showCategory = 30,
        title = paste0("GO Molecular Function  ",groupsName))+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

#### Same plot written to a PNG file. print() is explicit so the file is not
#### left blank when this code is run through source() or from inside a
#### function/loop, where ggplot objects are not printed automatically.
png(paste0("GO_MF_",groupsName,".png"), width = 1424, height = 824)
print(
  dotplot(cgoMF2,showCategory = 30,
          title = paste0("GO Molecular Function  ",groupsName))+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
)
dev.off()
null device 
          1 

Save: once you are happy with the clustering, save the workspace so that it can be reloaded later.

#### Write every object in the current workspace to KmDecember.RData;
#### load("KmDecember.RData") restores them in a later session.
save.image(file="KmDecember.RData")

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

