rm(list=ls())

quad <- read.table("all_PQS_and_TSS_mappings.csv", sep=",",header=TRUE,stringsAsFactors=FALSE)
colnames(quad)[11:19] <- c("stacks","fa","fc","ft","ng","length2","l1","l2","l3")

dim <- 5

# split into plus and minus strands, keep only row entries with PQS present
# keep only one copy of each unique PQS
all <- quad[with(quad, is.na(delta) == FALSE & !duplicated(row_number)), ]

# analysis of all PQS with 4 stacks and 3 loops in which the proximal and terminal loop base for each loop != G
all1a <- all[with(all, is.na(l1) == FALSE),]
all2 <- all1a[,c(5,12,13,14,17,18,19)]
rownames(all2) <- all2$row_number
# Load the kohonen package 
require(kohonen)

# Create a training data set (rows are samples, columns are variables
# Here I am selecting a subset of my variables available in "data"
data_train <- all2

# Change the data frame with training data to a matrix
# Also center and scale all variables to give them equal importance during
# the SOM training process. 
data_train_matrix <- as.matrix(scale(data_train))
colnames(data_train_matrix) <- colnames(data_train)

# Create the SOM Grid - you generally have to specify the size of the 
# training grid prior to training the SOM. Hexagonal and Circular 
# topologies are possible
x <- dim
y <- dim
som_grid <- somgrid(xdim = x, ydim=y, topo="hexagonal")

# Finally, train the SOM, options for the number of iterations,
# the learning rates, and the neighbourhood are available
som_model <- som(data_train_matrix, grid=som_grid, rlen=100, alpha=c(0.05,0.01), keep.data = TRUE, n.hood="circular")


pdf("kohonen.pdf")
# bottom left top right
library("gplots")
library("gtools")
library("colorRamps")
#plot(som_model, type="changes",main="Changes", font=2, font.lab=2, font.axis=2)
par(mar=c(0,0,0,0))
plot(som_model, type="count", main="Count", font.main=2)
plot(som_model, type="dist.neighbours", main="Distance to Neighbors", font.main=2)
# unscaled property plots
for(i in 1:7){
  var <- i #define the variable to plot 
  var_unscaled <- aggregate(as.numeric(data_train[,var]), by=list(som_model$unit.classif), FUN=mean, simplify=TRUE)[,2] 
  plot(som_model, type = "property", property=var_unscaled, main=names(data_train)[var], font.main=2)
}

plot(som_model, type="codes", main="Codes")
mydata <- som_model$codes 
wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var)) 
for (i in 2:15) {
  wss[i] <- sum(kmeans(mydata, centers=i)$withinss)
}
#plot(wss)
## use hierarchical clustering to cluster the codebook vectors
k=6
som_cluster <- cutree(hclust(dist(som_model$codes)), k=k, h= NULL)
# plot these results:
col3 <- colorRampPalette(c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"))(k)
plot(som_model, type="mapping", bgcol = col3[som_cluster], main = "Clusters") 
add.cluster.boundaries(som_model, som_cluster)

all3 <- cbind(all1a, as.data.frame(som_model$unit.classif))
colnames(all3)[51] <- "class"
write.table(all3, "classification.csv",sep=",",row.names=FALSE, col.names=colnames(all3))
to <- x*y+0.5
bins <- seq(from=0.5, to=to, by=1)
h_all <- hist(all3$class, breaks=bins, plot=FALSE)

# import dataset for Bloom Syndrome Patients
bdat <- read.table("differentially-expressed_genes_BS.csv", sep=",", header=TRUE)
# split gene sets into up- and down-regulated
blm.up1 <- as.data.frame(bdat[with(bdat, logFC > 0), ]$gene)
blm.down1 <- as.data.frame(bdat[with(bdat, logFC < 0), ]$gene)
colnames(blm.up1) <- colnames(blm.down1) <- "gene_name"
# merge blm.up1 and blm.down1 datasets with quad --> full TSS and PQS lists
blm.up <- merge(all3, blm.up1, by="gene_name") # isolate entries of all3 from differentially-expressed genes
h_blm.up <- hist(blm.up$class, breaks=bins, plot=FALSE)
blm.down <- merge(all3, blm.down1, by="gene_name")
h_blm.down <- hist(blm.down$class, breaks=bins, plot=FALSE)

# calculate numbers of genes in respective datasets
genomic.tss <- length(unique(quad$unique.gene.number))
# calculate number of genes in blm up and down datasets (not all genes from blm.up1 and blm.down1 lerge with quad dataset).
# Calcualte the number of genes that actually merge into dataset
blm.up.tss1 <- merge(quad, blm.up1, by="gene_name")
blm.up.tss <- length(unique(blm.up.tss1$gene_name))
blm.down.tss1 <- merge(quad, blm.down1, by="gene_name")
blm.down.tss <- length(unique(blm.down.tss1$gene_name))

## botstrap simulation to identify threshold for enrichment/depletion significance for each cluster
# make random gene sets
gene.symbols <- unique(quad$gene_name)
out <- list()
for(i in 1:100){
  rand.up1 <- data.frame(sample(gene.symbols, blm.up.tss))
  rand.down1 <- data.frame(sample(gene.symbols, blm.down.tss))
  colnames(rand.up1) <- colnames(rand.down1) <- "gene_name"
 
  rand.up <- merge(all3, rand.up1, by="gene_name")
  h_rand.up <- hist(rand.up$class, breaks=bins, plot=FALSE)
  rand.down <- merge(all3, rand.down1, by="gene_name")
  h_rand.down <- hist(rand.down$class, breaks=bins, plot=FALSE)
  
  rand.up.tss <- nrow(rand.up1)
  rand.down.tss <- nrow(rand.down1)
  
  en_rand.up <- log2((h_rand.up$counts/rand.up.tss)/(h_all$counts/genomic.tss))
  en_rand.down <- log2((h_rand.down$counts/rand.down.tss)/(h_all$counts/genomic.tss))
  sav <- list()
  sav[[1]] <- en_rand.up
  sav[[2]] <- en_rand.down
  out[[i]] <- sav
  print(i)
}
# calculate per node distributions to determing threshold levels
up.mean <- NULL
up.sd <- NULL
down.mean <- NULL
down.sd <- NULL
for(i in 1:length(out[[1]][1][[1]])){
  up <- NULL
  down <- NULL
  for(j in 1:100){
    up <- c(up, out[[j]][1][[1]][i])
    up <- up[up != "-Inf"]
    down <- c(down, out[[j]][2][[1]][i])
    down <- down[down != "-Inf"]
    print(j)
  }
  up.mean <- c(up.mean, mean(up))
  up.sd <- c(up.sd,sd(up))
  down.mean <- c(down.mean, mean(down))
  down.sd <- c(down.sd, sd(down))
  print("i==.....................")
  print(i)
  print("......................")
}
up.u.limit <- up.mean+2*up.sd
up.l.limit <- up.mean-2*up.sd
down.u.limit <- down.mean+2*down.sd
down.l.limit <- down.mean-2*down.sd

############################
en_blm.up <- log2((h_blm.up$counts/blm.up.tss)/(h_all$counts/genomic.tss))
# raplace values with 0 if the probability of the observation is not outside of the 95% CI
en_blm.up2 <- NULL
for(i in 1:length(en_blm.up)){
  v <- en_blm.up[i]
  if(v > up.l.limit[i] && v < up.u.limit[i]){
    v <- 0
  }
  en_blm.up2 <- c(en_blm.up2,v)
  print(i)
}

# en_blm.up <- replace(en_blm.up, en_blm.up == "-Inf",-3)
# en_blm.up <- replace(en_blm.up, en_blm.up < -3,-3)

en_blm.down <- log2((h_blm.down$counts/blm.down.tss)/(h_all$counts/genomic.tss))
en_blm.down2 <- NULL
for(i in 1:length(en_blm.down)){
  v <- en_blm.down[i]
  if(v > down.l.limit[i] && v < down.u.limit[i]){
    v <- 0
  }
  en_blm.down2 <- c(en_blm.down2,v)
  print(i)
}

# en_blm.down <- replace(en_blm.down, en_blm.down == "-Inf",-3)
# en_blm.down <- replace(en_blm.down, en_blm.down < -3,-3)

# set scale 
col3 <- colorRampPalette(c("red","gray","green"))(50)
plot(som_model, type="mapping", bgcol = col3[round((round(en_blm.up2,2)+2.5)*10)], main = "BS: Up-Regulated") 
plot(som_model, type="mapping", bgcol = col3[round((round(en_blm.down2,2)+2.5)*10)], main = "BS: Down-Regulated") 

################ 
## Werner Syndrome


# import dataset for Werner Syndrome Patients
bdat <- read.table("differentially-expressed_genes_WS.csv", sep=",", header=TRUE)
# split gene sets into up- and down-regulated
blm.up1 <- as.data.frame(bdat[with(bdat, logFC > 0), ]$gene)
blm.down1 <- as.data.frame(bdat[with(bdat, logFC < 0), ]$gene)
colnames(blm.up1) <- colnames(blm.down1) <- "gene_name"
# merge blm.up1 and blm.down1 datasets with quad --> full TSS and PQS lists
blm.up <- merge(all3, blm.up1, by="gene_name") # isolate entries of all3 from differentially-expressed genes
h_blm.up <- hist(blm.up$class, breaks=bins, plot=FALSE)
blm.down <- merge(all3, blm.down1, by="gene_name")
h_blm.down <- hist(blm.down$class, breaks=bins, plot=FALSE)

# calculate numbers of genes in respective datasets
genomic.tss <- length(unique(quad$unique.gene.number))
# calculate number of genes in blm up and down datasets (not all genes from blm.up1 and blm.down1 lerge with quad dataset).
# Calcualte the number of genes that actually merge into dataset
blm.up.tss1 <- merge(quad, blm.up1, by="gene_name")
blm.up.tss <- length(unique(blm.up.tss1$gene_name))
blm.down.tss1 <- merge(quad, blm.down1, by="gene_name")
blm.down.tss <- length(unique(blm.down.tss1$gene_name))

## botstrap simulation to identify threshold for enrichment/depletion significance for each cluster
# make random gene sets
gene.symbols <- unique(quad$gene_name)
out <- list()
for(i in 1:100){
  rand.up1 <- data.frame(sample(gene.symbols, blm.up.tss))
  rand.down1 <- data.frame(sample(gene.symbols, blm.down.tss))
  colnames(rand.up1) <- colnames(rand.down1) <- "gene_name"
  
  rand.up <- merge(all3, rand.up1, by="gene_name")
  h_rand.up <- hist(rand.up$class, breaks=bins, plot=FALSE)
  rand.down <- merge(all3, rand.down1, by="gene_name")
  h_rand.down <- hist(rand.down$class, breaks=bins, plot=FALSE)
  
  rand.up.tss <- nrow(rand.up1)
  rand.down.tss <- nrow(rand.down1)
  
  en_rand.up <- log2((h_rand.up$counts/rand.up.tss)/(h_all$counts/genomic.tss))
  en_rand.down <- log2((h_rand.down$counts/rand.down.tss)/(h_all$counts/genomic.tss))
  sav <- list()
  sav[[1]] <- en_rand.up
  sav[[2]] <- en_rand.down
  out[[i]] <- sav
  print(i)
}
# calculate per node distributions to determing threshold levels
up.mean <- NULL
up.sd <- NULL
down.mean <- NULL
down.sd <- NULL
for(i in 1:length(out[[1]][1][[1]])){
  up <- NULL
  down <- NULL
  for(j in 1:100){
    up <- c(up, out[[j]][1][[1]][i])
    up <- up[up != "-Inf"]
    down <- c(down, out[[j]][2][[1]][i])
    down <- down[down != "-Inf"]
    print(j)
  }
  up.mean <- c(up.mean, mean(up))
  up.sd <- c(up.sd,sd(up))
  down.mean <- c(down.mean, mean(down))
  down.sd <- c(down.sd, sd(down))
  print("i==.....................")
  print(i)
  print("......................")
}
up.u.limit <- up.mean+2*up.sd
up.l.limit <- up.mean-2*up.sd
down.u.limit <- down.mean+2*down.sd
down.l.limit <- down.mean-2*down.sd

############################
en_blm.up <- log2((h_blm.up$counts/blm.up.tss)/(h_all$counts/genomic.tss))
# raplace values with 0 if the probability of the observation is not outside of the 95% CI
en_blm.up2 <- NULL
for(i in 1:length(en_blm.up)){
  v <- en_blm.up[i]
  if(v > up.l.limit[i] && v < up.u.limit[i]){
    v <- 0
  }
  en_blm.up2 <- c(en_blm.up2,v)
  print(i)
}

# en_blm.up <- replace(en_blm.up, en_blm.up == "-Inf",-3)
# en_blm.up <- replace(en_blm.up, en_blm.up < -3,-3)

en_blm.down <- log2((h_blm.down$counts/blm.down.tss)/(h_all$counts/genomic.tss))
en_blm.down2 <- NULL
for(i in 1:length(en_blm.down)){
  v <- en_blm.down[i]
  if(v > down.l.limit[i] && v < down.u.limit[i]){
    v <- 0
  }
  en_blm.down2 <- c(en_blm.down2,v)
  print(i)
}

# en_blm.down <- replace(en_blm.down, en_blm.down == "-Inf",-3)
# en_blm.down <- replace(en_blm.down, en_blm.down < -3,-3)

# set scale 
col3 <- colorRampPalette(c("red","gray","green"))(50)
plot(som_model, type="mapping", bgcol = col3[round((round(en_blm.up2,2)+2.5)*10)], main = "WS: Up-Regulated") 
plot(som_model, type="mapping", bgcol = col3[round((round(en_blm.down2,2)+2.5)*10)], main = "WS: Down-Regulated") 

# plot color scale
col3 <- colorRampPalette(c("red","gray","green"))(300)
x <- 1:300
y <- rep(1,300)
plot(x,y,col=col3[x],pch=16)

dev.off()