library(SingleCellExperiment)
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unsplit, which,
##     which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
sce <- readRDS("real/zheng_2017_monocytes/data/01_sce_all_genes_all_cells.rds")

# raw counts
Y_raw <- as.matrix(assay(sce, "counts"))
libsize <- colSums(Y_raw)

# Count per million (cpm)
Y_cpm <- 1e6 * t(t(Y_raw) / libsize)

# Count per 10k (cp10k)
Y_cp10k <- 1e4 * t(t(Y_raw) / libsize)

# Count per median (cpmed)
Y_cpmed <- median(libsize) * t(t(Y_raw) / libsize)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
library(viridis)
## Loading required package: viridisLite
theme_set(theme_bw())
#g <- match(20, apply(Y_raw, 1, max))
#tibble(umi = Y_raw[g,]) %>% ggplot(aes(x = umi)) + geom_bar()
Y_raw_l2 <- log2(Y_raw+1)
Y_cpm_l2 <- log2(Y_cpm+1)
Y_cp10k_l2 <- log2(Y_cp10k+1)
Y_cpmed_l2 <- log2(Y_cpmed+1)
plot_histgrams <- function (g) {
  tibble(
    raw = Y_raw_l2[g,],
    cpm = Y_cpm_l2[g,],
    cp10k = Y_cp10k_l2[g,],
    cpmed = Y_cpmed_l2[g,],
  ) %>%
  gather(key = "norm", val = "value") %>%
  mutate(norm = factor(norm, levels = c("raw", "cpm", "cp10k", "cpmed"))) %>%
  ggplot(aes(x = value)) +
    geom_histogram(bins = 80) +
    facet_wrap(~norm) +
    ggtitle(rownames(Y_raw)[g])
}
gg <- match(c(10, 15, 20, 25, 30), apply(Y_raw, 1, max))
plot_histgrams(gg[1])

plot_histgrams(gg[2])

plot_histgrams(gg[3])

plot_histgrams(gg[4])

plot_histgrams(gg[5])

pc_raw <- prcomp(t(Y_raw_l2), rank = 10)
pc_cpm <- prcomp(t(Y_cpm_l2), rank = 10)
pc_cp10k <- prcomp(t(Y_cp10k_l2), rank = 10)
pc_cpmed <- prcomp(t(Y_cpmed_l2), rank = 10)
zero_fraction <- colMeans(Y_raw == 0)
log_total_umi <- log10(colSums(Y_raw))
tibble(
  raw = pc_raw$x[,1],
  cpm = pc_cpm$x[,1],
  cp10k = pc_cp10k$x[,1],
  cpmed = pc_cpmed$x[,1]
) %>%
  gather(key = "norm", value = "pc1") %>%
  mutate(
    norm = factor(norm, levels = c("raw", "cpm", "cp10k", "cpmed")),
    zero_fraction = rep(zero_fraction, 4),
    log_total_umi = rep(log_total_umi, 4)) %>%
  ggplot(aes(x = zero_fraction, y = pc1, color = log_total_umi)) +
    geom_point(size = 0.2) +
    scale_color_viridis() +
    facet_wrap(~norm, scales = "free_y")