suppressWarnings({
  suppressPackageStartupMessages({
    library(data.table)
    library(ggthemes)
    library(openxlsx)
    library(tidyverse)
    library(patchwork)
    library(pheatmap)
    library(lme4)
    library(ggrepel)
    library(harmony)    
  })
})

NATO <- c('alpha', 'bravo', 'charlie', 'delta', 'echo')#, 'foxtrot', 'golf', 'hotel', 'india', 'juliett')
palette_global <- c(tableau_color_pal('Tableau 10')(3), tableau_color_pal('Tableau 10')(10))
names(palette_global) <- Reduce(c, list(c('LOW', 'MEDIUM', 'HIGH'), LETTERS[1:5], NATO))
# palette_global <- c(palette_global)
# names(palette_global) <- names(palette_global)

cosine_normalize <- function(X, margin) {
    if (margin == 1) {
        res <- sweep(as.matrix(X), 1, sqrt(rowSums(X ^ 2)), '/')
        row.names(res) <- row.names(X)
        colnames(res) <- colnames(X)        
    } else {
        res <- sweep(as.matrix(X), 2, sqrt(colSums(X ^ 2)), '/')
        row.names(res) <- row.names(X)
        colnames(res) <- colnames(X)
    }
    return(res)
}

do_scatter <- function(umap_use, meta_data, label_name, no_guides = TRUE, do_labels = TRUE, nice_names, palette_use = palette_global,
                       pt_size = 4, point_size = .5, base_size = 14, do_points = TRUE, do_density = FALSE, h = 6, w = 8) {
    plt_df <- umap_use %>% data.frame() %>% 
        cbind(meta_data) %>% 
        dplyr::sample_frac(1L) 
    plt_df$given_name <- plt_df[[label_name]]
    
    if (!missing(nice_names)) {
        plt_df %<>%
            dplyr::inner_join(nice_names, by = "given_name") %>% 
            subset(nice_name != "" & !is.na(nice_name))

        plt_df[[label_name]] <- plt_df$nice_name        
    }
        
    plt <- plt_df %>% 
        ggplot(aes_string("X1", "X2", col = label_name, fill = label_name)) + 
        theme_tufte(base_size = base_size) + 
        theme(panel.background = element_rect(fill = NA, color = "black")) + 
        guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4)), alpha = FALSE) +
        scale_color_manual(values = palette_use) + 
        scale_fill_manual(values = palette_use) +    
        theme(plot.title = element_text(hjust = .5)) + 
        labs(x = "UMAP 1", y = "UMAP 2") 
    
    if (do_points) 
#         plt <- plt + geom_point_rast(dpi = 300, width = w, height = h, size = point_size) 
        plt <- plt + geom_point(shape = '.')
    if (do_density) 
        plt <- plt + geom_density_2d()    
        

    if (no_guides)
        plt <- plt + guides(col = FALSE, fill = FALSE, alpha = FALSE)
    
    if (do_labels) 
        plt <- plt + geom_label_repel(data = data.table(plt_df)[, .(X1 = mean(X1), X2 = mean(X2)), by = label_name], label.size = NA,
                                      aes_string(label = label_name), color = "white", size = pt_size, alpha = 1, segment.size = 0) + 
        guides(col = FALSE, fill = FALSE)
    return(plt)
}

plot_cluster_membership <- function(harmonyObj, nrow=2, plot_corr=TRUE) {
    if (plot_corr) {
        data <- t(harmonyObj$Z_corr) %>% data.frame()
    } else {
        data <- t(harmonyObj$Z_orig) %>% data.frame()        
    }
    data %>%
        tibble::rowid_to_column('id') %>% 
        dplyr::inner_join(
            harmonyObj$R %>% t() %>% data.table() %>% 
                tibble::rowid_to_column('id') %>%
                tidyr::gather(cluster, r, -id) %>% 
                dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)), 
            by = 'id'
        ) %>% 
        dplyr::sample_frac(1L) %>% 
        ggplot(aes(X1, X2, color = r)) + 
            geom_point(shape = '.') + 
            theme_tufte(base_size = 12) + theme(panel.background = element_rect()) + 
    #         facet_grid(cluster ~ .) + 
            facet_wrap(~cluster, nrow = nrow) + 
            scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) + 
            labs(x = 'Harmony 1', y = 'Harmony 2', title = 'Probabilistic cluster assignments')
    
}

1 Clusters

simulate_clusters <- function(xmus, xsds, ymus, ysds, ncells=500, ngenes=100, prop1, prop2, 
                   seed=0, batch_sd=1, cell_batch=FALSE, add_before=TRUE) {
    set.seed(seed)
    proj <- matrix(rnorm(ngenes*ncells), nrow=ngenes, ncol=2)
    K <- length(xmus)

    
    ## initialize batch 1
    comp1 <- sample(1:K, prob=prop1, size=ncells, replace=TRUE)
    samples1 <- cbind(rnorm(n=ncells, mean=xmus[comp1],sd=xsds[comp1]),
                      rnorm(n=ncells, mean=ymus[comp1],sd=ysds[comp1]))    
    
    ## initialize batch 2
    comp2 <- sample(1:K, prob=prop2, size=ncells, replace=TRUE) 
    samples2 <- cbind(rnorm(n=ncells, mean=xmus[comp2],sd=xsds[comp2]),
                      rnorm(n=ncells, mean=ymus[comp2],sd=ysds[comp2]))
    
    if (add_before) {
#         message('adding batch effect to low dimensions')
        
        ## add gene-specific batch noise
        if (cell_batch) {
            batch_mat1 <- matrix(rnorm(K * 2, sd = batch_sd), K, 2)[comp1, ]
            batch_mat2 <- matrix(rnorm(K * 2, sd = batch_sd), K, 2)[comp2, ]
        } else {
            batch_mat1 <- matrix(rep(rnorm(2, sd = batch_sd), each=ncells), ncol=2)
            batch_mat2 <- matrix(rep(rnorm(2, sd = batch_sd), each=ncells), ncol=2)
        }

        A1 <- (samples1 + batch_mat1) %*% t(proj)
        A1 <- A1 + rnorm(ngenes*ncells)
        A2 <- (samples2 + batch_mat2) %*% t(proj)    
        A2 <- A2 + rnorm(ngenes*ncells)    
        
    } else {

#         message('adding batch effect to genes')
        A1 <- samples1 %*% t(proj)    
        A1 <- A1 + rnorm(ngenes*ncells)
        A2 <- samples2 %*% t(proj)
        A2 <- A2 + rnorm(ngenes*ncells)    

        ## add gene-specific batch noise
        if (cell_batch) {
            batch_mat1 <- matrix(rnorm(K * ngenes, sd = batch_sd), K, ngenes)[comp1, ]
            batch_mat2 <- matrix(rnorm(K * ngenes, sd = batch_sd), K, ngenes)[comp2, ]
        } else {
            batch_mat1 <- matrix(rep(rnorm(ngenes, sd = batch_sd), each=ncells), ncol=ngenes)
            batch_mat2 <- matrix(rep(rnorm(ngenes, sd = batch_sd), each=ncells), ncol=ngenes)
        }

        A1 <- A1 + batch_mat1    
        A2 <- A2 + batch_mat2

    }

    pca_res <- rbind(A1, A2) %>% t() %>% svd()
    V <- pca_res$v %*% diag(pca_res$d)
    
    return(list(X = rbind(samples1, samples2), 
                V = V, 
                meta_data = data.table(type = c(LETTERS[comp1], LETTERS[comp2]),
                                       batch = rep(c('alpha', 'bravo'), c(ncells, ncells)))))
}

1.1 Simple case

First, a negative control

simdata <- simulate_clusters(ncells = 500, ngenes = 20,
    xmus = c(-5, 5), xsds = c(1, 1),
    ymus = c(-5, 5), ysds = c(1, 1),
    prop1 = c(.5, .5), prop2 = c(.5, .5), 
    batch_sd=0,
    seed=1
)

do_scatter(simdata$V, simdata$meta_data, 'batch') + 
    labs(x = 'PC 1', y = 'PC 2', subtitle = 'Colored by batch', title = 'Before Harmony') + 
do_scatter(simdata$V, simdata$meta_data, 'type') + 
    labs(x = 'PC 1', y = 'PC 2', subtitle = 'Colored by type') + 
NULL

harmonyRes <- HarmonyMatrix(simdata$V, simdata$meta_data, 'batch', theta = 1, 
                            do_pca = FALSE, verbose = FALSE, 
                            lambda = 1, nclust = 20, max.iter.cluster = 10)

do_scatter(harmonyRes, simdata$meta_data, 'batch') + 
    labs(x = 'Harmony 1', y = 'Harmony 2', subtitle = 'Colored by batch', title = 'After Harmony') + 
do_scatter(harmonyRes, simdata$meta_data, 'type') + 
    labs(x = 'Harmony 1', y = 'Harmony 2', subtitle = 'Colored by type') + 
NULL

Now, let’s add batch effect:

simdata <- simulate_clusters(ncells = 500, ngenes = 20,
    xmus = c(-5, 5), xsds = c(1, 1),
    ymus = c(-5, 5), ysds = c(1, 1),
    prop1 = c(.5, .5), prop2 = c(.5, .5), 
    batch_sd=3,
    seed=1
)

do_scatter(simdata$V, simdata$meta_data, 'batch') + 
    labs(x = 'PC 1', y = 'PC 2', subtitle = 'Colored by batch', title = 'Before Harmony') + 
do_scatter(simdata$V, simdata$meta_data, 'type') + 
    labs(x = 'PC 1', y = 'PC 2', subtitle = 'Colored by type') + 
NULL

harmonyRes <- HarmonyMatrix(simdata$V, simdata$meta_data, 'batch',
                            do_pca = FALSE, verbose = FALSE, 
                            theta = 1, lambda = 1, nclust = 20, max.iter.cluster = 10)

do_scatter(harmonyRes, simdata$meta_data, 'batch') + 
    labs(x = 'Harmony 1', y = 'Harmony 2', subtitle = 'Colored by batch', title = 'After Harmony') + 
do_scatter(harmonyRes, simdata$meta_data, 'type') + 
    labs(x = 'Harmony 1', y = 'Harmony 2', subtitle = 'Colored by type') + 
NULL

1.2 Cluster redundancy

What happens if we set K, the number of kmeans clusters, too high?

simdata <- simulate_clusters(ncells = 500, ngenes = 20,
    xmus = c(-5, 5), xsds = c(1, 1),
    ymus = c(-5, 5), ysds = c(1, 1),
    prop1 = c(.5, .5), prop2 = c(.5, .5), 
    batch_sd=0,
    seed=1
)

harmonyObj <- HarmonyMatrix(simdata$V, simdata$meta_data, 'batch', theta = 1, lambda = 1, nclust = 20, 
                            do_pca = FALSE, verbose = FALSE, 
                            max.iter.cluster = 10, return_object = TRUE)

Below we can visualize the probabilistic cluster assignment of each cell to each of the 20 clusters.

plot_cluster_membership(harmonyObj, nrow=2)

It seems many of the clusters are redundant. Let’s take a look at how cluster assignment is correlated between the 20 clusters. Indeed, there only seem to be 2 non-redundant clusters.

pheatmap(cor(t(harmonyObj$R)), main = 'Correlation of cluster membership')

1.3 Sensitivity to density

It is important for Harmony to be able to integrate datasets in which the proportions of cell types are unequal. In this analysis, we consider a wide range of proportion difference, from even (50:50) to exclusive (100:0).

plt_list <- lapply(list(c(.5, .5), c(.6, .4), c(.7, .3), c(.8, .2), c(.9, .1), c(1, 0)), function(prop_use) {
    simdata <- simulate_clusters(ncells = 500, ngenes = 20,
        xmus = c(-5, 5), xsds = c(1, 1),
        ymus = c(-5, 5), ysds = c(1, 1),
        prop1 = prop_use, prop2 = rev(prop_use), 
        batch_sd=0,
        seed=1, add_before=FALSE
    )
    
    harmonyRes <- HarmonyMatrix(simdata$V, simdata$meta_data, 'batch', theta = 1, do_pca = FALSE, 
                                lambda = 1, nclust = 20, 
                                max.iter.cluster = 10, verbose = FALSE)
        
    rbind(
        data.frame(harmonyRes[, 1:2]) %>% dplyr::mutate(embed = 'After Harmony') %>% cbind(simdata$meta_data) %>% 
            tidyr::gather(key, val, type, batch),
        data.frame(simdata$V[, 1:2]) %>% dplyr::mutate(embed = 'Before Harmony') %>% cbind(simdata$meta_data) %>% 
            tidyr::gather(key, val, type, batch)
    ) %>% 
        dplyr::mutate(embed = factor(embed, c('Before Harmony', 'After Harmony'))) %>% 
        dplyr::sample_frac(1L, FALSE) %>% 
        ggplot(aes(X1, X2)) + geom_point(shape = '.', aes(color = val)) + 
            facet_wrap(~embed + key, nrow = 1) + 
            theme_tufte(base_size = 12) + geom_rangeframe(color = 'black') + 
            theme(axis.title = element_blank(), plot.title = element_text(size = 10)) + 
            scale_color_manual(values = palette_global) + 
            labs(title = sprintf('Cell type proportions: %d%% and %d%%', round(100 * prop_use[1]), round(100 * prop_use[2]))) + 
            guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4)), alpha = FALSE) +
            NULL    
})

Reduce(`+`, plt_list) + 
plot_layout(ncol = 1)

Now let’s add batch effect.

plt_list <- lapply(list(c(.5, .5), c(.6, .4), c(.7, .3), c(.8, .2), c(.9, .1), c(1, 0)), function(prop_use) {
    simdata <- simulate_clusters(ncells = 500, ngenes = 20,
        xmus = c(-5, 5), xsds = c(1, 1),
        ymus = c(-5, 5), ysds = c(1, 1),
        prop1 = prop_use, prop2 = rev(prop_use), 
        batch_sd=3,
        seed=1, add_before=FALSE
    )
    
    harmonyRes <- HarmonyMatrix(simdata$V, simdata$meta_data, 'batch', 
                                do_pca = FALSE, theta = 1, lambda = 1, nclust = 20, 
                                max.iter.cluster = 10, verbose = FALSE)
        
    rbind(
        data.frame(harmonyRes[, 1:2]) %>% dplyr::mutate(embed = 'After Harmony') %>% cbind(simdata$meta_data) %>% 
            tidyr::gather(key, val, type, batch),
        data.frame(simdata$V[, 1:2]) %>% dplyr::mutate(embed = 'Before Harmony') %>% cbind(simdata$meta_data) %>% 
            tidyr::gather(key, val, type, batch)
    ) %>% 
        dplyr::mutate(embed = factor(embed, c('Before Harmony', 'After Harmony'))) %>% 
        dplyr::sample_frac(1L, FALSE) %>% 
        ggplot(aes(X1, X2)) + geom_point(shape = '.', aes(color = val)) + 
            facet_wrap(~embed + key, nrow = 1) + 
            theme_tufte(base_size = 12) + geom_rangeframe(color = 'black') + 
            theme(axis.title = element_blank(), plot.title = element_text(size = 10)) + 
            scale_color_manual(values = palette_global) + 
            labs(title = sprintf('Cell type proportions: %d%% and %d%%', round(100 * prop_use[1]), round(100 * prop_use[2]))) + 
            guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4)), alpha = FALSE) +
            NULL    
})

Reduce(`+`, plt_list) + 
plot_layout(ncol = 1)

1.4 Local batch effect

Now let’s consider what happens when batch effect is different for each cell type. We do this by setting the cell_batch parameter to TRUE in the simulation function.

simdata <- simulate_clusters(ncells = 500, ngenes = 20,
    xmus = c(-5, 5), xsds = c(1, 1),
    ymus = c(-5, 5), ysds = c(1, 1),
    prop1 = c(.5, .5), prop2 = c(.5, .5), 
    batch_sd=3, cell_batch = TRUE,
    seed=1
)

Now, the batch effect for cell types A (red) and B (teal) are in completely different directions. A is separate along PC1 and PC2, while B is only separate along PC1.

do_scatter(simdata$V, simdata$meta_data, 'batch') + 
    labs(x = 'PC 1', y = 'PC 2', subtitle = 'Colored by batch', title = 'Before Harmony') + 
do_scatter(simdata$V, simdata$meta_data, 'type') + 
    labs(x = 'PC 1', y = 'PC 2', subtitle = 'Colored by type') + 
NULL

Harmony learns cell-type specific correction factors to integrate the data correctly.

harmonyRes <- HarmonyMatrix(simdata$V, simdata$meta_data, 'batch', theta = 2, do_pca = FALSE, 
                            verbose = FALSE, 
                            lambda = 1, nclust = 20, max.iter.cluster = 10)

do_scatter(harmonyRes, simdata$meta_data, 'batch') + 
    labs(x = 'Harmony 1', y = 'Harmony 2', subtitle = 'Colored by batch', title = 'After Harmony') + 
do_scatter(harmonyRes, simdata$meta_data, 'type') + 
    labs(x = 'Harmony 1', y = 'Harmony 2', subtitle = 'Colored by type') + 
NULL

1.5 Ambiguous: Confounded batch and biology in similar cell types

Here we consider a scenario in which batch-specific and cell-type-specific signals are confounded. To illustrate, consider the following example. We have 3 populations that represent LOW, MEDIUM, and HIGH cell types. Batch alpha has the LOW and MEDIUM populations. Batch bravo has only MEDIUM and HIGH populations. This situation raises a question of interpretation: what should be considered a good integration?

This situations can be divided into two cases, which Harmony treats differently:

1) when batch effect is systematic, Harmony preserves relative ordering
2) when batch effect is cell-type specific, Harmony preserves absolute ordering

1.5.1 Case I: systematic shifts

First, we consider systematic shifts. Here, the same batch effect is applied to all cell types within a batch.

simdata <- simulate_clusters(ncells = 500, ngenes = 20, 
    xmus = c(0, 5, 10), xsds = c(1, 1, 1),
    ymus = c(10, 5, 0), ysds = c(1, 1, 1),
    prop1 = c(.5, .5, 0), prop2 = c(0, .5, .5),
    batch_sd=5, add_before = FALSE, 
    seed=3,
    cell_batch = FALSE
)

nice_names <- data.table(given_name = LETTERS[1:3], nice_name = c('LOW', 'MEDIUM', 'HIGH'))

do_scatter(simdata$V, simdata$meta_data, 'batch') + 
    labs(x = 'PC 1', y = 'PC 2', subtitle = 'Colored by batch', title = 'Before Harmony') + 
do_scatter(simdata$V, simdata$meta_data, 'type', nice_names = nice_names) + 
    labs(x = 'PC 1', y = 'PC 2', subtitle = 'Colored by type') + 
NULL

In this case, Harmony maintains the relative order of cells within a batch. That is, Harmony merges alpha:LOW with bravo:MEDIUM and alpha:MEDIUM with bravo:HIGH.

harmonyRes <- HarmonyMatrix(simdata$V, simdata$meta_data, 'batch',
                            theta = 1, lambda = 1, nclust = 20, do_pca = FALSE, 
                            max.iter.cluster = 20, max.iter.harmony = 20,
                            verbose = FALSE)

do_scatter(harmonyRes, simdata$meta_data, 'batch') + 
    labs(x = 'Harmony 1', y = 'Harmony 2', subtitle = 'Colored by batch') + 
do_scatter(harmonyRes, simdata$meta_data, 'type', nice_names = nice_names, do_labels = FALSE) + 
    labs(x = 'Harmony 1', y = 'Harmony 2', subtitle = 'Colored by type') + 
    facet_wrap(~batch) + 
    plot_layout(widths = c(1, 1.5)) + 
plot_annotation(title = 'Relative ordering maintained with systematic shifts') + 
NULL

1.5.2 Case II: cell-type specific shifts

In this case, we apply a different batch effect to every cell type within each batch.

simdata <- simulate_clusters(ncells = 500, ngenes = 20, 
    xmus = c(0, 5, 10), xsds = c(1, 1, 1),
    ymus = c(10, 5, 0), ysds = c(1, 1, 1),
    prop1 = c(.5, .5, 0), prop2 = c(0, .5, .5),
    batch_sd=5, add_before = FALSE, 
    seed=3,
    cell_batch = TRUE
)

nice_names <- data.table(given_name = LETTERS[1:3], nice_name = c('LOW', 'MEDIUM', 'HIGH'))

do_scatter(simdata$V, simdata$meta_data, 'batch') + 
    labs(x = 'PC 1', y = 'PC 2', subtitle = 'Colored by batch', title = 'Before Harmony') + 
do_scatter(simdata$V, simdata$meta_data, 'type', nice_names = nice_names) + 
    labs(x = 'PC 1', y = 'PC 2', subtitle = 'Colored by type') + 
NULL

In this case, Harmony preserves the absolute ordering of cell types. That is, LOW, MEDIUM, and HIGH stay separate but the MEDIUM groups are integrated.

harmonyRes <- HarmonyMatrix(simdata$V, simdata$meta_data, 'batch',
                            theta = 1, lambda = 1, nclust = 20, 
                            max.iter.cluster = 10, do_pca = FALSE, 
                            verbose = FALSE)

do_scatter(harmonyRes, simdata$meta_data, 'batch') + 
    labs(x = 'Harmony 1', y = 'Harmony 2', subtitle = 'Colored by batch') + 
do_scatter(harmonyRes, simdata$meta_data, 'type', nice_names = nice_names) + 
    labs(x = 'Harmony 1', y = 'Harmony 2', subtitle = 'Colored by type') + 
plot_annotation(title = 'Absolute ordering maintained with cell-type specific shifts') + 
NULL

1.5.3 Interpretation

Why does Harmony treat these two situations differently? Following Occam’s razor, Harmony finds the simplest solution. When there is a systematic shift, Harmony can model all 3 cell types with a single linear model. That is, each batch gets 1 correction term, so Harmony can explain all the batch variation with a total of 2 terms. When batch effect is cell-type specific, a single linear model does not suffice. Since each cell type gets its own correction term, Harmony needs 3 linear models to model the data.

1.6 Limits: which part of batch is removed?

There are many cases, in real datasets, in which cell types are so different that Harmony does not integrate them. In this section, we analyze with simulated data how different cell types have to be in order to not be integrated. To quantify the size of batch effect within each cell type, we use Variance Components Analysis. In short, this analysis fits a random effects model for each feature (here, PCs) and computes the variance explained by each variable, such as batch and cell type.

do_vca <- function(eigen.mat, meta, n.eigen, norm_to_pc=FALSE) {
    npc.in <- ifelse(missing(n.eigen), ncol(eigen.mat), n.eigen)
    
    meta <- as.data.frame(meta)
    pred.list <- colnames(meta)
    meta <- droplevels(meta)
    n.preds <- ncol(meta) + 1
    ran.pred.list <- c()
    for (i in 1:ncol(meta)) {
        ran.pred.list <- c(ran.pred.list, paste0("(1|", pred.list[i], ")"))
    }

    formula <- paste(ran.pred.list, collapse = " + ")
    formula <- paste("pc", formula, sep = " ~ ")
    ran.var.mat <- NULL
    for (i in 1:npc.in) {
#         message(sprintf("BEGIN ITER %d", i))
        dat <- cbind(eigen.mat[, i], meta)
        colnames(dat) <- c("pc", colnames(meta))
        suppressMessages({
            Rm1ML <- lme4::lmer(formula, dat, REML = TRUE, verbose = FALSE, 
                na.action = na.omit)            
        })
        var.vec <- unlist(VarCorr(Rm1ML))
        ran.var.mat <- rbind(ran.var.mat, c(var.vec[pred.list], resid = sigma(Rm1ML)^2))
    }
    if (norm_to_pc) {
        ran.var.mat.std <- ran.var.mat/rowSums(ran.var.mat)
    } else {
        ran.var.mat.std <- ran.var.mat/sum(ran.var.mat)
    }
    ran.var.mat.std %>% data.frame() %>% tibble::rowid_to_column("pc") %>% 
        dplyr::mutate(pc_str = paste0("PC", pc)) %>%
        tidyr::gather(variable, val, -pc, -pc_str) %>% 
        dplyr::mutate(val = 100 * val)
    
}

To see how this works, let’s consider two analyses: with no batch and with a sizeable batch effect.

simdata_nobatch <- simulate_clusters(ncells = 500, ngenes = 20,
    xmus = c(-5, 5),xsds = c(1, 1),
    ymus = c(-5, 5), ysds = c(1, 1),
    prop1 = c(.5, .5), prop2 = c(.5, .5), 
    batch_sd=0, add_before = FALSE, 
    seed=2
)

simdata_bigbatch <- simulate_clusters(ncells = 500, ngenes = 20,
    xmus = c(-5, 5), xsds = c(1, 1),
    ymus = c(-5, 5), ysds = c(1, 1),
    prop1 = c(.5, .5), prop2 = c(.5, .5), 
    batch_sd=10, add_before = FALSE, 
    seed=2
)

We can visualize the per-PC variance attributed to batch and cell type. The plots show that when we add batch noise, the contribution of batch increases in PCs 1, 2, and 3.

suppressWarnings({
    vca_nobatch <- data.table(do_vca(simdata_nobatch$V, simdata_nobatch$meta_data))
    vca_bigbatch <- data.table(do_vca(simdata_bigbatch$V, simdata_bigbatch$meta_data))    
})

vca_nobatch %>% 
    ggplot(aes(reorder(pc_str, -pc), val, fill = variable)) + geom_bar(stat = 'identity') + 
    theme_tufte(base_size = 12) + geom_rangeframe() + 
    coord_flip() + 
    labs(y = '% Variance', x = '', title = 'No batch added') + 
    guides(fill = FALSE) + 
    scale_fill_tableau() + 
vca_bigbatch %>% 
    ggplot(aes(reorder(pc_str, -pc), val, fill = variable)) + geom_bar(stat = 'identity') + 
    theme_tufte(base_size = 12) + geom_rangeframe() + 
    coord_flip() + 
    labs(y = '% Variance', x = '', title = 'Batch added') + 
    scale_fill_tableau() + 
    NULL

Now let’s compute the signal (cell type variance) to noise (batch variance) ratio across a range of conditions.

res <- Reduce(rbind, lapply(seq(1, 20, by = 1), function(noise) {

    simdata <- simulate_clusters(ncells = 500, ngenes = 20,
        xmus = c(-5, 5), xsds = c(1, 1),
        ymus = c(-5, 5), ysds = c(1, 1),
        prop1 = c(.5, .5), prop2 = c(.5, .5), 
        batch_sd=noise, add_before = FALSE, 
        seed=1
    )
    suppressWarnings({
        x <- data.table(do_vca(simdata$V, simdata$meta_data))[, sum(val), by = variable]
        snr <- x[variable == 'type', V1] / x[variable == 'batch', V1]        
    })
    
    Reduce(rbind, lapply(c(0, .5, 1, 1.5, 2), function(theta_use) {
        harmonyRes <- HarmonyMatrix(simdata$V, simdata$meta_data, 'batch',
                                    do_pca = FALSE, return_object = TRUE, verbose = FALSE, 
                                    theta = theta_use, lambda = 1, nclust = 20, max.iter.cluster = 10)
        chi2 <- sum(((harmonyRes$O - harmonyRes$E) ^ 2) / harmonyRes$E)
        data.table(noise_sigma = noise, snr = snr, type = x[variable == 'type', V1], resid = x[variable == 'resid', V1], batch = x[variable == 'batch', V1], chi2 = chi2, theta = theta_use)
    }))    
}))

The total variable explained by cell type (red) decreases as we add more batch noise. At sd=8, the signal and noise are roughly equal. Beyond this, the effect of batch exceeds that of cell type.

res %>% 
    dplyr::select(noise_sigma, type, batch, resid) %>% unique() %>% 
    tidyr::gather(key, val, -noise_sigma) %>% 
    ggplot(aes(noise_sigma, val, fill = key)) + 
        geom_bar(stat = 'identity') + 
        labs(x = 'Batch SD', y = '% Variance', fill = '') + 
        theme_tufte(base_size = 12) + geom_rangeframe() + 
        geom_hline(yintercept = 50, linetype = 2) + 
        scale_fill_tableau() + 
        scale_x_continuous(breaks = 1:20) + 
        NULL

To see how well Harmony integrates under each condition, we measure the (chi-squared) independence between batch and Harmony clusters. Below, we plot the Harmony cluster diversity (y-axis) against the signal to noise ratio (x-axis). We noticed that chi-squared had 3 values, corresponding to the following situations:

1) chi2 around 1000: nothing was integrated
2) chi2 around 500: one cell type was integrated
3) chi2 around 0: both cell types were integrated

As the SNR increased, Harmony was more likely to integrate one or both cell types. Moreover, Harmony made more diverse clusters with larger values of theta.

res %>% 
    dplyr::mutate(theta = paste0('Theta = ', theta)) %>%
    dplyr::mutate(num_integrated = case_when(
        chi2 < 200 ~ 'both', 
        chi2 > 800 ~ 'none', 
        TRUE ~ 'one'
    )) %>% 
    ggplot(aes(chi2, log(snr))) + 
        geom_point(shape = 21, aes(color = factor(num_integrated, c('none', 'one', 'both')))) + #, position = position_jitter(width = 50)) + 
        theme_tufte(base_size = 12) + 
        geom_hline(yintercept = 0, linetype = 2) + 
        theme(panel.background = element_rect()) + 
        facet_wrap(~theta, nrow = 1) + 
        scale_color_tableau() + 
        labs(x = 'Chi-squared', y = 'Log Signal to Noise Ratio', color = 'cell types\nintegrated') +
#         scale_y_continuous(trans = 'log10') + 
        coord_flip() + 
        NULL

To see the dependence on theta, we plotted theta against the minimum SNR required to integrate both cell types. Clearly, theta <1 required a positive SNR, while theta >1 did not.

data.table(res)[, .SD[chi2 < 800] %>% with(min(snr)), by = theta] %>% 
    ggplot(aes(theta, log(V1))) + geom_point() + 
        geom_hline(yintercept = 0, linetype = 2) + 
        theme_tufte(base_size = 12) + 
        theme(panel.background = element_rect()) + 
        labs(y = 'Log SNR', x = 'Theta', title = 'Minimum signal to noise ratio\n needed to integrate') + 
        NULL

data.table(res)[, .SD[chi2 < 800][order(snr)] %>% head(1), by = theta]

##    theta noise_sigma       snr     type     resid    batch         chi2
## 1:   0.0           4 3.4198828 75.00211 3.0666962 21.93119   0.10014461
## 2:   0.5           6 1.5191097 58.85392 2.4037087 38.74237 518.45321580
## 3:   1.0           8 0.8525239 45.16992 1.8463299 52.98375 518.69216898
## 4:   1.5          11 0.4507176 30.67923 1.2532444 68.06752 508.68656597
## 5:   2.0          15 0.2421447 19.33998 0.7905186 79.86950   0.09273983

# data.table(res)[, .SD[chi2 < 800][order(snr)] %>% head(1), by = theta][, broom::tidy(lm(log(snr) ~ theta, .SD))]

This analysis demonstrates two important aspects of about theta. First, if the batch effect is small enough (here, SNR > 3.4), then theta=0 is sufficient for integration. Second, setting theta >=1 allows Harmony to work even when the effect of batch is equal to or greather than biology. This behavior powers Harmony to integrate across vastly different types of datasets.

2 Trajectories

We adapted the cluster simulation procedure above to simulate trajectories. In order to ensure that cells have similar distance from the origin, we simulate cells along the unit circle in 2D space. With a fixed radius, we sample cell location in polar coordinates. In other words, each cell gets a random angle to the x-axis.

sample_angle <- function(n, theta_min = 0, theta_max = pi, betaA, betaB) {
    y <- rbeta(n, betaA, betaB) ## sample values from 0 to 1 w/ beta distribution
    z <- y * (theta_max - theta_min) ## scale it 
    z <- z - theta_min ## shift it
    return(z)
}

simulate_trajectory <- function(starts, ends, ncells, ngenes=100, seed=0, batch_sd=1, betaAs = c(.4, .4), betaBs = c(.4, .4)) {
    set.seed(seed)
    proj <- matrix(rnorm(ngenes*ncells), nrow=ngenes, ncol=2)
    
    ## initialize batch 1
    angle1 <- sample_angle(ncells, starts[1], ends[1], betaAs[1], betaBs[1])
    x1 <- cos(angle1)
    y1 <- sin(angle1)
    samples1 <- cbind(x1, y1)
    samples1 <- samples1 + matrix(rnorm(2*ncells, sd = .05), ncells, 2)
    
    ## initialize batch 1
    angle2 <- sample_angle(ncells, starts[2], ends[2], betaAs[2], betaBs[2])
    x2 <- cos(angle2)
    y2 <- sin(angle2)
    samples2 <- cbind(x2, y2)
    samples2 <- samples2 + matrix(rnorm(2*ncells, sd = .05), ncells, 2)
    
    ## projection into high dim space
    A1 <- samples1 %*% t(proj)    
    A2 <- samples2 %*% t(proj)

    ## add gene-specific batch noise
    batch_mat1 <- matrix(rep(rnorm(ngenes, sd = batch_sd), each=ncells), ncol=ngenes)
    batch_mat2 <- matrix(rep(rnorm(ngenes, sd = batch_sd), each=ncells), ncol=ngenes)

    A1 <- A1 + batch_mat1    
    A2 <- A2 + batch_mat2
    
    ## A1, A2: cells x genes
    pca_res <- rbind(A1, A2) %>% cosine_normalize(2) %>% t() %>% svd() ## norm genes only 
    V <- pca_res$v %*% diag(pca_res$d)
    
    return(list(X = rbind(samples1, samples2), 
                V = V, 
                meta_data = data.table(position = c(angle1, angle2),
                                       batch = rep(c('alpha', 'bravo'), c(ncells, ncells))))
          )
}

To see how this works, let’s simulate two datasets, one without and with batch effect.

simdata1 <- simulate_trajectory(ncells = 500, ngenes = 20,
    starts = c(0, 0), 
    ends = c(pi / 2, pi / 2),
    betaAs = c(1, 1),
    betaBs = c(1, 1),
    batch_sd=0,
    seed=1
)

simdata2 <- simulate_trajectory(ncells = 500, ngenes = 20,
    starts = c(0, 0), 
    ends = c(pi / 2, pi / 2),
    betaAs = c(1, 1),
    betaBs = c(1, 1),
    batch_sd=.5,
    seed=1
)

simdata1$V %>% data.frame() %>% 
    do_scatter(simdata1$meta_data, 'batch', do_points = TRUE, do_density = FALSE) + 
    labs(title = 'Without batch effect', x = 'PC1', y = 'PC2') +

simdata2$V %>% data.frame() %>% 
    do_scatter(simdata2$meta_data, 'batch', do_points = TRUE, do_density = FALSE) + 
    labs(title = 'With batch effect', x = 'PC1', y = 'PC2')

Another important feature to simulate is density along the trajectory. In order to vary this, we define a beta distribution for the trajectory position (i.e. angle to x-axis). If we set beta to have parameters (1, 1), we achieves a uniform density. We can play with the parameters to create imbalanced trajectories and simulate rare transition types. We provide a few examples of these below.

\(angle \sim \frac{\pi}{2} Beta(\alpha=0.1, \beta=0.1)\)

## density concentrated towards the ends
simdata <- simulate_trajectory(ncells = 500, ngenes = 20,
    starts = c(0, 0), 
    ends = c(pi / 2, pi / 2),
    betaAs = c(.1, .1),
    betaBs = c(.1, .1),
    batch_sd=.5,
    seed=1
)

simdata$V %>% data.frame() %>% 
    do_scatter(simdata$meta_data, 'batch', do_points = TRUE, do_density = FALSE) + 
    labs(title = 'After high dim projection', subtitle = 'After adding batch', x = 'PC1', y = 'PC2')

\(angle \sim \frac{\pi}{2} Beta(\alpha=0.3, \beta=1)\)

## density concentrated towards the ends
simdata <- simulate_trajectory(ncells = 500, ngenes = 20,
    starts = c(0, 0), 
    ends = c(pi / 2, pi / 2),
    betaAs = c(.3, .3),
    betaBs = c(1, 1),
    batch_sd=.5,
    seed=1
)

simdata$V %>% cosine_normalize(1) %>% data.frame() %>% 
    do_scatter(simdata$meta_data, 'batch', do_points = TRUE, do_density = FALSE) + 
    labs(title = 'After high dim projection', subtitle = 'After adding batch', x = 'PC1', y = 'PC2')

2.1 Soft clusters model smooth trajectory

In this section, let’s see how Harmony deals with integrating trajectories in which both datasets have the full length of the trajectory.

## density concentrated towards the ends
simdata <- simulate_trajectory(ncells = 500, ngenes = 20,
    starts = c(0, 0), 
    ends = c(pi / 2, pi / 2),
    betaAs = c(1, 1),
    betaBs = c(1, 1),
    batch_sd=.5,
    seed=1
)

harmonyObj <- harmony::HarmonyMatrix(
    simdata$V, theta = 1, lambda = 1, do_pca = FALSE, 
    meta_data = simdata$meta_data, 
    vars_use = 'batch', nclust = 10, 
    max.iter.harmony = 10, max.iter.cluster = 10, 
    return_object = TRUE, verbose = FALSE
)

do_scatter(t(harmonyObj$Z_orig), simdata$meta_data, 'batch', no_guides = TRUE, do_labels = FALSE) + 
    labs(x = 'PC1', y = 'PC2', title = 'Original Space') + 
do_scatter(t(harmonyObj$Z_corr), simdata$meta_data, 'batch', no_guides = FALSE, do_labels = FALSE) + 
    labs(x = 'Harmony1', y = 'Harmony2', title = 'Corrected Space') + 
NULL

It looks like Harmony worked here. Let’s take a peek into the soft cluster assignments.

clust_order <- hclust(as.dist(cor(t(harmonyObj$R)))) %>% with(order(order))


clusters_df <- harmonyObj$R %>% t() %>% data.table() %>% 
    tibble::rowid_to_column('id') %>%
    tidyr::gather(cluster, r, -id) %>% 
    dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)) %>% 
    dplyr::mutate(cluster = factor(cluster, paste0('Cluster ', clust_order)))

First, let’s see them in a scatterplot with uncorrected PC1 and PC2. Color denotes the probability that a cell is assigned to that cluster. We can see two things here. First, some clusters are redundant (e.g. 2, 3, and 9). Second, There seems to be some smooth overlap between non-redundant clusters (e.g. 9 and 4).

t(harmonyObj$Z_orig) %>% data.frame() %>%
    cbind(simdata$meta_data) %>% 
    tibble::rowid_to_column('id') %>% 
    dplyr::inner_join(clusters_df, by = 'id') %>% 
    dplyr::sample_frac(1L) %>% 
    ggplot(aes(X1, X2, color = r)) + 
        geom_point(shape = '.') + 
        theme_tufte(base_size = 12) + theme(panel.background = element_rect()) + 
        facet_wrap(~cluster, nrow = 2) + 
        scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) + 
        labs(x = 'PC1', y = 'PC2', title = 'Probabilistic cluster assignments')

To visualize the smooth overlap in non-redundant clusters better, let’s take a look at the heatmap of the clusters’ correlation.

pheatmap::pheatmap(cor(t(harmonyObj$R)), 
                   labels_col = paste0('Cluster ', 1:10),
                   labels_row = paste0('Cluster ', 1:10)
                  )

Finally, we can visualize this smooth overlap in a third way. The plots below summarize the assignments (y-axis) of clusters over trajectory position (x-axis). Again, we see that clusters 2, 3, and 9 are redundant. However, now it is clearer that clusters 2 and 4 share some overlapping region of the trajectory.

clusters_df %>% 
    dplyr::inner_join(simdata$meta_data %>% tibble::rowid_to_column('id'), by = 'id') %>% 
    ggplot(aes(position, 100 * r)) + 
        geom_smooth(color = 'black', method = 'loess') + 
        facet_grid(cluster ~ batch, scales = 'free') + 
        labs(x = 'Trajectory position', y = 'Probability') + 
        theme_tufte(base_size = 14) + theme(panel.background = element_rect())

2.2 Rare transition types

How does Harmony deal with rare transition types? There is certainly the danger that Harmony will ignore these cells and split the trajectory into 2 discrete groups. Let’s see if that happens!

## density concentrated towards the ends
simdata <- simulate_trajectory(ncells = 500, ngenes = 20,
    starts = c(0, 0), 
    ends = c(pi / 2, pi / 2),
    betaAs = c(.1, .1),
    betaBs = c(.1, .1),
    batch_sd=.5,
    seed=1
)

harmonyObj <- harmony::HarmonyMatrix(
    simdata$V, ## PCA embedding matrix of cells
    theta = 1, 
    lambda = 1, 
    meta_data = simdata$meta_data, ## dataframe with cell labels
    vars_use = 'batch', ## (list of) variable(s) we'd like to Harmonize out
    nclust = 10, ## number of clusters in Harmony model
    max.iter.harmony = 10, do_pca = FALSE, 
    max.iter.cluster = 10, 
    return_object = TRUE, ## return the full Harmony model object, not just the corrected PCA matrix
    verbose = FALSE
)

In fact, it seems that Harmony models the rare and dense regions of space and merges accordingly.

do_scatter(t(harmonyObj$Z_orig), simdata$meta_data, 'batch', no_guides = TRUE, do_labels = FALSE) + 
    labs(x = 'PC1', y = 'PC2', title = 'Original Space') + 
do_scatter(t(harmonyObj$Z_corr), simdata$meta_data, 'batch', no_guides = FALSE, do_labels = FALSE) + 
    labs(x = 'Harmony1', y = 'Harmony2', title = 'Corrected Space') + 
NULL

2.3 Pitfall: merging incomplete trajectories

While rare transition types seem to be handled robustly, there is another situation that Harmony does not handle well. Here, batch alpha has the full trajectory but batch B only has about 1/5 of the trajectory.

## density concentrated towards the ends
simdata <- simulate_trajectory(ncells = 500, ngenes = 20,
    starts = c(0, 0), 
    ends = c(pi / 2, pi / 2),
    betaAs = c(1, 1),
    betaBs = c(1, 1),
    batch_sd=.5,
    seed=1
)

idx_use <- which(simdata$meta_data$batch == 'alpha' | simdata$meta_data$position < .1 * pi)
simdata$meta_data <- simdata$meta_data[idx_use, ]
simdata$V <- simdata$V[idx_use, ]
simdata$X <- simdata$X[idx_use, ]
table(simdata$meta_data$batch)

## 
## alpha bravo 
##   500    96

harmonyObj <- harmony::HarmonyMatrix(simdata$V, simdata$meta_data, vars_use = 'batch', do_pca = FALSE, verbose = FALSE, 
                                     theta = 1, nclust = 10, max.iter.harmony = 10, max.iter.cluster = 10,  
                                     return_object = TRUE)

With these incomplete trajectories, Harmony moves the corresponding cells from both batches towards each other to merge them correctly. However, it leavs the rest of the cells from batch alpha behind! As a result, the smooth trajectory in batch alpha is now broken into two clusters.

do_scatter(t(harmonyObj$Z_orig), simdata$meta_data, 'batch', no_guides = TRUE, do_labels = FALSE) + 
    labs(x = 'PC1', y = 'PC2', title = 'Original Space') + 
do_scatter(t(harmonyObj$Z_corr), simdata$meta_data, 'batch', no_guides = FALSE, do_labels = FALSE) + 
    labs(x = 'Harmony1', y = 'Harmony2', title = 'Corrected Space') + 
NULL

2.4 Reference mapping

Perhaps a better correction would be to leave the alpha batch alone and only move cells from the bravo batch. This situation is akin to reference mapping. That is, we label batch alpha as a reference batch and don’t move it. We then map all other batches (here, only bravo) to the reference. Harmony now has a feature that allows you to do just this! All you need to do is specify reference_values=c(‘alpha’) and now alpha cells will not be moved.

harmonyObj <- harmony::HarmonyMatrix(simdata$V, simdata$meta_data, vars_use = 'batch', do_pca = FALSE, verbose = FALSE, 
                                     theta = 1, nclust = 10, max.iter.harmony = 10, max.iter.cluster = 10,  
                                     return_object = TRUE, reference_values = c('alpha'))

Now we merged the bravo cells correctly without distorting the alpha batch.

do_scatter(t(harmonyObj$Z_orig), simdata$meta_data, 'batch', no_guides = TRUE, do_labels = FALSE) + 
    labs(x = 'PC1', y = 'PC2', title = 'Original Space') + 
do_scatter(t(harmonyObj$Z_corr), simdata$meta_data, 'batch', no_guides = FALSE, do_labels = FALSE) + 
    labs(x = 'Harmony1', y = 'Harmony2', title = 'Corrected Space') + 
NULL

This is not a solution for all such cases. In particular, we needed to know that the alpha batch is complete while the bravo one is not. This is an important limitation of Harmony that needs to be addressed in future development.

3 Session Info

sessionInfo()

## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] harmony_0.1.0      Rcpp_1.0.1         ggrepel_0.8.0.9000
##  [4] lme4_1.1-21        Matrix_1.2-17      pheatmap_1.0.12   
##  [7] patchwork_0.0.1    forcats_0.4.0      stringr_1.4.0     
## [10] dplyr_0.8.0.1      purrr_0.3.2        readr_1.3.1       
## [13] tidyr_0.8.3        tibble_2.1.1       ggplot2_3.1.0     
## [16] tidyverse_1.2.1    openxlsx_4.1.0     ggthemes_4.1.0    
## [19] data.table_1.12.0  BiocStyle_2.10.0  
## 
## loaded via a namespace (and not attached):
##  [1] lubridate_1.7.4    lattice_0.20-38    assertthat_0.2.1  
##  [4] digest_0.6.18      R6_2.4.0           cellranger_1.1.0  
##  [7] plyr_1.8.4         backports_1.1.3    evaluate_0.13     
## [10] httr_1.4.0         pillar_1.3.1       rlang_0.3.3       
## [13] lazyeval_0.2.2     readxl_1.3.1       rstudioapi_0.10   
## [16] minqa_1.2.4        nloptr_1.2.1       rmarkdown_1.12    
## [19] labeling_0.3       splines_3.5.0      munsell_0.5.0     
## [22] broom_0.5.1        compiler_3.5.0     modelr_0.1.4      
## [25] xfun_0.6           pkgconfig_2.0.2    htmltools_0.3.6   
## [28] tidyselect_0.2.5   bookdown_0.9       codetools_0.2-16  
## [31] crayon_1.3.4       withr_2.1.2        MASS_7.3-51.4     
## [34] grid_3.5.0         nlme_3.1-137       jsonlite_1.6      
## [37] gtable_0.3.0       magrittr_1.5       scales_1.0.0      
## [40] zip_2.0.1          cli_1.1.0          stringi_1.4.3     
## [43] reshape2_1.4.3     xml2_1.2.0         generics_0.0.2    
## [46] boot_1.3-20        RColorBrewer_1.1-2 tools_3.5.0       
## [49] glue_1.3.1         hms_0.4.2          yaml_2.2.0        
## [52] colorspace_1.4-1   BiocManager_1.30.4 rvest_0.3.2       
## [55] knitr_1.22         haven_2.1.0

Simulated dataset integration

9 April 2019

Contents