#!/usr/bin/env Rscript # define RPKM cutoffs for a gene to be considered (non-)expressed low_fpkm <- 3 high_fpkm <- 13 # parse command-line arguments args <- commandArgs(trailingOnly=T) if (length(args) < 3 || length(args) > 4) stop("Usage: expression_comparison.R cup_sample_id fpkms_CUP_reference_validation.tsv.gz results_file.tsv [exclude_liver_genes.txt]") # load FPKM values fpkms_file <- args[2] fpkms <- read.table(fpkms_file, header=T, sep="\t", row.names=NULL) rownames(fpkms) <- fpkms$gene fpkms <- fpkms[,colnames(fpkms) != "gene"] # drop genes to ignore ignore_genes <- rep(F, nrow(fpkms)) if (length(args) == 4) ignore_genes <- rownames(fpkms) %in% read.table(args[4], header=F)$V1 fpkms <- fpkms[apply(fpkms, 1, function(x) {any(x > high_fpkm)}) & !ignore_genes,] # get FPKM values of CUP sample and remove non-reference samples cup_sample <- gsub("-", ".", args[1], perl=T) if (!(cup_sample %in% colnames(fpkms))) stop("Invalid CUP sample ID") cup_sample_fpkms <- fpkms[,cup_sample] fpkms <- fpkms[,grep("^Ref", colnames(fpkms))] # for all possible pairs of samples in the reference cohort # - find genes that are expressed at a high level in sample1 and a low level in sample2 and vice versa => marker genes # - calculate the fraction of genes that are high-expressed in the CUP sample *and* in sample1 # - calculate the fraction of genes that are high-expressed in the CUP sample *and* in sample2 # - the difference between these fractions is the score (negative score means more similar) # - store the score in a N x N matrix comparison <- matrix(NA, nrow=ncol(fpkms), ncol=ncol(fpkms)) colnames(comparison) <- colnames(fpkms) rownames(comparison) <- colnames(fpkms) for (sample1 in rownames(comparison)) { for (sample2 in colnames(comparison)) { if (which(rownames(comparison) == sample1) < which(colnames(comparison) == sample2)) { # the matrix is symmetrical => skip comparisons above the diagonal total1 <- fpkms[,sample1] > high_fpkm & fpkms[,sample2] < low_fpkm total2 <- fpkms[,sample2] > high_fpkm & fpkms[,sample1] < low_fpkm if (any(total1) && any(total2)) { genes1 <- sum(total1 & cup_sample_fpkms > high_fpkm) genes2 <- sum(total2 & cup_sample_fpkms > high_fpkm) comparison[sample1, sample2] <- genes1/sum(total1) - genes2/sum(total2) comparison[sample2, sample1] <- genes2/sum(total2) - genes1/sum(total1) } } } } # for each sample count how often it had a negative score (i.e., the CUP sample was more similar to this sample) similarityCount <- apply(comparison, 2, function(x) sum(x<0,na.rm=T)) # scale the scores for better visualization of differences in the heatmap similarityScore <- 0.5-0.5*apply(comparison, 2, function (x) mean(x, na.rm=T)) # rank samples by the number of comparisons won; break ties by similarity score ranks <- order(similarityCount*1000 + similarityScore, decreasing=T) comparison <- comparison[ranks,ranks] write.table(data.frame(Sample=colnames(comparison), SimilarityCount=similarityCount[ranks], SimilarityScore=similarityScore[ranks]), args[3], quote=F, row.names=F, sep="\t")