#!/usr/bin/env Rscript

# define RPKM cutoffs for a gene to be considered (non-)expressed
low_fpkm <- 3
high_fpkm <- 13

# parse command-line arguments
args <- commandArgs(trailingOnly=T)
if (length(args) < 3 || length(args) > 4)
	stop("Usage: expression_comparison.R cup_sample_id fpkms_CUP_reference_validation.tsv.gz results_file.tsv [exclude_liver_genes.txt]")

# load FPKM values
fpkms_file <- args[2]
fpkms <- read.table(fpkms_file, header=T, sep="\t", row.names=NULL)
rownames(fpkms) <- fpkms$gene
fpkms <- fpkms[,colnames(fpkms) != "gene"]

# drop genes to ignore
ignore_genes <- rep(F, nrow(fpkms))
if (length(args) == 4)
	ignore_genes <- rownames(fpkms) %in% read.table(args[4], header=F)$V1
fpkms <- fpkms[apply(fpkms, 1, function(x) {any(x > high_fpkm)}) & !ignore_genes,]

# get FPKM values of CUP sample and remove non-reference samples
cup_sample <- gsub("-", ".", args[1], perl=T)
if (!(cup_sample %in% colnames(fpkms)))
	stop("Invalid CUP sample ID")
cup_sample_fpkms <- fpkms[,cup_sample]
fpkms <- fpkms[,grep("^Ref", colnames(fpkms))]

# for all possible pairs of samples in the reference cohort
# - find genes that are expressed at a high level in sample1 and a low level in sample2 and vice versa => marker genes
# - calculate the fraction of genes that are high-expressed in the CUP sample *and* in sample1
# - calculate the fraction of genes that are high-expressed in the CUP sample *and* in sample2
# - the difference between these fractions is the score (negative score means more similar)
# - store the score in a N x N matrix
comparison <- matrix(NA, nrow=ncol(fpkms), ncol=ncol(fpkms))
colnames(comparison) <- colnames(fpkms)
rownames(comparison) <- colnames(fpkms)
for (sample1 in rownames(comparison)) {
	for (sample2 in colnames(comparison)) {
		if (which(rownames(comparison) == sample1) < which(colnames(comparison) == sample2)) { # the matrix is symmetrical => skip comparisons above the diagonal
			total1 <- fpkms[,sample1] > high_fpkm & fpkms[,sample2] < low_fpkm
			total2 <- fpkms[,sample2] > high_fpkm & fpkms[,sample1] < low_fpkm
			if (any(total1) && any(total2)) {
				genes1 <- sum(total1 & cup_sample_fpkms > high_fpkm)
				genes2 <- sum(total2 & cup_sample_fpkms > high_fpkm)
				comparison[sample1, sample2] <- genes1/sum(total1) - genes2/sum(total2)
				comparison[sample2, sample1] <- genes2/sum(total2) - genes1/sum(total1)
			}
		}
	}
}

# for each sample count how often it had a negative score (i.e., the CUP sample was more similar to this sample)
similarityCount <- apply(comparison, 2, function(x) sum(x<0,na.rm=T))
# scale the scores for better visualization of differences in the heatmap
similarityScore <- 0.5-0.5*apply(comparison, 2, function (x) mean(x, na.rm=T))
# rank samples by the number of comparisons won; break ties by similarity score
ranks <- order(similarityCount*1000 + similarityScore, decreasing=T)
comparison <- comparison[ranks,ranks]
write.table(data.frame(Sample=colnames(comparison), SimilarityCount=similarityCount[ranks], SimilarityScore=similarityScore[ranks]), args[3], quote=F, row.names=F, sep="\t")