"""
Supplementary Note 5: Compare gene expression levels

Authors: Eugene Oh, Annemarie Becker

inputFileP1:
read densities per gene on plus strand for sample 1 (Supplementary Note 4)
    col0: gene name
    col1: start coordinate of gene
    col2: stop coordinate of gene
    col3: RPKM-normalized sum of read densities

inputFileM1:
read densities per gene on minus strand for sample 1 (Supplementary Note 4)
    col0: gene name
    col1: start coordinate of gene
    col2: stop coordinate of gene
    col3: RPKM-normalized sum of read densities

inputFileP2:
read densities per gene on plus strand for sample 2 (Supplementary Note 4)
    col0: gene name
    col1: start coordinate of gene
    col2: stop coordinate of gene
    col3: RPKM-normalized sum of read densities

inputFileM2:
read densities per gene on minus strand for sample 2 (Supplementary Note 4)
    col0: gene name
    col1: start coordinate of gene
    col2: stop coordinate of gene
    col3: RPKM-normalized sum of read densities

inputNumber1:
file whose first line holds the total read number of sample 1 as float (Supplementary Note 3)

inputNumber2:
file whose first line holds the total read number of sample 2 as float (Supplementary Note 3)

outputFile: 
RPKM-normalized comparison of gene expression levels from samples 1 and 2
    col0: gene name
    col1: start coordinate of gene
    col2: stop coordinate of gene
    col3: RPKM-normalized gene expression levels for sample 1
    col4: RPKM-normalized gene expression levels for sample 2
    
"""


def _read_gene_rows(paths):
    """Return the whitespace-split rows of every file in *paths*, in order.

    Blank lines are skipped so that they cannot produce empty rows.
    """
    rows = []
    for path in paths:
        with open(path, 'r') as handle:
            for line in handle:
                fields = line.split()
                if fields:      # an empty row would break column access later
                    rows.append(fields)
    return rows


def _read_total_reads(path):
    """Return the number on the first line of *path*, truncated to an int."""
    with open(path, 'r') as handle:
        return int(float(handle.readline()))


def expression(inputFileP1, inputFileM1, inputFileP2, inputFileM2, inputNumber1, inputNumber2, outputFile, cutoff=100):
    """Write RPKM-normalized expression levels of two samples side by side.

    For each sample the plus strand file is read first and the minus strand
    file is appended to the same list. Rows of sample 1 and sample 2 are then
    paired by position, so both samples must list the same genes in the same
    order.

    Parameters:
        inputFileP1, inputFileM1: plus/minus strand gene files of sample 1
            (col0 gene name, col1 start, col2 stop, col3 value per gene).
        inputFileP2, inputFileM2: plus/minus strand gene files of sample 2,
            same layout.
        inputNumber1, inputNumber2: files whose first line holds the total
            read number of sample 1 and sample 2, respectively.
        outputFile: path of the tab-separated output file
            (gene name, start, stop, value sample 1, value sample 2).
        cutoff: a gene is written only if the sum of its col3 values from
            both samples is strictly greater than this number. The default
            of 100 is the value that was previously hard-coded; the original
            comment notes it has to be determined by variability analysis.

    Raises:
        ValueError: if the two samples contain a different number of genes or
            a pair of rows names different genes. Previously such input was
            truncated or combined silently.
    """

  # Upload input files for both samples (plus strand first, then minus strand)

    rows1 = _read_gene_rows((inputFileP1, inputFileM1))
    rows2 = _read_gene_rows((inputFileP2, inputFileM2))

    if len(rows1) != len(rows2):
        raise ValueError('sample 1 has %d genes but sample 2 has %d'
                         % (len(rows1), len(rows2)))

  # Total read numbers used for the normalization

    totalReads1 = _read_total_reads(inputNumber1)
    totalReads2 = _read_total_reads(inputNumber2)

  # Normalize read densities per gene by RPKM metric using a comparison cutoff value

    results = []

    for row1, row2 in zip(rows1, rows2):
        if row1[0] != row2[0]:
            raise ValueError('gene order differs between samples: %r vs %r'
                             % (row1[0], row2[0]))
        val1 = float(row1[3])
        val2 = float(row2[3])
        if val1 + val2 > cutoff:        # combined value of both samples
            start = int(row1[1])        # coordinates are taken from sample 1
            stop = int(row1[2])
            length = abs(start - stop) + 1      # gene length
            rpkm1 = val1 / totalReads1 * 1000000 / length * 1000
            rpkm2 = val2 / totalReads2 * 1000000 / length * 1000
            results.append((row1[0], start, stop, rpkm1, rpkm2))

  # Output file (opened only after all rows were computed successfully)

    with open(outputFile, 'w') as outFile:
        for record in results:
            outFile.write('\t'.join(str(value) for value in record) + '\n')
        


if __name__ == '__main__':
    # Fill in the file paths below before running this script; every entry
    # is passed to expression() under the parameter name used as its key.
    arguments = dict(
        inputFileP1='',
        inputFileM1='',
        inputFileP2='',
        inputFileM2='',
        inputNumber1='',
        inputNumber2='',
        outputFile='',
    )

    expression(**arguments)