"""
Supplementary Note 12: Meta-gene analysis from start codon

Authors: Eugene Oh, Annemarie Becker

inputFileP:
read density file for plus strand (Supplementary Note 2)
    col0: position along genome
    col1: read density at that position

inputFileM:
read density file for minus strand (Supplementary Note 2)
    col0: position along genome
    col1: read density at that position

inputListP:
E. coli MC4100 gene list of the plus strand
    col0: gene name
    col1: start coordinate of gene
    col2: stop coordinate of gene

inputListM:
E. coli MC4100 gene list of the minus strand
    col0: gene name
    col1: start coordinate of gene
    col2: stop coordinate of gene

outputFile:
average read density along an average transcript
    col0: position along the average transcript, counted from the start codon
    col1: mean read density at that position
"""

# Window of positions (relative to the start coordinate) that is profiled.
WINDOW_START = -50
WINDOW_END = 1500  # inclusive


def _read_density(path):
    """Load a whitespace-separated density file into {position: density}."""
    density = {}
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines instead of raising IndexError
            density[int(fields[0])] = float(fields[1])
    return density


def _read_genes(path):
    """Load a gene list and return [(start, stop), ...] in file order.

    The gene name (col0) is not needed for the calculation and is dropped.
    """
    genes = []
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines instead of raising IndexError
            genes.append((int(fields[1]), int(fields[2])))
    return genes


def _accumulate(density, genes, strand, sums, counts, minLength, minReads):
    """Add the normalised profile of every accepted gene to sums/counts.

    strand is +1 for the plus strand (positions grow away from the start)
    and -1 for the minus strand (positions shrink away from the start).
    sums and counts are updated in place.
    """
    for start, stop in genes:
        length = abs(start - stop) + 1
        if length <= minLength:
            continue

        # Expression level: total density over the gene body. On the minus
        # strand the start coordinate is the larger one, so the bounds swap.
        if strand > 0:
            low, high = start, stop
        else:
            low, high = stop, start
        total = 0
        for pos in range(low, high + 1):
            if pos in density:
                total += density[pos]
        if total <= minReads:
            continue

        # Average density per base; dividing by it makes genes comparable
        # regardless of their expression level.
        meanNorm = total / float(length)

        for offset in range(WINDOW_START, WINDOW_END + 1):
            if offset > length:
                break  # offsets are ascending, nothing further can qualify
            counts[offset] += 1  # this gene contributes to this position
            coord = start + strand * offset
            if coord in density:
                sums[offset] += density[coord] / meanNorm


def metagene(inputFileP, inputFileM, inputListP, inputListM, outputFile,
             minLength=400, minReads=50):
    """Write the average normalised read density around the start codon.

    For every gene longer than minLength whose summed density exceeds
    minReads, the density at each position from WINDOW_START to WINDOW_END
    (relative to the gene's start coordinate, and not beyond the gene
    length) is divided by the gene's mean density per base. These values
    are averaged over all genes that cover the position.

    Parameters:
        inputFileP, inputFileM: density files for the plus / minus strand.
        inputListP, inputListM: gene lists for the plus / minus strand.
        outputFile: path of the tab-separated result (position, mean).
        minLength: genes must be longer than this many nt (default 400).
        minReads: summed density over the gene must exceed this (default 50).

    Positions that no accepted gene covers are written with a mean of 0.0
    rather than aborting with a division by zero.
    """
    positions = range(WINDOW_START, WINDOW_END + 1)
    sums = dict((offset, 0.0) for offset in positions)
    counts = dict((offset, 0) for offset in positions)

    ### Plus Strand ###
    _accumulate(_read_density(inputFileP), _read_genes(inputListP), 1,
                sums, counts, minLength, minReads)

    ### Minus Strand ###
    _accumulate(_read_density(inputFileM), _read_genes(inputListM), -1,
                sums, counts, minLength, minReads)

    ### Output data ###
    with open(outputFile, 'w') as outFile:
        for offset in positions:
            covered = counts[offset]
            # Second normalisation: divide by the number of contributing genes.
            mean = sums[offset] / covered if covered else 0.0
            outFile.write(str(offset) + '\t' + str(mean) + '\n')


if __name__ == '__main__':
    # Fill in the file paths before running the script.
    inputFileP = ''
    inputFileM = ''
    inputListP = ''
    inputListM = ''
    outputFile = ''

    metagene(inputFileP, inputFileM, inputListP, inputListM, outputFile)