"""Relevant Kalman filter functions.

Includes:
    simulate_lognormal_replicates(): Generates a set of replicates given a
        prescribed noise level under the lognormal noise model.
    kf(): A simplified 1D implementation of the Kalman filter.
    run_kf_RNA(): Performs the KF on each nucleotide of an RNA with input
        replicate profiles and returns the filtered profile.
"""

import numpy as np


def _not_provided(value):
    """Return True when an optional argument was left unset.

    Both ``None`` and an empty sequence/array (the historical ``[]``
    default) count as "not provided". The check avoids ``value == []``,
    which is an element-wise comparison when ``value`` is a NumPy array
    or NumPy scalar and cannot be used reliably in an ``if``.
    """
    return value is None or np.size(value) == 0


def simulate_lognormal_replicates(profile, N_reps=3, sd_min=0.5, sd_max=1.1,
                                  sd_in=None):
    """Return simulated replicates under the log-normal noise model.

    The profile is transformed to the log domain, zero-mean Gaussian noise
    is added to generate N_reps replicates, and the replicates are
    transformed back to the data domain. -999 is used to indicate missing
    data.

    Args:
        profile: Length-N sequence. Original reactivity profile without
            noise (i.e. ground truth), in the data domain. Entries that are
            not greater than -500 (i.e. -999/missing) receive no noise.
        N_reps: The number of replicates to be simulated.
        sd_min, sd_max: Bounds of the Uniform(sd_min, sd_max) distribution
            from which a log-domain standard deviation is drawn for each
            nucleotide. If sd_min == sd_max, the same noise level is used
            for all nucleotides. Ignored if sd_in is provided.
        sd_in: Optional length-N sequence of log-domain noise standard
            deviations, one per nucleotide. None or an empty sequence means
            "not provided".

    Returns:
        replicates: [N_reps x N] float array of replicate profiles in the
            data domain. Entries whose log-domain value is NaN (missing or
            negative input) are reported as -999.
        sd: Noise level per nucleotide: sd_in itself when provided, a list
            of N copies of sd_min when sd_min == sd_max, otherwise the
            array drawn from the uniform distribution.

    Raises:
        ValueError: If sd_in has fewer entries than profile.
    """
    profile_arr = np.asarray(profile, dtype=float)
    # Number of nucleotides in RNA sequence.
    N = len(profile_arr)

    if _not_provided(sd_in):
        if sd_min == sd_max:
            # Noise level is the same for all nucleotides.
            sd_out = [sd_min for _ in range(N)]
        else:
            # Uniformly sample one standard deviation per nucleotide.
            sd_out = np.random.uniform(sd_min, sd_max, size=N)
    else:
        # Use the input array of standard deviation values.
        sd_out = sd_in

    # Work on a float array copy; sd_out is returned in its original form.
    sd_arr = np.asarray(sd_out, dtype=float).ravel()
    if sd_arr.size < N:
        raise ValueError(
            "sd_in must provide a standard deviation for each of the "
            "%d profile entries (got %d)." % (N, sd_arr.size)
        )
    sd_arr = sd_arr[:N]

    # Transform profile to the log domain. Missing (-999) and negative
    # entries become NaN and zeros become -inf; both are expected here, so
    # the corresponding floating-point warnings are silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        log_profile = np.log(profile_arr)

    # Every replicate starts as a copy of the noise-free log profile.
    log_replicates = np.tile(log_profile, (N_reps, 1))

    # Noise is added only where the noise level is non-zero and the entry
    # is not flagged as missing. Draws are made only for those positions,
    # one replicate (row) after another.
    noisy = (sd_arr != 0) & (profile_arr > -500)
    n_noisy = int(np.count_nonzero(noisy))
    if N_reps > 0 and n_noisy > 0:
        log_replicates[:, noisy] += np.random.normal(
            0.0, sd_arr[noisy], size=(N_reps, n_noisy)
        )

    # Revert to data domain. Replace NaN with -999.
    replicates = np.where(
        np.isnan(log_replicates), -999.0, np.exp(log_replicates)
    )
    return replicates, sd_out


def kf(measurements, prior, R=None, Q=0):
    """Perform the Kalman filter on input measurements.

    Args:
        measurements: Length-N sequence of measurements. inf and NaN
            measurements are ignored, both when estimating the measurement
            variance and when updating the estimate.
        prior: (mean, standard_deviation) of the Gaussian distribution used
            as prior in the KF.
        R: Measurement noise variance. If not provided (None or an empty
            sequence), the sample variance (ddof=1) of the finite
            measurements is used. With fewer than two finite measurements
            that variance is undefined (NaN); a single finite measurement
            then yields a NaN estimate.
        Q: Process noise variance.

    Returns:
        x_hat: Final KF estimate. If there are no finite measurements this
            is the prior mean.
        P: Variance of the final KF estimate. If the measurement variance
            is zero, the first finite measurement is returned together with
            that zero variance.
    """
    # Initial guess of x is the mean of the prior distribution.
    x_hat = prior[0]
    # Initial guess of x variance (variance of the prior).
    P = prior[1] ** 2

    # Drop missing (NaN) and infinite measurements once, up front, so they
    # cannot contaminate the variance estimate below.
    values = np.asarray(measurements, dtype=float).ravel()
    valid = values[np.isfinite(values)]

    # Variance in the measurements.
    if _not_provided(R):
        if valid.size > 1:
            R = np.std(valid, ddof=1) ** 2
        else:
            # Sample variance is undefined for fewer than two values.
            R = np.nan

    if R == 0 and valid.size > 0:
        # If there is no variance in the measurements, return measurement.
        return valid[0], R

    # Otherwise, continue with Kalman filter.
    for measurement in valid:
        # Predict step.
        P = P + Q
        # Kalman gain.
        K = P / (P + R)
        # Update prediction.
        x_hat = x_hat + K * (measurement - x_hat)
        P = (1 - K) * P
    return x_hat, P


def run_kf_RNA(replicates, prior, R=None):
    """Perform the Kalman filter on an RNA.

    For each nucleotide, calls the kf() routine on the log-transformed
    replicate measurements of that nucleotide.

    Args:
        replicates: [N_reps x N] array of replicates in the data domain.
            N_reps is the number of replicates and N the length of the RNA.
            Missing values (-999) become NaN in the log domain and are
            ignored by kf(); zeros become -inf and are ignored as well.
        prior: (mean, standard_deviation) of the Gaussian distribution used
            as prior in the KF. The KF is run in the log domain, so this
            prior is on the log measurement distribution.
        R: Optional length-N sequence of per-nucleotide log-domain
            measurement noise values. R[idx] is passed unchanged to kf() as
            its measurement noise variance. If not provided (None or an
            empty sequence), kf() uses the sample variance of the log
            measurements for each nucleotide.
            NOTE(review): earlier documentation described these values as
            standard deviations, but kf() treats them as variances.
            Confirm which one callers supply.

    Returns:
        kf_profile: Length-N array with the filtered profile, in the data
            domain.
    """
    data = np.atleast_2d(np.asarray(replicates, dtype=float))

    # Missing (-999) and zero entries are expected; silence the warnings
    # np.log would emit for them.
    with np.errstate(divide="ignore", invalid="ignore"):
        log_data = np.log(data)

    use_sample_variance = _not_provided(R)

    # Length of RNA.
    N = log_data.shape[1]
    kf_profile = np.zeros(N)

    # Run kf() for each nucleotide.
    for idx in range(N):
        R_idx = None if use_sample_variance else R[idx]
        estimate, _ = kf(log_data[:, idx], prior, R=R_idx)
        kf_profile[idx] = np.exp(estimate)
    return kf_profile