"""Relevant Kalman filter functions.
Includes:
    simulate_lognormal_replicates(): Generates a set of replicates
        given a prescribed noise level under the lognormal noise
        model.
    kf(): A simplified 1D implementation of the Kalman filter.
    run_kf_RNA(): Performs the KF on each nucleotide 
        of an RNA with input replicate profiles and returns
        the filtered profile.
"""
import numpy as np


def simulate_lognormal_replicates(profile, N_reps=3, sd_min=0.5, sd_max=1.1,
        sd_in=None):
    """Return simulated replicates of a profile under log-normal noise.

    The profile is transformed to the log domain, zero-mean Gaussian noise
    is added independently to every nucleotide of every replicate, and the
    result is transformed back to the data domain.

    -999 is used to indicate missing data.

    Args:
        profile: Length-N sequence. Original reactivity profile without
            noise (i.e. ground truth), in the data domain. Missing entries
            (-999) stay missing in every replicate.
        N_reps: The number of replicates to be simulated.
        sd_min, sd_max: Bounds of the Uniform(sd_min, sd_max) distribution
            from which a log-domain standard deviation is drawn for each
            nucleotide. If sd_min == sd_max, every nucleotide gets that
            same value. Ignored when sd_in is provided.
        sd_in: Optional length-N sequence (list or NumPy array) of
            log-domain noise standard deviations, one per nucleotide.
            None or an empty sequence means "not provided".

    Returns:
        replicates: [N_reps x N] float array of replicate profiles in the
            data domain. Any entry whose log-domain value is NaN (missing
            or negative input) is reported as -999.
        sd: The log-domain standard deviations that were used: sd_in itself
            when provided, otherwise a list (constant case) or array
            (sampled case) of length N.
    """
    # Number of nucleotides in RNA sequence
    N = len(profile)

    # Comparing a NumPy array with `[]` is elementwise and does not yield a
    # plain boolean, so test for "not provided" via None / length instead.
    if sd_in is None or len(sd_in) == 0:
        if sd_min == sd_max:
            # Noise level is the same for all nucleotides.
            sd_out = [sd_min for _ in range(N)]
        else:
            # Uniformly sample one noise level per nucleotide.
            sd_out = np.random.uniform(sd_min, sd_max, size=N)
    else:
        # Use the input array of standard deviation values.
        sd_out = sd_in

    values = np.asarray(profile, dtype=float)

    # Transform profile to log domain. Missing (-999) and other negative
    # entries become NaN and zeros become -inf; both are expected here, so
    # the corresponding floating-point warnings are silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        log_profile = np.log(values)

    # Every replicate starts as a copy of the noise-free log profile.
    log_replicates = np.tile(log_profile, (N_reps, 1))

    # Add noise element by element. The loop order (replicate, then
    # nucleotide) and the one-draw-per-noisy-element pattern are kept so
    # that seeded runs consume the random stream in a fixed order.
    for j in range(N_reps):
        for idx in range(N):
            # No draw for noise-free nucleotides or for missing data
            # (-999 in the data domain).
            if sd_out[idx] != 0 and values[idx] > -500:
                log_replicates[j, idx] += np.random.normal(0, sd_out[idx])

    # Revert to data domain. Replace NaN with -999.
    replicates = np.where(np.isnan(log_replicates), -999.0,
                          np.exp(log_replicates))

    return replicates, sd_out

def kf(measurements, prior, R=None, Q=0):
    """Perform a scalar (1D) Kalman filter on input measurements.

    Starting from the prior, each valid measurement is folded in with one
    predict step (variance grows by Q) and one update step (Kalman gain
    K = P / (P + R)).

    Args:
        measurements: Sequence of measurements. inf and NaN measurements
            are ignored, both when estimating R and when filtering.
        prior: (mean, standard_deviation) of Gaussian distribution
            used as prior in KF.
        R: Measurement noise variance. If not provided (None or an empty
            sequence), the sample variance (ddof=1) of the valid
            measurements is used. With fewer than two valid measurements
            that variance is undefined, so R and the resulting estimate
            are NaN.
        Q: Process noise variance.

    Returns:
        x_hat: Final KF estimate.
        P: Variance of final KF estimate.

        If there are no valid measurements, the prior mean and variance
        are returned unchanged. If R is 0, the first valid measurement is
        returned together with R.
    """
    # Initial guess of x is the mean of the prior distribution
    x_hat = prior[0]

    # Initial guess of x variance (variance of the prior)
    P = prior[1] ** 2

    # Keep only usable measurements; NaN/inf would otherwise poison both
    # the variance estimate and the shortcut return below.
    values = np.asarray(measurements, dtype=float).ravel()
    valid = values[np.isfinite(values)]

    if valid.size == 0:
        # Nothing to update with: the estimate stays at the prior.
        return x_hat, P

    # Variance in the measurements. `np.size` is used instead of `R == []`
    # because the latter is an elementwise comparison for NumPy scalars.
    if R is None or np.size(R) == 0:
        R = np.var(valid, ddof=1) if valid.size > 1 else float("nan")

    if R == 0:
        # If there is no variance in the measurements, return measurement.
        return valid[0], R

    # Otherwise, continue with Kalman filter
    for measurement in valid:
        # Predict step
        P = P + Q

        # Kalman gain
        K = P / float(P + R)

        # Update prediction
        x_hat = x_hat + K * (measurement - x_hat)
        P = (1 - K) * P

    return x_hat, P

def run_kf_RNA(replicates, prior, R=None):
    """Perform the Kalman filter on an RNA.

    For each nucleotide, the replicate measurements are log transformed
    and passed to the kf() routine; the resulting estimate is transformed
    back to the data domain.

    Args:
        replicates: [N_reps x N] Array of replicates. N_reps is the number
            of replicates and N the length of the RNA. Input in the data
            domain. Missing values (-999) become NaN in the log domain,
            and kf() skips NaN/inf measurements in its update loop.
        prior: (mean, standard_deviation) of Gaussian distribution
            used as prior in KF. Note KF is run in the log domain so this
            prior is meant to be on the log measurement distribution.
        R: Length-N sequence (list or NumPy array) of per-nucleotide log
            measurement noise levels. If not provided (None or empty),
            kf() estimates the noise from the measurements at each
            nucleotide.
            NOTE(review): each value is forwarded unchanged as kf()'s `R`
            argument, which kf() documents as a *variance*, although this
            parameter has historically been described as a standard
            deviation -- confirm which is intended.

    Returns:
        kf_profile: Length-N array with the filtered profile, in the data
            domain. An empty array is returned for empty input.

    Raises:
        ValueError: If `replicates` is non-empty but not two-dimensional.
    """
    replicates = np.asarray(replicates, dtype=float)
    if replicates.size == 0:
        # No replicates or zero-length RNA: nothing to filter.
        return np.zeros(0)
    if replicates.ndim != 2:
        raise ValueError("replicates must be a 2D [N_reps x N] array")

    # Length of RNA
    N = replicates.shape[1]

    # `R == []` is elementwise for NumPy arrays, so detect "not provided"
    # through None / size instead.
    R_given = R is not None and np.size(R) > 0
    if R_given:
        noise = np.asarray(R, dtype=float).ravel()

    # Log transform all replicates at once. Missing (-999) entries turn
    # into NaN and zeros into -inf, which is expected, so the related
    # floating-point warnings are silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        log_replicates = np.log(replicates)

    # Initialize kf_profile
    kf_profile = np.zeros(N)

    # Run kf() for each nucleotide.
    for idx in range(N):
        measurements_log = log_replicates[:, idx]
        if R_given:
            # Plain Python float keeps kf()'s scalar comparisons simple.
            x_hat, _ = kf(measurements_log, prior, R=float(noise[idx]))
        else:
            # Let kf() fall back to its own default noise estimate.
            x_hat, _ = kf(measurements_log, prior)
        kf_profile[idx] = np.exp(x_hat)

    return kf_profile