astheeggeggs / lshmm

code to run Li and Stephens
MIT License
3 stars 3 forks source link

Discrepant pointer matrices from `forwards_viterbi_hap_naive_low_mem_rescaling` and `forwards_viterbi_hap_low_mem_rescaling` #32

Closed szhan closed 3 months ago

szhan commented 3 months ago

While I was developing some tests, I encountered the following discrepancy in the pointer matrices from forwards_viterbi_hap_naive_low_mem_rescaling and forwards_viterbi_hap_low_mem_rescaling.

import numpy as np
np.set_printoptions(linewidth=1000)

MISSING = -1

def get_emission_probabilities(m, p_mutation, n_alleles):
    # Note that this is different than `set_emission_probabilities` in `api.py`.
    # No scaling.
    e = np.zeros((m, 2))
    for j in range(m):
        e[j, 0] = p_mutation[j] / (n_alleles[j] - 1)
        e[j, 1] = 1 - p_mutation[j]
    return e

# Copied from:
# https://github.com/astheeggeggs/lshmm/blob/f94dd055fc3a68366950a7806d9521232fcef07e/lshmm/vit_haploid.py#L24C1-L37C1
def viterbi_init(n, m, H, s, e, r):
    """Initialise naive, but more space memory efficient implementation of LS viterbi."""
    V_previous = np.zeros(n)
    V = np.zeros(n)
    P = np.zeros((m, n)).astype(np.int64)
    r_n = r / n

    for i in range(n):
        V_previous[i] = (
            1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]) or s[0, 0] == MISSING)]
        )

    return V, V_previous, P, r_n

# Copied from:
# https://github.com/astheeggeggs/lshmm/blob/f94dd055fc3a68366950a7806d9521232fcef07e/lshmm/vit_haploid.py#L115
def forwards_viterbi_hap_naive_low_mem_rescaling(n, m, H, s, e, r):
    """Naive implementation of LS haploid Viterbi algorithm, with reduced memory and rescaling."""
    # Initialise
    V, V_previous, P, r_n = viterbi_init(n, m, H, s, e, r)
    c = np.ones(m)

    for j in range(1, m):
        c[j] = np.amax(V_previous)
        V_previous *= 1 / c[j]
        for i in range(n):
            # Get the vector to maximise over
            v = np.zeros(n)
            for k in range(n):
                v[k] = (
                    e[j, np.int64(np.equal(H[j, i], s[0, j]) or s[0, j] == MISSING)]
                    * V_previous[k]
                )
                if k == i:
                    v[k] *= 1 - r[j] + r_n[j]
                else:
                    v[k] *= r_n[j]
            P[j, i] = np.argmax(v)
            V[i] = v[P[j, i]]

        V_previous = np.copy(V)

    ll = np.sum(np.log10(c)) + np.log10(np.amax(V))

    return V, P, ll

# Copied from:
# https://github.com/astheeggeggs/lshmm/blob/f94dd055fc3a68366950a7806d9521232fcef07e/lshmm/vit_haploid.py#L147
def forwards_viterbi_hap_low_mem_rescaling(n, m, H, s, e, r):
    """LS haploid Viterbi algorithm, with reduced memory and exploits the Markov process structure."""
    # Initialise
    V, V_previous, P, r_n = viterbi_init(n, m, H, s, e, r)
    c = np.ones(m)

    for j in range(1, m):
        argmax = np.argmax(V_previous)
        c[j] = V_previous[argmax]
        V_previous *= 1 / c[j]
        V = np.zeros(n)
        for i in range(n):
            V[i] = V_previous[i] * (1 - r[j] + r_n[j])
            P[j, i] = i
            if V[i] < r_n[j]:
                V[i] = r_n[j]
                P[j, i] = argmax
            V[i] *= e[j, np.int64(np.equal(H[j, i], s[0, j]) or s[0, j] == MISSING)]
        V_previous = np.copy(V)

    ll = np.sum(np.log10(c)) + np.log10(np.max(V))

    return V, P, ll

# Input data
H = np.array([
     #[ 0,  1,  0,  0,  0,  0, -2, -2, -2, -2],   # Ancestor 0
     #[-2, -2, -2, -2,  0,  0,  0,  1,  0,  0],   # Ancestor 1
     #[-2, -2, -2,  0,  0,  0,  0, -2, -2, -2],   # Ancestor 2
     #[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],   # Ancestor 3
     [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1],   # Sample 0
     [ 0,  1,  0,  1,  0,  1,  0,  1,  0,  1],   # Sample 1
     [ 1,  0,  1,  0,  1,  0,  1,  0,  1,  0],   # Sample 2
     [ 0,  0,  1,  1,  0,  0,  1,  1,  0,  0],   # Sample 3
     [ 1,  1,  0,  0,  1,  1,  0,  0,  1,  1],   # Sample 4
]).T

m = H.shape[0]  # Number of sites
n = H.shape[1]  # Number of ref. haplotypes
r = np.zeros(m, dtype=np.float64) + 0.20
p_mutation = np.zeros(m, dtype=np.float64) + 0.10

n_alleles = np.zeros(m, dtype=np.int64) + 2
e = get_emission_probabilities(m, p_mutation, n_alleles)

query = np.array([[ 0,  1,  0,  0,  0,  1,  1,  1,  1,  1]])

# Compare output
V_naive, P_naive, ll_naive = forwards_viterbi_hap_naive_low_mem_rescaling(n, m, H, query, e, r)
V_low_mem, P_low_mem, ll_low_mem = forwards_viterbi_hap_low_mem_rescaling(n, m, H, query, e, r)

assert np.allclose(V_naive, V_low_mem) # Close
assert ll_naive == ll_low_mem # Equal
assert np.array_equal(P_naive, P_low_mem) # Not equal

I got the following pointer matrices:

# P_naive
array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 1, 3, 4],
       [1, 1, 1, 1, 4],
       [0, 1, 2, 1, 4], # Differ
       [1, 1, 1, 3, 4],
       [0, 1, 1, 1, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 1, 3, 1],
       [0, 1, 2, 3, 4]])

# P_low_mem
array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 1, 3, 4],
       [1, 1, 1, 1, 4],
       [1, 1, 2, 1, 4], # Differ
       [1, 1, 1, 3, 4],
       [0, 1, 1, 1, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 1, 3, 1],
       [0, 1, 2, 3, 4]])

I don't quite follow the logic for assigning pointers in the low-memory implementation, so I'm not sure the source of this discrepancy. I think the pointer matrix from the naive implementation is right, but I'm not sure about the other implementation.

szhan commented 3 months ago

I consulted with @astheeggeggs. I checked the log-likelihood of the most likely paths from the two implementations, and they passed np.allclose. So, it seems that in this instance, there exists at least two equally likely paths, and different paths were chosen during the runs.