Discrepant pointer matrices from `forwards_viterbi_hap_naive_low_mem_rescaling` and `forwards_viterbi_hap_low_mem_rescaling`

While I was developing some tests, I encountered the following discrepancy in the pointer matrices from forwards_viterbi_hap_naive_low_mem_rescaling and forwards_viterbi_hap_low_mem_rescaling.

import numpy as np
np.set_printoptions(linewidth=1000)

MISSING = -1

def get_emission_probabilities(m, p_mutation, n_alleles):
    # Note that this is different than `set_emission_probabilities` in `api.py`.
    # No scaling.
    e = np.zeros((m, 2))
    for j in range(m):
        e[j, 0] = p_mutation[j] / (n_alleles[j] - 1)
        e[j, 1] = 1 - p_mutation[j]
    return e

# Copied from:
# https://github.com/astheeggeggs/lshmm/blob/f94dd055fc3a68366950a7806d9521232fcef07e/lshmm/vit_haploid.py#L24C1-L37C1
def viterbi_init(n, m, H, s, e, r):
    """Initialise naive, but more space memory efficient implementation of LS viterbi."""
    V_previous = np.zeros(n)
    V = np.zeros(n)
    P = np.zeros((m, n)).astype(np.int64)
    r_n = r / n

    for i in range(n):
        V_previous[i] = (
            1 / n * e[0, np.int64(np.equal(H[0, i], s[0, 0]) or s[0, 0] == MISSING)]
        )

    return V, V_previous, P, r_n

# Copied from:
# https://github.com/astheeggeggs/lshmm/blob/f94dd055fc3a68366950a7806d9521232fcef07e/lshmm/vit_haploid.py#L115
def forwards_viterbi_hap_naive_low_mem_rescaling(n, m, H, s, e, r):
    """Naive implementation of LS haploid Viterbi algorithm, with reduced memory and rescaling."""
    # Initialise
    V, V_previous, P, r_n = viterbi_init(n, m, H, s, e, r)
    c = np.ones(m)

    for j in range(1, m):
        c[j] = np.amax(V_previous)
        V_previous *= 1 / c[j]
        for i in range(n):
            # Get the vector to maximise over
            v = np.zeros(n)
            for k in range(n):
                v[k] = (
                    e[j, np.int64(np.equal(H[j, i], s[0, j]) or s[0, j] == MISSING)]
                    * V_previous[k]
                )
                if k == i:
                    v[k] *= 1 - r[j] + r_n[j]
                else:
                    v[k] *= r_n[j]
            P[j, i] = np.argmax(v)
            V[i] = v[P[j, i]]

        V_previous = np.copy(V)

    ll = np.sum(np.log10(c)) + np.log10(np.amax(V))

    return V, P, ll

# Copied from:
# https://github.com/astheeggeggs/lshmm/blob/f94dd055fc3a68366950a7806d9521232fcef07e/lshmm/vit_haploid.py#L147
def forwards_viterbi_hap_low_mem_rescaling(n, m, H, s, e, r):
    """LS haploid Viterbi algorithm, with reduced memory and exploits the Markov process structure."""
    # Initialise
    V, V_previous, P, r_n = viterbi_init(n, m, H, s, e, r)
    c = np.ones(m)

    for j in range(1, m):
        argmax = np.argmax(V_previous)
        c[j] = V_previous[argmax]
        V_previous *= 1 / c[j]
        V = np.zeros(n)
        for i in range(n):
            V[i] = V_previous[i] * (1 - r[j] + r_n[j])
            P[j, i] = i
            if V[i] < r_n[j]:
                V[i] = r_n[j]
                P[j, i] = argmax
            V[i] *= e[j, np.int64(np.equal(H[j, i], s[0, j]) or s[0, j] == MISSING)]
        V_previous = np.copy(V)

    ll = np.sum(np.log10(c)) + np.log10(np.max(V))

    return V, P, ll

# Input data
H = np.array([
     #[ 0,  1,  0,  0,  0,  0, -2, -2, -2, -2],   # Ancestor 0
     #[-2, -2, -2, -2,  0,  0,  0,  1,  0,  0],   # Ancestor 1
     #[-2, -2, -2,  0,  0,  0,  0, -2, -2, -2],   # Ancestor 2
     #[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],   # Ancestor 3
     [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1],   # Sample 0
     [ 0,  1,  0,  1,  0,  1,  0,  1,  0,  1],   # Sample 1
     [ 1,  0,  1,  0,  1,  0,  1,  0,  1,  0],   # Sample 2
     [ 0,  0,  1,  1,  0,  0,  1,  1,  0,  0],   # Sample 3
     [ 1,  1,  0,  0,  1,  1,  0,  0,  1,  1],   # Sample 4
]).T

m = H.shape[0]  # Number of sites
n = H.shape[1]  # Number of ref. haplotypes
r = np.zeros(m, dtype=np.float64) + 0.20
p_mutation = np.zeros(m, dtype=np.float64) + 0.10

n_alleles = np.zeros(m, dtype=np.int64) + 2
e = get_emission_probabilities(m, p_mutation, n_alleles)

query = np.array([[ 0,  1,  0,  0,  0,  1,  1,  1,  1,  1]])

# Compare output
V_naive, P_naive, ll_naive = forwards_viterbi_hap_naive_low_mem_rescaling(n, m, H, query, e, r)
V_low_mem, P_low_mem, ll_low_mem = forwards_viterbi_hap_low_mem_rescaling(n, m, H, query, e, r)

assert np.allclose(V_naive, V_low_mem) # Close
assert ll_naive == ll_low_mem # Equal
assert np.array_equal(P_naive, P_low_mem) # Not equal

I got the following pointer matrices:

# P_naive
array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 1, 3, 4],
       [1, 1, 1, 1, 4],
       [0, 1, 2, 1, 4], # Differ
       [1, 1, 1, 3, 4],
       [0, 1, 1, 1, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 1, 3, 1],
       [0, 1, 2, 3, 4]])

# P_low_mem
array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 1, 3, 4],
       [1, 1, 1, 1, 4],
       [1, 1, 2, 1, 4], # Differ
       [1, 1, 1, 3, 4],
       [0, 1, 1, 1, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 1, 3, 1],
       [0, 1, 2, 3, 4]])

I don't quite follow the logic for assigning pointers in the low-memory implementation, so I'm not sure the source of this discrepancy. I think the pointer matrix from the naive implementation is right, but I'm not sure about the other implementation.

astheeggeggs / lshmm

Discrepant pointer matrices from `forwards_viterbi_hap_naive_low_mem_rescaling` and `forwards_viterbi_hap_low_mem_rescaling` #32