# make data: Y depends only on the first dim_X//2 coordinates of X,
# through the mask E, noise N, and a random linear map F.
n = 10000
dim_X = 2
dim_Y = 3
# Build a valid covariance matrix. A raw Gaussian draw is neither
# symmetric nor positive semi-definite, which multivariate_normal
# warns about and silently "repairs"; A @ A.T is symmetric PSD.
A = np.random.randn(dim_X, dim_X)
Cx = A @ A.T
X = np.random.multivariate_normal(np.zeros(dim_X), Cx, n)
# E zeroes out the second half of the X dimensions: only the first
# dim_X//2 coordinates of X actually drive Y.
E = np.eye(dim_X)
E[dim_X // 2:] = 0
N = np.random.randn(n, dim_X)      # additive noise in X-space
F = np.random.randn(dim_X, dim_Y)  # random linear map X-space -> Y-space
Y = (X @ E + N) @ F
# JRR: estimate diag(E) via two-stage split regression, averaged over
# random half/half splits of the data.
n_splits = 10
H = np.zeros((dim_X, dim_X))
for _ in range(n_splits):
    # split data into two disjoint halves via a random permutation
    p = np.random.permutation(n)
    i, j = p[::2], p[1::2]
    # first regression on half i: G maps Y-space back to X-space
    G = pinv(Y[i].T @ Y[i]) @ Y[i].T @ X[i]
    # second regression on the held-out half j: X_j -> Y_j @ G
    H += pinv(X[j].T @ X[j]) @ X[j].T @ Y[j] @ G
H /= n_splits
Ehat = np.diag(H)  # recovered diagonal of the masking matrix E
print(np.diag(E), Ehat)
In practice it seems to stabilize things to simply accumulate the Y_j G terms across the splits, but to invert the second covariance of X once, outside the loop:
# JRR, stabilized variant: accumulate the cross-products X_j^T Y_j G
# across splits and invert the covariance of X only once, outside the
# loop, using the full data set.
n_splits = 10
XYG = np.zeros((dim_X, dim_X))
for _ in range(n_splits):
    # split data into two disjoint halves via a random permutation
    p = np.random.permutation(n)
    i, j = p[::2], p[1::2]
    # first regression on half i: G maps Y-space back to X-space
    G = pinv(Y[i].T @ Y[i]) @ Y[i].T @ X[i]
    # accumulate the second-regression numerator on the held-out half j
    XYG += X[j].T @ Y[j] @ G
XYG /= n_splits
# Factor 2 compensates the sample-size mismatch: each X[j] has only
# n/2 rows, while the Gram matrix X.T @ X is built from all n rows;
# without it Ehat is biased low by a factor of 2.
H = 2 * pinv(X.T @ X) @ XYG
Ehat = np.diag(H)  # recovered diagonal of the masking matrix E
print(np.diag(E), Ehat)
Currently we fully separate the two regressions on each split. In practice it seems to stabilize things to simply accumulate the Y_j G terms across splits while inverting the second covariance of X once, outside the loop. Is it OK to do this? Note one subtlety: each held-out half X_j contains only n/2 samples, whereas the full-data Gram matrix X^T X is built from all n samples, so the pooled estimate must be rescaled by a factor of 2 to stay comparable to the per-split version.