Open ClaudiaComito opened 9 months ago
Branch bugs/1289-_Bug_CI_matrix_fails created!
Some configurations of the test matrix complete with AssertionErrors. After removing `test_rand`, `test_randint`, and `test_randn`, all jobs complete.
=================================== FAILURES ===================================
_____________________________ TestRandom.test_rand _____________________________
self = <heat.core.tests.test_random.TestRandom testMethod=test_rand>
def test_rand(self):
# int64 tests
# Resetting seed works
seed = 12345
ht.random.seed(seed)
a = ht.random.rand(2, 5, 7, 3, split=0)
self.assertEqual(a.dtype, ht.float32)
self.assertEqual(a.larray.dtype, torch.float32)
b = ht.random.rand(2, 5, 7, 3, split=0)
self.assertFalse(ht.equal(a, b))
ht.random.seed(seed)
c = ht.random.rand(2, 5, 7, 3, dtype=ht.float32, split=0)
self.assertTrue(ht.equal(a, c))
# Random numbers with overflow
ht.random.set_state(("Threefry", seed, 0xFFFFFFFFFFFFFFF0))
a = ht.random.rand(2, 3, 4, 5, split=0)
ht.random.set_state(("Threefry", seed, 0x10000000000000000))
b = ht.random.rand(2, 44, split=0)
a = a.numpy().flatten()
b = b.numpy().flatten()
self.assertEqual(a.dtype, np.float32)
self.assertTrue(np.array_equal(a[32:], b))
# Check that random numbers don't repeat after first overflow
seed = 12345
ht.random.set_state(("Threefry", seed, 0x100000000))
a = ht.random.rand(2, 44)
ht.random.seed(seed)
b = ht.random.rand(2, 44)
self.assertFalse(ht.equal(a, b))
# Check that we start from beginning after 128 bit overflow
ht.random.seed(seed)
a = ht.random.rand(2, 34, split=0)
ht.random.set_state(("Threefry", seed, 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0))
b = ht.random.rand(2, 50, split=0)
a = a.numpy().flatten()
b = b.numpy().flatten()
self.assertTrue(np.array_equal(a, b[32:]))
# different split axis with resetting seed
ht.random.seed(seed)
a = ht.random.rand(3, 5, 2, 9, split=3)
ht.random.seed(seed)
c = ht.random.rand(3, 5, 2, 9, split=3)
self.assertTrue(ht.equal(a, c))
# Random values are in correct order
ht.random.seed(seed)
a = ht.random.rand(2, 50, split=0)
ht.random.seed(seed)
b = ht.random.rand(100, split=None)
a = a.numpy().flatten()
b = b.larray.cpu().numpy()
self.assertTrue(np.array_equal(a, b))
# On different shape and split the same random values are used
ht.random.seed(seed)
a = ht.random.rand(3, 5, 2, 9, split=3)
ht.random.seed(seed)
b = ht.random.rand(30, 9, split=1)
a = np.sort(a.numpy().flatten())
b = np.sort(b.numpy().flatten())
self.assertTrue(np.array_equal(a, b))
# One large array does not have two similar values
a = ht.random.rand(11, 15, 3, 7, split=2)
a = a.numpy()
_, counts = np.unique(a, return_counts=True)
# Assert that no value appears more than once
self.assertTrue((counts == 1).all())
# Two large arrays that were created after each other don't share any values
b = ht.random.rand(14, 7, 3, 12, 18, 42, split=5, comm=ht.MPI_WORLD, dtype=ht.float64)
c = np.concatenate((a.flatten(), b.numpy().flatten()))
_, counts = np.unique(c, return_counts=True)
> self.assertTrue((counts == 1).all())
E AssertionError: False is not true
heat/core/tests/test_random.py:166: AssertionError
___________________________ TestRandom.test_randint ____________________________
self = <heat.core.tests.test_random.TestRandom testMethod=test_randint>
def test_randint(self):
# Checked that the random values are in the correct range
a = ht.random.randint(low=0, high=10, size=(10, 10), dtype=ht.int64)
self.assertEqual(a.dtype, ht.int64)
a = a.numpy()
self.assertTrue(((0 <= a) & (a < 10)).all())
a = ht.random.randint(low=100000, high=150000, size=(31, 25, 11), dtype=ht.int64, split=2)
a = a.numpy()
self.assertTrue(((100000 <= a) & (a < 150000)).all())
# For the range [0, 1) only the value 0 is allowed
a = ht.random.randint(1, size=(10,), split=0, dtype=ht.int64)
b = ht.zeros((10,), dtype=ht.int64, split=0)
self.assertTrue(ht.equal(a, b))
# size parameter allows int arguments
a = ht.random.randint(1, size=10, split=0, dtype=ht.int64)
self.assertTrue(ht.equal(a, b))
# size is None
a = ht.random.randint(0, 10)
self.assertEqual(a.shape, ())
# Two arrays with the same seed and same number of elements have the same random values
ht.random.seed(13579)
shape = (15, 13, 9, 21, 65)
a = ht.random.randint(15, 100, size=shape, split=0, dtype=ht.int64)
a = a.numpy().flatten()
ht.random.seed(13579)
elements = np.prod(shape)
b = ht.random.randint(low=15, high=100, size=(elements,), dtype=ht.int64)
b = b.numpy()
self.assertTrue(np.array_equal(a, b))
# Two arrays with the same seed and shape have identical values
ht.random.seed(13579)
a = ht.random.randint(10000, size=shape, split=2, dtype=ht.int64)
a = a.numpy()
ht.random.seed(13579)
b = ht.random.randint(low=0, high=10000, size=shape, split=2, dtype=ht.int64)
b = b.numpy()
ht.random.seed(13579)
c = ht.random.randint(low=0, high=10000, dtype=ht.int64)
self.assertTrue(np.equal(b[0, 0, 0, 0, 0], c))
self.assertTrue(np.array_equal(a, b))
mean = np.mean(a)
median = np.median(a)
std = np.std(a)
# Mean and median should be in the center while the std is very high due to an even distribution
self.assertTrue(4900 < mean < 5100)
self.assertTrue(4900 < median < 5100)
self.assertTrue(std < 2900)
with self.assertRaises(ValueError):
ht.random.randint(5, 5, size=(10, 10), split=0)
with self.assertRaises(ValueError):
ht.random.randint(low=0, high=10, size=(3, -4))
with self.assertRaises(ValueError):
ht.random.randint(low=0, high=10, size=(15,), dtype=ht.float32)
# int32 tests
ht.random.seed(4545)
a = ht.random.randint(50, 1000, size=(13, 45), dtype=ht.int32, split=0)
ht.random.set_state(("Threefry", 4545, 0x10000000000000000))
b = ht.random.randint(50, 1000, size=(13, 45), dtype=ht.int32, split=0)
self.assertEqual(a.dtype, ht.int32)
self.assertEqual(a.larray.dtype, torch.int32)
self.assertEqual(b.dtype, ht.int32)
a = a.numpy()
b = b.numpy()
self.assertEqual(a.dtype, np.int32)
self.assertTrue(np.array_equal(a, b))
self.assertTrue(((50 <= a) & (a < 1000)).all())
self.assertTrue(((50 <= b) & (b < 1000)).all())
c = ht.random.randint(50, 1000, size=(13, 45), dtype=ht.int32, split=0)
c = c.numpy()
self.assertFalse(np.array_equal(a, c))
self.assertFalse(np.array_equal(b, c))
self.assertTrue(((50 <= c) & (c < 1000)).all())
ht.random.seed(0xFFFFFFF)
a = ht.random.randint(
10000, size=(123, 42, 13, 21), split=3, dtype=ht.int32, comm=ht.MPI_WORLD
)
a = a.numpy()
mean = np.mean(a)
median = np.median(a)
std = np.std(a)
# Mean and median should be in the center while the std is very high due to an even distribution
> self.assertTrue(4900 < mean < 5100)
E AssertionError: False is not true
heat/core/tests/test_random.py:336: AssertionError
____________________________ TestRandom.test_randn _____________________________
self = <heat.core.tests.test_random.TestRandom testMethod=test_randn>
def test_randn(self):
# Test that the random values have the correct distribution
ht.random.seed(54321)
shape = (5, 10, 13, 23, 15, 20)
a = ht.random.randn(*shape, split=0, dtype=ht.float64)
self.assertEqual(a.dtype, ht.float64)
a = a.numpy()
mean = np.mean(a)
median = np.median(a)
std = np.std(a)
self.assertTrue(-0.01 < mean < 0.01)
self.assertTrue(-0.01 < median < 0.01)
self.assertTrue(0.99 < std < 1.01)
# Compare to a second array with a different shape but same number of elements and same seed
ht.random.seed(54321)
elements = np.prod(shape)
b = ht.random.randn(elements, split=0, dtype=ht.float64)
b = b.numpy()
a = a.flatten()
self.assertTrue(np.allclose(a, b))
# Creating the same array two times without resetting seed results in different elements
c = ht.random.randn(elements, split=0, dtype=ht.float64)
c = c.numpy()
self.assertEqual(c.shape, b.shape)
self.assertFalse(np.allclose(b, c))
# All the created values should be different
d = np.concatenate((b, c))
_, counts = np.unique(d, return_counts=True)
self.assertTrue((counts == 1).all())
# Two arrays are the same for same seed and split-axis != 0
ht.random.seed(12345)
a = ht.random.randn(*shape, split=5, dtype=ht.float64)
ht.random.seed(12345)
b = ht.random.randn(*shape, split=5, dtype=ht.float64)
self.assertTrue(ht.equal(a, b))
a = a.numpy()
b = b.numpy()
> self.assertTrue(np.allclose(a, b))
E AssertionError: False is not true
heat/core/tests/test_random.py:388: AssertionError
Interestingly, all failing tests involve calls to NumPy (?)
This correlation exists because we use NumPy for calculating the test results here. We didn't have, or still don't have, some of its functionalities. However, it might be an MPI / memory issue, as we convert large tensors here. The value of `counts` in the first instance is `[5297 1 1 ... 1 1 1]` on the jobs that run through. The other configurations error with a segmentation fault two lines earlier, on the call to the `numpy()` method.
On the other hand, all runs succeed when switching to mpich except for python 3.9 where it hangs before the tests even start.
Thanks for looking into this. Has the OpenMPI version changed with respect to the last tests that ran through?
As for `test_rand`, the problem arises in the `numpy()` call (involving `Allgatherv`) of a DNDarray with split > 3.
# Two large arrays that were created after each other don't share any values
b = ht.random.rand(14, 7, 3, 12, 18, 42, split=5, comm=ht.MPI_WORLD, dtype=ht.float64)
c = np.concatenate((a.flatten(), b.numpy().flatten()))
The random values are actually unique and all counts are 1 locally before the `numpy()` call; afterwards, thousands of values are 0 or 1e-322 or similar. This doesn't occur and the test passes if `b.split <= 3`.
UPDATE 16 Dec 2023: The problem does not occur if the gathered array is small. Ongoing work:
Note that this action is using runner version 20231205.1 vs. the last time the matrix passed (20231115.7).
I haven't had time to research which of the updated libraries in the new runner might lead to this behaviour. OpenMPI and mpi4py versions are exactly the same.
This issue was introduced in the latest version 1.2.1 of setup-mpi action. We set it back to 1.2.0 for now.
I have tried the tests on the GPU-partition of Terrabyte with CUDA 11.5.0, OpenMPI 4.1.2, and got:
test_rand (heat.core.tests.test_random.TestRandom) ... [hpdar01c05s02:31610:0:31610] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x15544208cff0)
test_rand (heat.core.tests.test_random.TestRandom) ... FAIL
======================================================================
FAIL: test_rand (heat.core.tests.test_random.TestRandom)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/dss/dsshome1/03/di93zek/heat/heat/core/tests/test_random.py", line 166, in test_rand
self.assertTrue((counts == 1).all())
AssertionError: False is not true
----------------------------------------------------------------------
Ran 283 tests in 38.628s
FAILED (failures=1, skipped=5)
and similar on the CPU-partition:
test_rand (heat.core.tests.test_random.TestRandom) ... [hpdar03c02s12:65255:0:65255] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x1484b4021088)
test_rand (heat.core.tests.test_random.TestRandom) ... FAIL
======================================================================
FAIL: test_rand (heat.core.tests.test_random.TestRandom)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/dss/dsshome1/03/di93zek/heat/heat/core/tests/test_random.py", line 166, in test_rand
self.assertTrue((counts == 1).all())
AssertionError: False is not true
----------------------------------------------------------------------
Ran 283 tests in 36.532s
FAILED (failures=1, skipped=5)
This indeed seems to be some MPI error (?)... Interestingly, this problem never occurred before, and I did not change the setup in any way.
Some updates:
ht.ones(...)
and .numpy()
:
import heat as ht
import numpy as np
shape = (1825,10,10)
split=2
X = ht.ones(shape,dtype=ht.float32,device="gpu", split=split)
print(X.comm.rank, X.larray.device)
Y = X.numpy()
Y_true = np.ones(shape)
print(X.comm.rank,(Y==Y_true).all())
* I can reproduce the bug **on GPU and CPU** as long as I use **at least two nodes** (e.g., 2 nodes with 1 task per node for the above example)
* the above configuration is roughly the smallest one I could find; for shape=(1800,10,10) the problem did not appear
* I did not find configurations with ndim <=2 or split <=1 that were able to reproduce the bug; in particular, for the same shape=(1825,10,10) and split=1 the problem does not appear
* note that all process-local chunks of data have the same size in the example (10 is divisible by the number of processes), which should rule out that the error is caused by differing lshapes
* when using float64 instead of float32, shape=(912,10,10) was able to reproduce the bug and shape=(900,10,10) was not; therefore, the bug seems to be related to the size of the underlying data in terms of memory rather than in terms of shape
The actual error seems to happen inside `Allgatherv`:
* `.numpy()` consists of a `.resplit_(axis=None)` and application of `.cpu().numpy()` to the resulting `larray`s of the unsplit tensor
* `.resplit_(axis=None)` essentially performs an `Allgatherv`.
What happened?
The Python version vs. PyTorch version cross-checks we run after a PR is approved are failing. Something has changed in the job setup that we haven't been able to pinpoint yet.
Code snippet triggering the error
Error message or erroneous outcome
Version
1.3.x
Python version
3.10
PyTorch version
1.13
MPI version