Open MEllis-github opened 1 year ago
I confirmed that setting WITH_ROOT=1, i.e. running
WITH_ROOT=1 pip install -v --no-cache-dir .
during the image build, succeeds and causes tensornvme check
to pass. Is there any reason not to include this in the official image build?
While the above enables tensornvme check
to pass, running pytest ./tests
still fails for the uring backend
(output below). Checking the build, it appears that uring availability is checked and not found, yet the build does not proceed to install it. The environment was docker.io/hpcaitech/colossalai:0.2.7 with a reinstallation of tensornvme as described above.
============================= test session starts ==============================
platform linux -- Python 3.9.12, pytest-7.3.1, pluggy-1.0.0
rootdir: /workspace/TensorNVMe
collected 9 items
tests/test_adam.py F [ 11%]
tests/test_disk_offloader.py F.F.F.F. [100%]
=================================== FAILURES ===================================
__________________________________ test_adam ___________________________________
@torch.no_grad()
def test_adam():
params = list(gpt2_toy().cpu().parameters())
for _, p in enumerate(params):
if p.grad is None and p.requires_grad:
p.grad = torch.ones_like(p.data, dtype=torch.float) * 0.12345
params_gt = copy.deepcopy(params)
for _, p in enumerate(params_gt):
if p.grad is None and p.requires_grad:
p.grad = torch.ones_like(p.data, dtype=torch.float) * 0.12345
optimizer = Adam(params_gt, 1e-3)
optimizer.step()
test_config = [
{'n_entries': 1, 'backend': None, 'prefetch': 0, 'vecio': False},
{'n_entries': 1, 'backend': 'uring', 'prefetch': 0, 'vecio': False},
{'n_entries': 8, 'backend': 'uring', 'prefetch': 2, 'vecio': False},
{'n_entries': 1, 'backend': 'uring', 'prefetch': 0, 'vecio': True},
{'n_entries': 8, 'backend': 'uring', 'prefetch': 2, 'vecio': True},
{'n_entries': 1, 'backend': 'aio', 'prefetch': 0, 'vecio': False},
{'n_entries': 8, 'backend': 'aio', 'prefetch': 2, 'vecio': False},
{'n_entries': 1, 'backend': 'aio', 'prefetch': 0, 'vecio': True},
{'n_entries': 8, 'backend': 'aio', 'prefetch': 2, 'vecio': True},
]
for i, cfg in enumerate(test_config):
params_test = copy.deepcopy(params)
for _, p in enumerate(params_test):
if p.grad is None and p.requires_grad:
p.grad = torch.ones_like(p.data, dtype=torch.float) * 0.12345
if cfg['backend'] is None:
offloader = None
else:
> offloader = DiskOffloader(
'.', cfg['n_entries'], backend=cfg['backend'])
tests/test_adam.py:221:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <tensornvme.offload.DiskOffloader object at 0x7f2c77f2c720>
dir_name = '.', n_entries = 1, backend = 'uring'
def __init__(self, dir_name: str, n_entries: int = 16, backend: str = 'uring') -> None:
> assert backend in get_backends(
), f'Unsupported backend: {backend}, please install tensornvme with this backend'
E AssertionError: Unsupported backend: uring, please install tensornvme with this backend
/opt/conda/lib/python3.9/site-packages/tensornvme/offload.py:10: AssertionError
_____________________________ test_sync_io[uring] ______________________________
backend = 'uring'
@pytest.mark.parametrize('backend', ['uring', 'aio'])
def test_sync_io(backend):
x = torch.rand(2, 2)
x_copy = x.clone()
> of = DiskOffloader('.', backend=backend)
tests/test_disk_offloader.py:10:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <tensornvme.offload.DiskOffloader object at 0x7f2c77f2c9f0>
dir_name = '.', n_entries = 16, backend = 'uring'
def __init__(self, dir_name: str, n_entries: int = 16, backend: str = 'uring') -> None:
> assert backend in get_backends(
), f'Unsupported backend: {backend}, please install tensornvme with this backend'
E AssertionError: Unsupported backend: uring, please install tensornvme with this backend
/opt/conda/lib/python3.9/site-packages/tensornvme/offload.py:10: AssertionError
_____________________________ test_async_io[uring] _____________________________
backend = 'uring'
@pytest.mark.parametrize('backend', ['uring', 'aio'])
def test_async_io(backend):
x = torch.rand(2, 2)
x_copy = x.clone()
> of = DiskOffloader('.', backend=backend)
tests/test_disk_offloader.py:26:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <tensornvme.offload.DiskOffloader object at 0x7f2c77ed47c0>
dir_name = '.', n_entries = 16, backend = 'uring'
def __init__(self, dir_name: str, n_entries: int = 16, backend: str = 'uring') -> None:
> assert backend in get_backends(
), f'Unsupported backend: {backend}, please install tensornvme with this backend'
E AssertionError: Unsupported backend: uring, please install tensornvme with this backend
/opt/conda/lib/python3.9/site-packages/tensornvme/offload.py:10: AssertionError
___________________________ test_sync_vec_io[uring] ____________________________
backend = 'uring'
@pytest.mark.parametrize('backend', ['uring', 'aio'])
def test_sync_vec_io(backend):
x = torch.rand(2, 2)
y = torch.rand(2, 2, 2)
x_copy = x.clone()
y_copy = y.clone()
> of = DiskOffloader('.', backend=backend)
tests/test_disk_offloader.py:47:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <tensornvme.offload.DiskOffloader object at 0x7f2c77edc0e0>
dir_name = '.', n_entries = 16, backend = 'uring'
def __init__(self, dir_name: str, n_entries: int = 16, backend: str = 'uring') -> None:
> assert backend in get_backends(
), f'Unsupported backend: {backend}, please install tensornvme with this backend'
E AssertionError: Unsupported backend: uring, please install tensornvme with this backend
/opt/conda/lib/python3.9/site-packages/tensornvme/offload.py:10: AssertionError
___________________________ test_async_vec_io[uring] ___________________________
backend = 'uring'
@pytest.mark.parametrize('backend', ['uring', 'aio'])
def test_async_vec_io(backend):
x = torch.rand(2, 2)
y = torch.rand(2, 2, 2)
x_copy = x.clone()
y_copy = y.clone()
> of = DiskOffloader('.', backend=backend)
tests/test_disk_offloader.py:77:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <tensornvme.offload.DiskOffloader object at 0x7f2c77edc630>
dir_name = '.', n_entries = 16, backend = 'uring'
def __init__(self, dir_name: str, n_entries: int = 16, backend: str = 'uring') -> None:
> assert backend in get_backends(
), f'Unsupported backend: {backend}, please install tensornvme with this backend'
E AssertionError: Unsupported backend: uring, please install tensornvme with this backend
/opt/conda/lib/python3.9/site-packages/tensornvme/offload.py:10: AssertionError
=========================== short test summary info ============================
FAILED tests/test_adam.py::test_adam - AssertionError: Unsupported backend: u...
FAILED tests/test_disk_offloader.py::test_sync_io[uring] - AssertionError: Un...
FAILED tests/test_disk_offloader.py::test_async_io[uring] - AssertionError: U...
FAILED tests/test_disk_offloader.py::test_sync_vec_io[uring] - AssertionError...
FAILED tests/test_disk_offloader.py::test_async_vec_io[uring] - AssertionErro...
========================= 5 failed, 4 passed in 2.21s ==========================
🐛 Describe the bug
Description
The official docker images run the TensorNVMe install commands; however, at runtime, executing
cd TensorNVMe && tensornvme check
(or running the training demos that depend on tensornvme) produces:
ImportError: libaio.so.1: cannot open shared object file: No such file or directory

A few key observations from the container runtime:
- LD_LIBRARY_PATH = /usr/local/nvidia/lib:/usr/local/nvidia/lib64
- find / -type d -iname ".tensornvme" -ls does not locate the install directory .tensornvme
- ~ (or $HOME) evaluates to / in the container runtime, so the expected install directory would be /.tensornvme

Environment:
docker.io/hpcaitech/colossalai:0.2.7