dmlc / dgl

Python package built to ease deep learning on graph, on top of existing DL frameworks.
http://dgl.ai
Apache License 2.0
13.23k stars 2.99k forks source link

[Bug] Hetero sampling on CPU #7368

Closed mfbalin closed 2 months ago

mfbalin commented 2 months ago

🔨Work Item

IMPORTANT:

Project tracker: https://github.com/orgs/dmlc/projects/2

Description

The heterogeneous (hetero) sampling example fails on CPU in the regression test:

image

mfbalin commented 2 months ago
Traceback (most recent call last):
  File "/home/mfbalin/dgl-1/examples/sampling/graphbolt/lightning/../rgcn/hetero_rgcn.py", line 667, in <module>
    main(args)
  File "/home/mfbalin/dgl-1/examples/sampling/graphbolt/lightning/../rgcn/hetero_rgcn.py", line 624, in main
    train(
  File "/home/mfbalin/dgl-1/examples/sampling/graphbolt/lightning/../rgcn/hetero_rgcn.py", line 511, in train
    for data in tqdm(data_loader, desc=f"Training~Epoch {epoch + 1:02d}"):
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/tqdm/std.py", line 1181, in __iter__
    for obj in iterable:
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 629, in __next__
    data = self._next_data()
           ^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 672, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py", line 41, in fetch
    data = next(self.dataset_iter)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 150, in __next__
    return self._get_next()
           ^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 138, in _get_next
    result = next(self.iterator)
             ^^^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 222, in wrap_next
    result = next_func(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/datapipe.py", line 383, in __next__
    return next(self._datapipe_iter)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 179, in wrap_generator
    response = gen.send(None)
               ^^^^^^^^^^^^^^
  File "/home/mfbalin/dgl-1/python/dgl/graphbolt/base.py", line 269, in __iter__
    yield from self.datapipe
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 179, in wrap_generator
    response = gen.send(None)
               ^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/iter/callable.py", line 124, in __iter__
    for data in self.datapipe:
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 179, in wrap_generator
    response = gen.send(None)
               ^^^^^^^^^^^^^^
  File "/home/mfbalin/dgl-1/python/dgl/graphbolt/dataloader.py", line 68, in __iter__
    yield from self.dataloader
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 629, in __next__
    data = self._next_data()
           ^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 672, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py", line 41, in fetch
    data = next(self.dataset_iter)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 150, in __next__
    return self._get_next()
           ^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 138, in _get_next
    result = next(self.iterator)
             ^^^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 222, in wrap_next
    result = next_func(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/datapipe.py", line 383, in __next__
    return next(self._datapipe_iter)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 179, in wrap_generator
    response = gen.send(None)
               ^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/iter/callable.py", line 124, in __iter__
    for data in self.datapipe:
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 179, in wrap_generator
    response = gen.send(None)
               ^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/iter/callable.py", line 124, in __iter__
    for data in self.datapipe:
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 179, in wrap_generator
    response = gen.send(None)
               ^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/iter/callable.py", line 124, in __iter__
    for data in self.datapipe:
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 179, in wrap_generator
    response = gen.send(None)
               ^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/iter/callable.py", line 124, in __iter__
    for data in self.datapipe:
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 179, in wrap_generator
    response = gen.send(None)
               ^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/iter/callable.py", line 124, in __iter__
    for data in self.datapipe:
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 179, in wrap_generator
    response = gen.send(None)
               ^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/iter/callable.py", line 124, in __iter__
    for data in self.datapipe:
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/_hook_iterator.py", line 179, in wrap_generator
    response = gen.send(None)
               ^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/iter/callable.py", line 125, in __iter__
    yield self._apply_fn(data)
          ^^^^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/.venvs/venv/lib/python3.12/site-packages/torch/utils/data/datapipes/iter/callable.py", line 90, in _apply_fn
    return self.fn(data)
           ^^^^^^^^^^^^^
  File "/home/mfbalin/dgl-1/python/dgl/graphbolt/minibatch_transformer.py", line 38, in _transformer
    minibatch = self.transformer(minibatch)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/dgl-1/python/dgl/graphbolt/impl/neighbor_sampler.py", line 172, in _sample_per_layer
    subgraph = self.sampler(
               ^^^^^^^^^^^^^
  File "/home/mfbalin/dgl-1/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py", line 705, in sample_neighbors
    return self._convert_to_sampled_subgraph(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mfbalin/dgl-1/python/dgl/graphbolt/impl/fused_csc_sampling_graph.py", line 600, in _convert_to_sampled_subgraph
    etype: CSCFormatBase(
           ^^^^^^^^^^^^^^
  File "/home/mfbalin/dgl-1/python/dgl/graphbolt/base.py", line 352, in __init__
    assert self.indptr[-1] == len(
           ^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: The last element of indptr should be the same as the length of indices.
This exception is thrown by __iter__ of SamplePerLayer(datapipe=MiniBatchTransformer, fanout=tensor([10, 10, 10, 10, 10, 10, 10]), prob_name=None, replace=False, sampler=<bound method FusedCSCSamplingGraph.sample_neighbors of FusedCSCSamplingGraph(csc_indptr=tensor([       0,        5,        9,  ..., 42221926, 42221939, 42222014],
                             dtype=torch.int32),
                      indices=tensor([1195459, 1223057, 1492639,  ..., 1625065, 1630693, 1643218],
                             dtype=torch.int32),
                      total_num_nodes=1939743, num_edges={'author:affiliated_with:institution': 1043998, 'author:writes:paper': 7145660, 'field_of_study:rev_has_topic:paper': 7505078, 'institution:rev_affiliated_with:author': 1043998, 'paper:cites:paper': 10832542, 'paper:has_topic:field_of_study': 7505078, 'paper:rev_writes:author': 7145660},
                      node_type_offset=tensor([      0, 1134649, 1194614, 1203354, 1939743], dtype=torch.int32),
                      type_per_edge=tensor([3, 6, 6,  ..., 4, 4, 4], dtype=torch.uint8),
                      node_type_to_id={'author': 0, 'field_of_study': 1, 'institution': 2, 'paper': 3},
                      edge_type_to_id={'author:affiliated_with:institution': 0, 'author:writes:paper': 1, 'field_of_study:rev_has_topic:paper': 2, 'institution:rev_affiliated_with:author': 3, 'paper:cites:paper': 4, 'paper:has_topic:field_of_study': 5, 'paper:rev_writes:author': 6},)>)
mfbalin commented 2 months ago

Printing `indptr[-1]` and `len(indices)` (the two values compared by the failing assertion):

tensor(0, dtype=torch.int32) 0
tensor(4492, dtype=torch.int32) 4492
tensor(9900, dtype=torch.int32) 9876
mfbalin commented 2 months ago

I think it is an off-by-one (+1/-1) bug.