Describe the bug
As titled, only around 5 datasets currently pass the loading test. Possibly because of https://github.com/Graph-Learning-Benchmarks/gli/pull/325; the captured UserWarning near the end of the log (the Google Drive download returned an HTML "Sorry..." page instead of the .npz file) may be related.

To Reproduce
For complete testing:
pytest tests/test_data_loading.py
For single dataset testing (pubmed below can be replaced with any other dataset name):
pytest tests/test_data_loading.py -k pubmed

The error message for pubmed:
================================================== FAILURES ===================================================
__________________________________________ test_data_loading[pubmed] __________________________________________
dataset_name = 'pubmed'
    @pytest.mark.parametrize("dataset_name", find_datasets())
    def test_data_loading(dataset_name):
        """Test data loading for a given dataset.

        Test if get_gli_graph, get_gli_task, and get_gli_dataset
        can be applied successfully.
        """
        # temporary skipping all large datasets
        dataset = dataset_name
        test_cfg = load_config_file("tests/config.yaml")
        if dataset in test_cfg["large_dataset_to_skip"]:
            return
        directory = os.getcwd() + "/datasets/" + dataset
        task_list = []
        for file in os.listdir(directory):
            if fnmatch.fnmatch(file, "task*.json"):
                with open(directory + "/" + file, encoding="utf-8") as f:
                    task_dict = json.load(f)
                    if task_dict["type"] not in SUPPORTED_TASK_TYPES:
                        f.close()
                        return
                    task_list.append(task_dict["type"])
        try:
>           _ = gli.dataloading.get_gli_graph(dataset)

tests/test_data_loading.py:35:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
dataset = 'pubmed', device = 'cpu', verbose = False
    def get_gli_graph(dataset: str,
                      device: str = "cpu",
                      verbose: bool = False) -> Union[DGLGraph, List[DGLGraph]]:
        # pylint: disable=line-too-long
        """Get a GLI graph object, or a list of GLI graph objects.

        If the metadata defines multiple subgraphs on the dataset, the returned
        value is a list rather than a single graph.

        Parameters
        ----------
        dataset : str
            Dataset/Graph name
        device : str, optional
            Task type, by default "cpu"
        verbose : bool, optional
            Verbose level, by default False

        Returns
        -------
        Union[DGLGraph, List[DGLGraph]]
            Graph dataset instance

        Raises
        ------
        FileNotFoundError
            Raised when metadata/task configuration file is not found.

        Examples
        --------
        >>> g = get_gli_graph(dataset="cora")
        >>> g
        Graph(num_nodes=2708, num_edges=10556,
              ndata_schemes={'NodeFeature': Scheme(shape=(1433,), dtype=torch.float32), 'NodeLabel': Scheme(shape=(), dtype=torch.int64)}
              edata_schemes={})

        Notes
        -----
        The returned graph(s) is essentially DGLGraph with extra attributes defined
        by GLI.
        """  # noqa: E501
        data_dir = os.path.join(ROOT_PATH, "datasets/", dataset)
        metadata_path = os.path.join(data_dir, "metadata.json")
        if not os.path.isdir(data_dir):
            raise FileNotFoundError(f"{data_dir} not found.")
        if not os.path.exists(metadata_path):
            raise FileNotFoundError(f"{metadata_path} not found.")
        download_data(dataset, verbose=verbose)
>       return read_gli_graph(metadata_path, device=device, verbose=verbose)

gli/dataloading.py:141:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
metadata_path = '/Users/jinhuang/Documents/research/gli/datasets/pubmed/metadata.json', device = 'cpu'
verbose = False
    def read_gli_graph(metadata_path: os.PathLike, device="cpu", verbose=True):
        """Initialize and return a Graph instance given metadata.json."""
        pwd = os.path.dirname(metadata_path)
        with open(metadata_path, "r", encoding="utf-8") as fptr:
            metadata = json.load(fptr)
        if verbose:
            print(metadata["description"])
        assert _is_hetero_graph(metadata) == metadata[
            "is_heterogeneous"], "is_heterogeneous attribute is inconsistent"
        hetero = metadata["is_heterogeneous"]
        name = metadata["description"]
        assert "data" in metadata, "attribute `data` not in metadata.json."
        for neg in ["Node", "Edge", "Graph"]:
            assert neg in metadata[
                "data"], f"attribute `{neg}` not in metadata.json"
        data = copy(metadata["data"])
>       data = _dfs_read_file(pwd, data, device="cpu")

gli/graph.py:40:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pwd = '/Users/jinhuang/Documents/research/gli/datasets/pubmed'
d = {'Edge': {'_Edge': {'file': 'pubmed.npz', 'key': 'edge'}}, 'Graph': {'_EdgeList': {'file': 'pubmed.npz', 'key': 'edge_...bels of Pubmed dataset, int ranged from 1 to 3.', 'file': 'pubmed.npz', 'format': 'Tensor', 'key': 'node_class', ...}}}
device = 'cpu'
    def _dfs_read_file(pwd, d, device="cpu"):
        """Read file efficiently."""
>       return _dfs_read_file_helper(pwd, d, device)

gli/graph.py:223:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pwd = '/Users/jinhuang/Documents/research/gli/datasets/pubmed'
d = {'Edge': {'_Edge': {'file': 'pubmed.npz', 'key': 'edge'}}, 'Graph': {'_EdgeList': {'file': 'pubmed.npz', 'key': 'edge_...bels of Pubmed dataset, int ranged from 1 to 3.', 'file': 'pubmed.npz', 'format': 'Tensor', 'key': 'node_class', ...}}}
device = 'cpu'
    def _dfs_read_file_helper(pwd, d, device="cpu"):
        """Read file recursively (helper of `_dfs_read_file`)."""
        if "file" in d:
            path = os.path.join(pwd, d["file"])
            return load_data(path, d.get("key"), device)
        empty_keys = []
        for k in d:
>           entry = _dfs_read_file_helper(pwd, d[k], device=device)

gli/graph.py:234:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pwd = '/Users/jinhuang/Documents/research/gli/datasets/pubmed'
d = {'NodeFeature': {'description': 'Node features of Pubmed dataset.', 'file': 'pubmed.npz', 'format': 'SparseTensor', 'k...abels of Pubmed dataset, int ranged from 1 to 3.', 'file': 'pubmed.npz', 'format': 'Tensor', 'key': 'node_class', ...}}
device = 'cpu'
    def _dfs_read_file_helper(pwd, d, device="cpu"):
        """Read file recursively (helper of `_dfs_read_file`)."""
        if "file" in d:
            path = os.path.join(pwd, d["file"])
            return load_data(path, d.get("key"), device)
        empty_keys = []
        for k in d:
>           entry = _dfs_read_file_helper(pwd, d[k], device=device)

gli/graph.py:234:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pwd = '/Users/jinhuang/Documents/research/gli/datasets/pubmed'
d = {'description': 'Node features of Pubmed dataset.', 'file': 'pubmed.npz', 'format': 'SparseTensor', 'key': 'node_feats', ...}
device = 'cpu'
    def _dfs_read_file_helper(pwd, d, device="cpu"):
        """Read file recursively (helper of `_dfs_read_file`)."""
        if "file" in d:
            path = os.path.join(pwd, d["file"])
>           return load_data(path, d.get("key"), device)

gli/graph.py:230:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
path = '/Users/jinhuang/Documents/research/gli/datasets/pubmed/pubmed.npz', key = 'node_feats', device = 'cpu'
    def load_data(path, key=None, device="cpu"):
        """Load data from npy or npz file, return sparse array or torch tensor.

        Parameters
        ----------
        path : str
            Path to data file
        key : str, optional
            by default None
        device : str, optional
            by default "cpu"

        Returns
        -------
        torch.Tensor or scipy.sparse.matrix

        Raises
        ------
        TypeError
            Unrecognized file extension
        """
        _, ext = os.path.splitext(path)
        if ext not in (".npz", ".npy"):
            raise TypeError(f"Invalid file extension {ext}.")
        if path.endswith(".sparse.npz"):
            # Sparse matrix
            assert key is None, "Sparse format cannot contain key."
            return sp.load_npz(path)
        # Dense arrays file with a key
>       raw = np.load(path, allow_pickle=False)

gli/utils.py:177:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
file = '/Users/jinhuang/Documents/research/gli/datasets/pubmed/pubmed.npz', mmap_mode = None
allow_pickle = False, fix_imports = True, encoding = 'ASCII'
    @set_module('numpy')
    def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
             encoding='ASCII'):
        """
        Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files.

        .. warning:: Loading files that contain object arrays uses the ``pickle``
                     module, which is not secure against erroneous or maliciously
                     constructed data. Consider passing ``allow_pickle=False`` to
                     load data that is known not to contain object arrays for the
                     safer handling of untrusted sources.

        Parameters
        ----------
        file : file-like object, string, or pathlib.Path
            The file to read. File-like objects must support the
            ``seek()`` and ``read()`` methods and must always
            be opened in binary mode. Pickled files require that the
            file-like object support the ``readline()`` method as well.
        mmap_mode : {None, 'r+', 'r', 'w+', 'c'}, optional
            If not None, then memory-map the file, using the given mode (see
            `numpy.memmap` for a detailed description of the modes). A
            memory-mapped array is kept on disk. However, it can be accessed
            and sliced like any ndarray. Memory mapping is especially useful
            for accessing small fragments of large files without reading the
            entire file into memory.
        allow_pickle : bool, optional
            Allow loading pickled object arrays stored in npy files. Reasons for
            disallowing pickles include security, as loading pickled data can
            execute arbitrary code. If pickles are disallowed, loading object
            arrays will fail. Default: False

            .. versionchanged:: 1.16.3
                Made default False in response to CVE-2019-6446.

        fix_imports : bool, optional
            Only useful when loading Python 2 generated pickled files on Python 3,
            which includes npy/npz files containing object arrays. If `fix_imports`
            is True, pickle will try to map the old Python 2 names to the new names
            used in Python 3.
        encoding : str, optional
            What encoding to use when reading Python 2 strings. Only useful when
            loading Python 2 generated pickled files in Python 3, which includes
            npy/npz files containing object arrays. Values other than 'latin1',
            'ASCII', and 'bytes' are not allowed, as they can corrupt numerical
            data. Default: 'ASCII'

        Returns
        -------
        result : array, tuple, dict, etc.
            Data stored in the file. For ``.npz`` files, the returned instance
            of NpzFile class must be closed to avoid leaking file descriptors.

        Raises
        ------
        OSError
            If the input file does not exist or cannot be read.
        UnpicklingError
            If ``allow_pickle=True``, but the file cannot be loaded as a pickle.
        ValueError
            The file contains an object array, but ``allow_pickle=False`` given.

        See Also
        --------
        save, savez, savez_compressed, loadtxt
        memmap : Create a memory-map to an array stored in a file on disk.
        lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file.

        Notes
        -----
        - If the file contains pickle data, then whatever object is stored
          in the pickle is returned.
        - If the file is a ``.npy`` file, then a single array is returned.
        - If the file is a ``.npz`` file, then a dictionary-like object is
          returned, containing ``{filename: array}`` key-value pairs, one for
          each file in the archive.
        - If the file is a ``.npz`` file, the returned value supports the
          context manager protocol in a similar fashion to the open function::

              with load('foo.npz') as data:
                  a = data['a']

          The underlying file descriptor is closed when exiting the 'with'
          block.

        Examples
        --------
        Store data to disk, and load it again:

        >>> np.save('/tmp/123', np.array([[1, 2, 3], [4, 5, 6]]))
        >>> np.load('/tmp/123.npy')
        array([[1, 2, 3],
               [4, 5, 6]])

        Store compressed data to disk, and load it again:

        >>> a=np.array([[1, 2, 3], [4, 5, 6]])
        >>> b=np.array([1, 2])
        >>> np.savez('/tmp/123.npz', a=a, b=b)
        >>> data = np.load('/tmp/123.npz')
        >>> data['a']
        array([[1, 2, 3],
               [4, 5, 6]])
        >>> data['b']
        array([1, 2])
        >>> data.close()

        Mem-map the stored array, and then access the second row
        directly from disk:

        >>> X = np.load('/tmp/123.npy', mmap_mode='r')
        >>> X[1, :]
        memmap([4, 5, 6])
        """
        if encoding not in ('ASCII', 'latin1', 'bytes'):
            # The 'encoding' value for pickle also affects what encoding
            # the serialized binary data of NumPy arrays is loaded
            # in. Pickle does not pass on the encoding information to
            # NumPy. The unpickling code in numpy.core.multiarray is
            # written to assume that unicode data appearing where binary
            # should be is in 'latin1'. 'bytes' is also safe, as is 'ASCII'.
            #
            # Other encoding values can corrupt binary data, and we
            # purposefully disallow them. For the same reason, the errors=
            # argument is not exposed, as values other than 'strict'
            # result can similarly silently corrupt numerical data.
            raise ValueError("encoding must be 'ASCII', 'latin1', or 'bytes'")

        pickle_kwargs = dict(encoding=encoding, fix_imports=fix_imports)

        with contextlib.ExitStack() as stack:
            if hasattr(file, 'read'):
                fid = file
                own_fid = False
            else:
                fid = stack.enter_context(open(os_fspath(file), "rb"))
                own_fid = True

            # Code to distinguish from NumPy binary files and pickles.
            _ZIP_PREFIX = b'PK\x03\x04'
            _ZIP_SUFFIX = b'PK\x05\x06'  # empty zip files start with this
            N = len(format.MAGIC_PREFIX)
            magic = fid.read(N)
            # If the file size is less than N, we need to make sure not
            # to seek past the beginning of the file
            fid.seek(-min(N, len(magic)), 1)  # back-up
            if magic.startswith(_ZIP_PREFIX) or magic.startswith(_ZIP_SUFFIX):
                # zip-file (assume .npz)
                # Potentially transfer file ownership to NpzFile
                stack.pop_all()
                ret = NpzFile(fid, own_fid=own_fid, allow_pickle=allow_pickle,
                              pickle_kwargs=pickle_kwargs)
                return ret
            elif magic == format.MAGIC_PREFIX:
                # .npy file
                if mmap_mode:
                    return format.open_memmap(file, mode=mmap_mode)
                else:
                    return format.read_array(fid, allow_pickle=allow_pickle,
                                             pickle_kwargs=pickle_kwargs)
            else:
                # Try a pickle
                if not allow_pickle:
>                   raise ValueError("Cannot load file containing pickled data "
                                     "when allow_pickle=False")
E                   ValueError: Cannot load file containing pickled data when allow_pickle=False

../../../opt/miniconda3/envs/gli/lib/python3.10/site-packages/numpy/lib/npyio.py:418: ValueError
During handling of the above exception, another exception occurred:
dataset_name = 'pubmed'
    @pytest.mark.parametrize("dataset_name", find_datasets())
    def test_data_loading(dataset_name):
        """Test data loading for a given dataset.

        Test if get_gli_graph, get_gli_task, and get_gli_dataset
        can be applied successfully.
        """
        # temporary skipping all large datasets
        dataset = dataset_name
        test_cfg = load_config_file("tests/config.yaml")
        if dataset in test_cfg["large_dataset_to_skip"]:
            return
        directory = os.getcwd() + "/datasets/" + dataset
        task_list = []
        for file in os.listdir(directory):
            if fnmatch.fnmatch(file, "task*.json"):
                with open(directory + "/" + file, encoding="utf-8") as f:
                    task_dict = json.load(f)
                    if task_dict["type"] not in SUPPORTED_TASK_TYPES:
                        f.close()
                        return
                    task_list.append(task_dict["type"])
        try:
            _ = gli.dataloading.get_gli_graph(dataset)
        except (AssertionError,
                AttributeError,
                ModuleNotFoundError,
                IndexError,
                ValueError) as e:
            print(e, dataset, "graph loading failed")
>           assert False
E           assert False

tests/test_data_loading.py:42: AssertionError
-------------------------------------------- Captured stdout call ---------------------------------------------
{'large_dataset_to_skip': ['wiki', 'ogbg-code2']}
Cannot load file containing pickled data when allow_pickle=False pubmed graph loading failed
============================================== warnings summary ===============================================
../../../opt/miniconda3/envs/gli/lib/python3.10/site-packages/scipy/__init__.py:146
/Users/jinhuang/opt/miniconda3/envs/gli/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
../../../opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/backend/pytorch/tensor.py:16
../../../opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/backend/pytorch/tensor.py:16
/Users/jinhuang/opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/backend/pytorch/tensor.py:16: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
if LooseVersion(th.__version__) < LooseVersion("1.9.0"):
../../../opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/backend/pytorch/tensor.py:340
../../../opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/backend/pytorch/tensor.py:340
/Users/jinhuang/opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/backend/pytorch/tensor.py:340: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
if LooseVersion(th.__version__) >= LooseVersion("1.10.0"):
../../../opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/dataloading/dataloader.py:32
/Users/jinhuang/opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/dataloading/dataloader.py:32: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
PYTORCH_VER = LooseVersion(torch.__version__)
../../../opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/_dataloading/pytorch/dataloader.py:24
/Users/jinhuang/opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/_dataloading/pytorch/dataloader.py:24: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
PYTORCH_VER = LooseVersion(th.__version__)
../../../opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/_dataloading/pytorch/dataloader.py:25
/Users/jinhuang/opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/_dataloading/pytorch/dataloader.py:25: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
PYTORCH_16 = PYTORCH_VER >= LooseVersion("1.6.0")
../../../opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/_dataloading/pytorch/dataloader.py:26
/Users/jinhuang/opt/miniconda3/envs/gli/lib/python3.10/site-packages/dgl-0.9.0-py3.10-macosx-11.1-arm64.egg/dgl/_dataloading/pytorch/dataloader.py:26: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
PYTORCH_17 = PYTORCH_VER >= LooseVersion("1.7.0")
tests/test_data_loading.py::test_data_loading[pubmed]
/Users/jinhuang/Documents/research/gli/gli/utils.py:137: UserWarning: We detected some HTML elements in the downloaded file. This most likely means that the download triggered an unhandled API response by GDrive. Please report this to torchvision at https://github.com/pytorch/vision/issues including the response:
<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"/><title>Sorry...</title><style> body { font-family: verdana, arial, sans-serif; background-color: #fff; color: #000; }</style></head><body><div><table><tr><td><b><font face=sans-serif size=10><font color=#4285f4>G</font><font color=#ea4335>o</font><font color=#fbbc05>o</font><font color=#4285f4>g</font><font color=#34a853>l</font><font color=#ea4335>e</font></font></b></td><td style="text-align: left; vertical-align: bottom; padding-bottom: 15px; width: 50%"><div style="border-bottom: 1px solid #dfdfdf;">Sorry...</div></td></tr></table></div><div style="margin-left: 4em;"><h1>We're sorry...</h1><p>... but your computer or network may be sending automated queries. To protect our users, we can't process your request right now.</p></div><div style="margin-left: 4em;">See <a href="https://support.google.com/websearch/answer/86640">Google Help</a> for more information.<br/><br/></div><div style="text-align: center; border-top: 1px solid #dfdfdf;"><a href="https://www.google.com">Google Home</a></div></body></html>
warnings.warn(
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
=========================================== short test summary info ===========================================
FAILED tests/test_data_loading.py::test_data_loading[pubmed] - assert False
=============================== 1 failed, 37 deselected, 10 warnings in 13.66s ================================
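
Additional context
The failure also reproduces outside pytest. A minimal sketch of the failing call from the test above (it assumes the gli package from this repo is importable, e.g. after installing it from the repo root):

# Minimal standalone repro of the call the test makes.
import gli.dataloading

# Raises: ValueError: Cannot load file containing pickled data when allow_pickle=False
g = gli.dataloading.get_gli_graph("pubmed")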
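
As the np.load source in the traceback shows, the pickle branch that raises this ValueError is only reached when the file starts with neither the zip magic b'PK\x03\x04' nor the .npy magic. Inspecting the first bytes of the downloaded file therefore shows whether it is a real .npz archive or, as the captured GDrive warning suggests, an HTML error page. A minimal diagnostic sketch (the relative path is taken from the traceback; adjust it to your checkout):

# Check whether the downloaded pubmed.npz is a real zip/.npz archive or
# something else (e.g. an HTML "Sorry..." page from the GDrive download).
import zipfile

path = "datasets/pubmed/pubmed.npz"  # relative to the repo root

with open(path, "rb") as f:
    head = f.read(64)
print(head)                      # a valid .npz starts with b'PK\x03\x04'
print(zipfile.is_zipfile(path))  # False for anything that is not a zip archive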