aws / amazon-sagemaker-examples

Example 📓 Jupyter notebooks that demonstrate how to build, train, and deploy machine learning models using 🧠 Amazon SageMaker.
https://sagemaker-examples.readthedocs.io
Apache License 2.0
9.79k stars 6.67k forks source link

[Bug Report] Error when using the dgl library in SageMaker #4602

Open secureaiexplorer opened 3 months ago

secureaiexplorer commented 3 months ago

Link to the notebook https://github.com/aws-samples/aws-healthcare-lifescience-ai-ml-sample-notebooks/blob/main/workshops/Molecular-property-prediction/hiv-inhibitor-prediction-dgl/molecule-hiv-inhibitor-prediction-sagemaker.ipynb

Describe the bug: Importing dgl fails in SageMaker Studio when running the notebook linked above. I originally hit this error with a GNN-based notebook I am trying to run.

I decided to first run a notebook that's known to work but hit the same error.

To reproduce: create a Jupyter notebook in SageMaker Studio and run the appropriate cells from the link above. Since the requirements file pins year-old versions, I just replaced the requirements.txt to pick the latest versions.

Logs

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[1], line 7
      4 import torch.nn as nn
      5 import pandas as pd
----> 7 import dgl
      9 from dgllife.model import load_pretrained
     10 from dgllife.utils import smiles_to_bigraph, EarlyStopping, Meter, CanonicalAtomFeaturizer, CanonicalBondFeaturizer

File [/opt/conda/lib/python3.10/site-packages/dgl/__init__.py:16](https://ibsxitneoy2oj4t.studio.us-east-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/dgl/__init__.py#line=15)
     13 from .logging import enable_verbose_logging  # usort: skip
     14 from .backend import backend_name, load_backend  # usort: skip
---> 16 from . import (
     17     container,
     18     cuda,
     19     dataloading,
     20     function,
     21     ops,
     22     random,
     23     sampling,
     24     storages,
     25 )
     26 from ._ffi.base import __version__, DGLError
     27 from ._ffi.function import (
     28     extract_ext_funcs,
     29     get_global_func,
     30     list_global_func_names,
     31     register_func,
     32 )

File [/opt/conda/lib/python3.10/site-packages/dgl/dataloading/__init__.py:13](https://ibsxitneoy2oj4t.studio.us-east-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/dgl/dataloading/__init__.py#line=12)
     11 if F.get_preferred_backend() == "pytorch":
     12     from .spot_target import *
---> 13     from .dataloader import *
     14     from .dist_dataloader import *

File [/opt/conda/lib/python3.10/site-packages/dgl/dataloading/dataloader.py:27](https://ibsxitneoy2oj4t.studio.us-east-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/dgl/dataloading/dataloader.py#line=26)
     25 from ..batch import batch as batch_graphs
     26 from ..cuda import GPUCache
---> 27 from ..distributed import DistGraph
     28 from ..frame import LazyFeature
     29 from ..heterograph import DGLGraph

File [/opt/conda/lib/python3.10/site-packages/dgl/distributed/__init__.py:5](https://ibsxitneoy2oj4t.studio.us-east-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/dgl/distributed/__init__.py#line=4)
      3 from .dist_context import exit_client, initialize
      4 from .dist_dataloader import DistDataLoader
----> 5 from .dist_graph import DistGraph, DistGraphServer, edge_split, node_split
      6 from .dist_tensor import DistTensor
      7 from .graph_partition_book import GraphPartitionBook, PartitionPolicy

File [/opt/conda/lib/python3.10/site-packages/dgl/distributed/dist_graph.py:11](https://ibsxitneoy2oj4t.studio.us-east-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/dgl/distributed/dist_graph.py#line=10)
      7 from collections.abc import MutableMapping
      9 import numpy as np
---> 11 from .. import backend as F, graphbolt as gb, heterograph_index
     12 from .._ffi.ndarray import empty_shared_mem
     13 from ..base import ALL, DGLError, EID, ETYPE, is_all, NID

File [/opt/conda/lib/python3.10/site-packages/dgl/graphbolt/__init__.py:55](https://ibsxitneoy2oj4t.studio.us-east-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/dgl/graphbolt/__init__.py#line=54)
     51     except Exception:  # pylint: disable=W0703
     52         raise ImportError("Cannot load Graphbolt C++ library")
---> 55 load_graphbolt()

File [/opt/conda/lib/python3.10/site-packages/dgl/graphbolt/__init__.py:45](https://ibsxitneoy2oj4t.studio.us-east-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/dgl/graphbolt/__init__.py#line=44), in load_graphbolt()
     43 path = os.path.join(dirname, "graphbolt", basename)
     44 if not os.path.exists(path):
---> 45     raise FileNotFoundError(
     46         f"Cannot find DGL C++ graphbolt library at {path}"
     47     )
     49 try:
     50     torch.classes.load_library(path)

FileNotFoundError: Cannot find DGL C++ graphbolt library at [/opt/conda/lib/python3.10/site-packages/dgl/graphbolt/libgraphbolt_pytorch_2.0.0.post101.so](https://ibsxitneoy2oj4t.studio.us-east-2.sagemaker.aws/opt/conda/lib/python3.10/site-packages/dgl/graphbolt/libgraphbolt_pytorch_2.0.0.post101.so)