dgl._ffi.base.DGLError: [13:15:35] /opt/dgl/src/array/cuda/spmm.cu:213: Check failed: e == CUSPARSE_STATUS_SUCCESS: CUSPARSE ERROR: 1

toooooodo commented 3 years ago

🐛 Bug

To Reproduce

I ran the tutorial code, but an error occurred.

import dgl.data
import torch.nn.functional as F
from dgl.dataloading import GraphDataLoader
from dgl.nn import GraphConv
import torch.nn as nn
import torch

class Classifier(nn.Module):
    """Graph classifier: two GraphConv layers, an average readout, and a linear head."""

    def __init__(self, in_dim, hidden_dim, n_classes):
        super().__init__()
        # Layers are registered in the order they are applied in forward().
        self.conv1 = GraphConv(in_dim, hidden_dim)
        self.conv2 = GraphConv(hidden_dim, hidden_dim)
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward(self, g, h):
        """Return the output of the linear head for graph ``g`` with node features ``h``."""
        # Two rounds of graph convolution, each followed by a ReLU.
        hidden = F.relu(self.conv1(g, h))
        hidden = F.relu(self.conv2(g, hidden))
        # Store the node features under 'h' inside a local scope so the
        # readout below can refer to them by name.
        with g.local_scope():
            g.ndata['h'] = hidden
            # Graph representation = mean of the 'h' node features.
            graph_repr = dgl.mean_nodes(g, 'h')
            logits = self.classify(graph_repr)
        return logits

if __name__ == '__main__':
    # Reproduction script: train the Classifier on the MUTAG dataset on the
    # first CUDA device for 20 epochs.
    dataset = dgl.data.GINDataset('MUTAG', False)
    device = torch.device('cuda:0')
    # Classifier(in_dim=7, hidden_dim=20, n_classes=5)
    model = Classifier(7, 20, 5).to(device)
    dataloader = GraphDataLoader(dataset,
                                 batch_size=1024,
                                 drop_last=False,
                                 shuffle=True)
    opt = torch.optim.Adam(model.parameters())
    for epoch in range(20):
        for batched_graph, labels in dataloader:
            feats = batched_graph.ndata['attr'].to(device)
            batched_graph = batched_graph.to(device)
            # The targets must live on the same device as the logits;
            # otherwise F.cross_entropy fails with a device-mismatch error.
            labels = labels.to(device)
            logits = model(batched_graph, feats)
            loss = F.cross_entropy(logits, labels)
            opt.zero_grad()
            loss.backward()
            opt.step()

Errors:

Using backend: pytorch
Traceback (most recent call last):
  File "/home/zhuangxiang/code/test.py", line 42, in <module>
    logits = model(batched_graph, feats)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/zhuangxiang/code/test.py", line 19, in forward
    h = F.relu(self.conv1(g, h))
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/nn/pytorch/conv/graphconv.py", line 423, in forward
    graph.update_all(aggregate_fn, fn.sum(msg='m', out='h'))
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/heterograph.py", line 4686, in update_all
    ndata = core.message_passing(g, message_func, reduce_func, apply_node_func)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/core.py", line 283, in message_passing
    ndata = invoke_gspmm(g, mfunc, rfunc)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/core.py", line 258, in invoke_gspmm
    z = op(graph, x)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/ops/spmm.py", line 170, in func
    return gspmm(g, 'copy_lhs', reduce_op, x, None)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/ops/spmm.py", line 62, in gspmm
    ret = gspmm_internal(g._graph, op,
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/backend/pytorch/sparse.py", line 307, in gspmm
    return GSpMM.apply(gidx, op, reduce_op, lhs_data, rhs_data)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/torch/cuda/amp/autocast_mode.py", line 213, in decorate_fwd
    return fwd(*args, **kwargs)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/backend/pytorch/sparse.py", line 87, in forward
    out, (argX, argY) = _gspmm(gidx, op, reduce_op, X, Y)
  File "/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/sparse.py", line 157, in _gspmm
    _CAPI_DGLKernelSpMM(gidx, op, reduce_op,
  File "dgl/_ffi/_cython/./function.pxi", line 287, in dgl._ffi._cy3.core.FunctionBase.__call__
  File "dgl/_ffi/_cython/./function.pxi", line 232, in dgl._ffi._cy3.core.FuncCall
  File "dgl/_ffi/_cython/./base.pxi", line 155, in dgl._ffi._cy3.core.CALL
dgl._ffi.base.DGLError: [13:15:35] /opt/dgl/src/array/cuda/spmm.cu:213: Check failed: e == CUSPARSE_STATUS_SUCCESS: CUSPARSE ERROR: 1
Stack trace:
  [bt] (0) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x4f) [0x7fb378ce90ff]
  [bt] (1) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(void dgl::aten::cusparse::CusparseCsrmm2<float, long>(DLContext const&, dgl::aten::CSRMatrix const&, float const*, float const*, float*, int)+0x762) [0x7fb3798dd852]
  [bt] (2) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(void dgl::aten::SpMMCsr<2, long, 32>(std::string const&, std::string const&, dgl::BcastOff const&, dgl::aten::CSRMatrix const&, dgl::runtime::NDArray, dgl::runtime::NDArray, dgl::runtime::NDArray, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> >)+0xdc) [0x7fb3799268ac]
  [bt] (3) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(dgl::aten::SpMM(std::string const&, std::string const&, std::shared_ptr<dgl::BaseHeteroGraph>, dgl::runtime::NDArray, dgl::runtime::NDArray, dgl::runtime::NDArray, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> >)+0x2633) [0x7fb378e3af53]
  [bt] (4) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(+0x6a7e5c) [0x7fb378e45e5c]
  [bt] (5) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(+0x6a85a1) [0x7fb378e465a1]
  [bt] (6) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/libdgl.so(DGLFuncCall+0x48) [0x7fb3793d5a98]
  [bt] (7) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/_ffi/_cy3/core.cpython-38-x86_64-linux-gnu.so(+0x15d3e) [0x7fb35e7e3d3e]
  [bt] (8) /data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl/_ffi/_cy3/core.cpython-38-x86_64-linux-gnu.so(+0x1626b) [0x7fb35e7e426b]

The above code runs correctly on cpu but goes wrong on gpu.

Environment

DGL Version (e.g., 1.0): 0.6.0.post1
Backend Library & Version (e.g., PyTorch 0.4.1, MXNet/Gluon 1.3): Pytorch 1.7.0
OS (e.g., Linux): Linux
How you installed DGL (conda, pip, source): pip
Python version: 3.8.5
CUDA/cuDNN version (if applicable): 11.0
GPU models and configuration (e.g. V100): GeForce RTX 3090

Additional context

VoVAllen commented 3 years ago

It seems you are using a conda environment. Could you try `pip uninstall dgl` and then use `conda install dgl-cuda11.0` instead?

toooooodo commented 3 years ago

I tried `pip uninstall dgl` followed by `conda install dgl-cuda11.0`, but the same error still occurs.

VoVAllen commented 3 years ago

How did you install PyTorch? Did you use conda to install torch with the same CUDA version as DGL? Also, how did you install CUDA: via conda install, or are you using the system library?

toooooodo commented 3 years ago

I installed PyTorch offline using pip install. I checked the torch and DGL versions, and they are compatible with the CUDA version. I use the system CUDA library, and there is no problem running the following code.

import torch

device = torch.device('cuda:0')
x = torch.randn((5,100)).to(device)
linear = torch.nn.Linear(100, 50).to(device)

x = linear(x)

toooooodo commented 3 years ago

I tried another example, and the same error occurred.

import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import GraphConv


class Classifier(nn.Module):
    """A single GraphConv layer followed by a ReLU activation."""

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.conv1 = GraphConv(in_dim, out_dim)

    def forward(self, g, h):
        """Return ReLU(GraphConv(g, h)) for graph ``g`` and node features ``h``."""
        # Apply graph convolution and activation.
        h = F.relu(self.conv1(g, h))
        return h


# Minimal reproduction: a small graph with three edges (2->1, 3->2, 4->3),
# moved to the first CUDA device, with a self-loop added to every node.
# NOTE: `dgl` itself must be imported for the `dgl.graph` / `dgl.add_self_loop`
# calls below; `from dgl.nn import GraphConv` alone does not bind the name.
src_ids = torch.tensor([2, 3, 4])
dst_ids = torch.tensor([1, 2, 3])
device = torch.device('cuda:0')
g = dgl.graph((src_ids, dst_ids)).to(device)
g = dgl.add_self_loop(g)
# Random 100-dimensional features for 5 nodes, fed through the model on GPU.
x = torch.randn((5, 100)).to(device)
model = Classifier(100, 20).to(device)
model(g, x)

yzh119 commented 3 years ago

@toooooodo pip uninstall dgl will not remove dgl-cu110 installed with pip. Please check your dgl installation path via

import dgl
print(dgl.__path__)

toooooodo commented 3 years ago

Actually I uninstalled using pip uninstall dgl-cu110, and the output of dgl.__path__ is ['/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl']

github-actions[bot] commented 2 years ago

This issue has been automatically marked as stale due to lack of activity. It will be closed if no further activity occurs. Thank you

github-actions[bot] commented 2 years ago

This issue is closed due to lack of activity. Feel free to reopen it if you still have questions.

Naviape commented 2 years ago

Does dgl.nn.GraphConv really exist?

wengcanbin1 commented 1 year ago

I ran into the same problem as you. Did you manage to solve it?

shanzhiq commented 1 year ago

I am also facing the same problem. Is there a solution for this issue now?

a1941409241 commented 11 months ago

I come to the same problem with you , I want to know if you solve it?

Hello, did you figure it out? I want to know how to solve it.

suhu93 commented 9 months ago

> Actually I uninstalled using pip uninstall dgl-cu110, and the output of dgl.__path__ is ['/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl']

Hello, I have encountered the same problem. Have you resolved it yet?

shanzhiq commented 8 months ago

No. I updated CUDA to 11.6 and installed another version instead.

发件人: 毛日强 @.> 发送时间: 2023年12月8日 10:49 收件人: dmlc/dgl @.> 抄送: Leon stark @.>; Comment @.> 主题: Re: [dmlc/dgl] dgl._ffi.base.DGLError: [13:15:35] /opt/dgl/src/array/cuda/spmm.cu:213: Check failed: e == CUSPARSE_STATUS_SUCCESS: CUSPARSE ERROR: 1 (#2762)

> Actually I uninstalled using pip uninstall dgl-cu110, and the output of dgl.__path__ is ['/data/zhuangxiang/anaconda3/lib/python3.8/site-packages/dgl']

Hello, I have encountered the same problem. Have you resolved it yet?

― Reply to this email directly, view it on GitHub (https://github.com/dmlc/dgl/issues/2762#issuecomment-1846472032), or unsubscribe (https://github.com/notifications/unsubscribe-auth/AKRWZ62FKIKMIQCN34VYA4DYIJ55DAVCNFSM4ZMUCIL2U5DIOJSWCZC7NNSXTN2JONZXKZKDN5WW2ZLOOQ5TCOBUGY2DOMRQGMZA). You are receiving this because you commented. Message ID: @.***>

ramithuh commented 6 months ago

I also tried to get this example code working, this is what worked for me.

After a lot of back and forth trying to match Python, PyTorch, and CUDA versions [1], the following steps worked for me. (It's easier to start with a new environment, because there may be many package conflicts in an existing one.)

[1] - https://www.dgl.ai/pages/start.html

## Create new environment, use arbitrary name "myenv" that you prefer
conda create -n myenv python=3.11

## Activate environment
source activate myenv

## Install pytorch 2.2 
conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=12.1 -c pytorch -c nvidia

## Install dgl which matches pytorch 2.2 and cuda 12.1 
conda install -c dglteam/label/cu121 dgl

## Add environment to jupyter kernel
conda install -c anaconda ipykernel -y
python -m ipykernel install --user --name=myenv

# install remaining things that dgl needs
pip install torchdata
pip install pandas
pip install pyyaml
pip install pydantic

I tried another example, and the same error occurred.

import torch.nn.functional as F
import dgl
from dgl.nn import GraphConv
import torch.nn as nn
import torch
class Classifier(nn.Module):
    """A single GraphConv layer with a ReLU applied to its output."""

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.conv1 = GraphConv(in_dim, out_dim)

    def forward(self, g, h):
        """Run the graph convolution on ``g`` with features ``h``, then apply ReLU."""
        conv_out = self.conv1(g, h)
        return F.relu(conv_out)
# Build a graph with edges 2->1, 3->2, 4->3 on the first CUDA device and
# add a self-loop to every node.
device = torch.device('cuda:0')
edge_sources = torch.tensor([2, 3, 4])
edge_targets = torch.tensor([1, 2, 3])
g = dgl.add_self_loop(dgl.graph((edge_sources, edge_targets)).to(device))

# Random 100-dimensional features for 5 nodes; the features are drawn before
# the model is built, as in the original statement order.
x = torch.randn(5, 100).to(device)
model = Classifier(100, 20).to(device)
model(g, x)

notabigfish commented 6 months ago

This works for me. Thanks! Anyone who hits this error on a 4090 GPU could try this solution.

I also tried to get this example code working, this is what worked for me.

After a lot of back and forth trying to match Python, PyTorch, and CUDA versions [1], the following steps worked for me. (It's easier to start with a new environment, because there may be many package conflicts in an existing one.)

[1] - https://www.dgl.ai/pages/start.html

## Create new environment, use arbitrary name "myenv" that you prefer
conda create -n myenv python=3.11

## Activate environment
source activate myenv

## Install pytorch 2.2 
conda install pytorch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 pytorch-cuda=12.1 -c pytorch -c nvidia

## Install dgl which matches pytorch 2.2 and cuda 12.1 
conda install -c dglteam/label/cu121 dgl

## Add environment to jupyter kernel
conda install -c anaconda ipykernel -y
python -m ipykernel install --user --name=myenv

# install remaining things that dgl needs
pip install torchdata
pip install pandas
pip install pyyaml
pip install pydantic

I tried another example, and the same error occurred.

import torch.nn.functional as F
import dgl
from dgl.nn import GraphConv
import torch.nn as nn
import torch
class Classifier(nn.Module):
    """A single GraphConv layer with a ReLU applied to its output."""

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.conv1 = GraphConv(in_dim, out_dim)

    def forward(self, g, h):
        """Run the graph convolution on ``g`` with features ``h``, then apply ReLU."""
        conv_out = self.conv1(g, h)
        return F.relu(conv_out)
# Build a graph with edges 2->1, 3->2, 4->3 on the first CUDA device and
# add a self-loop to every node.
device = torch.device('cuda:0')
edge_sources = torch.tensor([2, 3, 4])
edge_targets = torch.tensor([1, 2, 3])
g = dgl.add_self_loop(dgl.graph((edge_sources, edge_targets)).to(device))

# Random 100-dimensional features for 5 nodes; the features are drawn before
# the model is built, as in the original statement order.
x = torch.randn(5, 100).to(device)
model = Classifier(100, 20).to(device)
model(g, x)

dmlc / dgl