LaurentMazare / tch-rs

Rust bindings for the C++ api of PyTorch.
Apache License 2.0
4.2k stars 330 forks source link

RuntimeError: Expected all tensors to be on the same device #608

Closed alarst13 closed 1 year ago

alarst13 commented 1 year ago

Hi! I have a pre-trained PyTorch model that was trained on a GPU (using CUDA acceleration). I traced this model with `torch.jit.trace` and saved it as `model.pt`, then used the code from one of this repo's examples to run inference in Rust with the traced model:

use std::env;
use anyhow::{bail, Result};
use tch::vision::imagenet;
use tch::Kind::Float;

/// Classifies one image with a TorchScript model and prints the top-5 classes.
///
/// Expects exactly two command-line arguments: the path to the traced model
/// file, followed by the path to the image to classify.
///
/// # Errors
///
/// Returns a usage error when the argument count is wrong, and forwards any
/// error reported while loading the image, loading the model, or running the
/// forward pass.
fn main() -> Result<()> {
    // Collect the raw command line (conventionally, element 0 is the program name).
    let cli: Vec<String> = env::args().collect();

    // Exactly two user arguments are accepted: the model path, then the image path.
    let (model_file, image_file) = if let [_, model, image] = cli.as_slice() {
        (model, image)
    } else {
        bail!("usage: main model.pt tiger.jpg")
    };

    // Load the image and resize it to 32 x 32, MiniONN's standard input size.
    let image = imagenet::load_image_and_resize(image_file, 32, 32)?;

    // Load the TorchScript traced model.
    let model = tch::CModule::load(model_file)?;

    // Add a leading batch dimension, run the forward pass, and apply a softmax
    // over the last dimension to obtain per-class probabilities.
    // (The original author also kept an alternative spelling commented out:
    // `image.unsqueeze(0).apply(&model).softmax(-1, Float)`.)
    let batch = image.unsqueeze(0);
    let scores = model.forward_ts(&[batch])?;
    let output = scores.softmax(-1, Float);

    // Print the five most probable classes, each with its probability as a percentage.
    for (probability, class) in imagenet::top(&output, 5) {
        println!("{:50} {:5.2}%", class, 100.0 * probability);
    }
    Ok(())
}

However, I keep getting this error. Do you know how I can fix this?

Error: Internal torch error: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript, serialized code (most recent call last):
  File "code/__torch__/detectron2/export/flatten.py", line 9, in forward
  def forward(self: __torch__.detectron2.export.flatten.TracingAdapter,
    input: Tensor) -> Tuple[Tensor]:
    return ((self.model).forward(input, ),)
             ~~~~~~~~~~~~~~~~~~~ <--- HERE
  File "code/__torch__/minionn.py", line 27, in forward
    _7 = self.pooling1
    _8 = self.conv2
    input0 = torch.relu_((self.conv1).forward(input, ))
                          ~~~~~~~~~~~~~~~~~~~ <--- HERE
    input1 = torch.relu_((_8).forward(input0, ))
    _9 = (_6).forward((_7).forward(input1, ), )
  File "code/__torch__/torch/nn/modules/conv.py", line 9, in forward
  def forward(self: __torch__.torch.nn.modules.conv.Conv2d,
    input: Tensor) -> Tensor:
    input0 = torch._convolution(input, self.weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 1, False, False, True, True)
             ~~~~~~~~~~~~~~~~~~ <--- HERE
    return input0

Traceback of TorchScript, original code (most recent call last):
/home/cc/anaconda3/envs/mlenv/lib/python3.8/site-packages/torch/nn/modules/conv.py(439): _conv_forward
/home/cc/anaconda3/envs/mlenv/lib/python3.8/site-packages/torch/nn/modules/conv.py(443): forward
/home/cc/anaconda3/envs/mlenv/lib/python3.8/site-packages/torch/nn/modules/module.py(1039): _slow_forward
/home/cc/anaconda3/envs/mlenv/lib/python3.8/site-packages/torch/nn/modules/module.py(1051): _call_impl
/home/cc/research/AdderNet/minionn.py(19): forward
/home/cc/anaconda3/envs/mlenv/lib/python3.8/site-packages/torch/nn/modules/module.py(1039): _slow_forward
/home/cc/anaconda3/envs/mlenv/lib/python3.8/site-packages/torch/nn/modules/module.py(1051): _call_impl
/home/cc/anaconda3/envs/mlenv/lib/python3.8/site-packages/detectron2/export/flatten.py(259): <lambda>
/home/cc/anaconda3/envs/mlenv/lib/python3.8/site-packages/detectron2/export/flatten.py(294): forward
/home/cc/anaconda3/envs/mlenv/lib/python3.8/site-packages/torch/nn/modules/module.py(1039): _slow_forward
/home/cc/anaconda3/envs/mlenv/lib/python3.8/site-packages/torch/nn/modules/module.py(1051): _call_impl
/home/cc/anaconda3/envs/mlenv/lib/python3.8/site-packages/torch/jit/_trace.py(952): trace_module
/home/cc/anaconda3/envs/mlenv/lib/python3.8/site-packages/torch/jit/_trace.py(735): trace
/home/cc/research/AdderNet/main.py(169): main
/home/cc/research/AdderNet/main.py(175): <module>
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument weight in method wrapper___slow_conv2d_forward)
alarst13 commented 1 year ago

To fix my problem, I moved the image tensor onto the GPU (the same device as the model's weights), and everything ran smoothly.

// The reported error shows a `cpu` / `cuda:0` mismatch on the convolution
// weight, i.e. the freshly loaded image tensor is not on the model's device.
// The binding is therefore mutable so it can be replaced by the tensor
// returned from `to_device`, which targets CUDA device index 0.
// NOTE(review): `Device` must be in scope (e.g. `use tch::Device;`) — confirm.
let mut image = imagenet::load_image_and_resize(image_file, 32, 32)?;
image = image.to_device(Device::Cuda(0));