PRBonn / bonnetal

Bonnet and then some! Deep Learning Framework for various Image Recognition Tasks. Photogrammetry and Robotics Lab, University of Bonn
MIT License
233 stars 60 forks source link

GPU stops working when running inference #22

Closed duda1202 closed 4 years ago

duda1202 commented 4 years ago

Hi,

I am using a GeForce RTX 2060 with bonnetal and it is crashing the GPU. I get the error:

Unable to determine the device handle for GPU 0000:01:00.0: GPU is lost. Reboot the system to recover this GPU

In this case, I am using my own ROS code, which calls `user.infer`. This is the code:

#!/usr/bin/env python3
# Futures
from __future__ import print_function

# STD
import sys
import time
import argparse
import subprocess
import datetime
import os
import shutil

# ROS
import rospy
import roslib
from sensor_msgs.msg import CompressedImage

# numpy and scipy
import numpy as np
from scipy.ndimage import filters

# OpenCV
import cv2
from cv_bridge import CvBridge, CvBridgeError

# For overlaying images
from PIL import Image

import torch
# Report once, at import time, whether PyTorch can see a CUDA device
cuda = torch.cuda.is_available()
if not cuda:
    print("Model is NOT using GPU")
print("Cuda:", cuda)

class BonnetalNode:
    """
    Encapsulates the bonnetal functionality into a ROS node.

    The node subscribes to a compressed camera topic, runs every received
    frame through the bonnetal inference object stored in ``self.user`` and
    publishes both the label image and a 50/50 overlay of input and labels.
    """
    # ROS subscriber for input images
    img_sub = None
    # ROS publishers for the label image and the overlaid image
    labelled_img_pub = None
    overlaid_img_pub = None
    # Bonnetal interface (object exposing ``infer``)
    user = None

    def __init__(self):
        """
        Initializes ROS (pubs and subs) and bonnetal.

        Reads the ROS parameters ``path_model``, ``backend``, ``camera_topic``
        and ``abs_path``.
        """
        # Initialize ROS
        rospy.init_node("bonnetal_node")
        init = rospy.Time.now()
        # Parameters Config
        path_model = rospy.get_param("path_model")
        backend = rospy.get_param("backend")
        camera_topic = rospy.get_param("camera_topic")

        # Add path for bonnetal files. os.path.join keeps the previous result
        # when abs_path ends with a separator and also works when it does not.
        abs_path = rospy.get_param("abs_path")
        print ("Abs path is: ", abs_path)
        sys.path.insert(0, os.path.join(abs_path, "bonnetal", "train"))

        # Initialize bonnetal
        self.initialize_bonnetal(path=path_model, backend=backend)

        # Initialize publishers and subscribers
        self.overlaid_img_pub = rospy.Publisher("/overlaid_image/compressed",
                CompressedImage, queue_size = 1)
        self.labelled_img_pub = rospy.Publisher("/output_labelled_img/compressed",
                CompressedImage, queue_size = 1)
        # buff size allows callback to get the latest msg instead of queueing them
        # NOTE(review): 2**32 requests a 4 GiB receive buffer; a smaller value
        # may be sufficient - confirm before changing.
        self.img_sub = rospy.Subscriber(camera_topic,
                    CompressedImage, self.image_callback,  queue_size = 1, buff_size=2**32)

        rospy.loginfo("Segmentation node initialized in {} seconds!".format(
            (rospy.Time.now()-init).to_sec()))

    def initialize_bonnetal(self, path, backend="native", workspace=8000000000, calib_images=None):
        """
        Initializes bonnetal and stores the inference object in ``self.user``.

        Any failure while importing or constructing one of the optional
        backends ("tensorrt", "caffe2", "pytorch") is reported and terminates
        the process with a non-zero exit status.

        :type path: string
        :param path: full path to pretrained model

        :type backend: string
        :param backend: framework for segmentation task; any value other than
            "tensorrt", "caffe2" or "pytorch" selects native PyTorch

        :type workspace: int
        :param workspace: max workspace size (only for TensorRT framework)

        :type calib_images: list
        :param calib_images: calibration images, must be a list of images (only for TensorRT framework)

        :raises SystemExit: with code 1 if the selected optional backend
            cannot be imported or constructed
        """
        # create inference context for the desired backend
        if backend == "tensorrt":
            # import and use tensorRT
            try:
                print("Using tensorRT")
                from tasks.segmentation.modules.userTensorRT import UserTensorRT
                self.user = UserTensorRT(path, workspace, calib_images)
            except ImportError as e:
                print ("ERROR:", e)
                sys.exit(1)
            except Exception as e:
                # `Exception` (not a bare except) so Ctrl-C and SystemExit are
                # not swallowed; the underlying error is shown with the hint.
                print('\nERROR:TensorRT needs to use inference model type .onnx. You can make one '
                    'using tasks/segmentation/make_deploy_model.py')
                print ("ERROR:", e)
                sys.exit(1)
        elif backend == "caffe2":
            try:
                # import and use caffe2
                print("Using caffe2")
                from tasks.segmentation.modules.userCaffe2 import UserCaffe2
                self.user = UserCaffe2(path)
            except ImportError as e:
                print ("ERROR:", e)
                sys.exit(1)
            except Exception as e:
                print('\nERROR:Caffe2 needs to use inference model type .onnx. You can make one '
                    'using tasks/segmentation/make_deploy_model.py')
                print ("ERROR:", e)
                sys.exit(1)

        elif backend == "pytorch":
            # import and use pytorch
            try:
                print("Using PyTorch")
                from tasks.segmentation.modules.userPytorch import UserPytorch
                self.user = UserPytorch(path)
            except ImportError as e:
                print ("ERROR:", e)
                sys.exit(1)
            except Exception as e:
                print('\nERROR:PyTorch needs to use inference model type .pytorch. You can make one '
                    'using tasks/segmentation/make_deploy_model.py')
                print ("ERROR:", e)
                sys.exit(1)

        else:
            # default to native pytorch
            print("Using native PyTorch")
            from tasks.segmentation.modules.user import User
            self.user = User(path)

    def segment_image(self, cv_img):
        """
        Runs inference on one image and blends the result over the input.

        :type cv_img: numpy.ndarray
        :param cv_img: OpenCV color image to segment

        :rtype: tuple
        :returns: ``(lbl_img, overlay_img)`` where ``lbl_img`` is the second
            value returned by ``self.user.infer`` and ``overlay_img`` is a
            PIL image blending ``cv_img`` and ``lbl_img`` with alpha 0.5
        """
        # infer
        # NOTE(review): the meaning of the second positional argument (False)
        # is defined by the bonnetal user classes - confirm there.
        _, lbl_img = self.user.infer(cv_img, False)
        overlay_img = Image.blend(Image.fromarray(cv_img), Image.fromarray(lbl_img), 0.5)

        return lbl_img, overlay_img

    def unpack_image_msg(self, msg):
        """
        Receives a sensor_msgs/CompressedImage and returns a cv image

        :type msg: CompressedImage
        :param msg: CompressedImage ROS message

        :rtype: numpy.ndarray or None
        :returns: OpenCV color image, or None if the message carries no data
            or the data cannot be decoded
        """
        if not msg.data:
            return None
        # np.frombuffer replaces the deprecated binary mode of np.fromstring
        np_arr = np.frombuffer(msg.data, np.uint8)
        cv_img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)

        return cv_img

    def re_pack_image_msg(self, cv_img):
        """
        Packing an image to ROS message CompressedImage

        :type cv_img: array-like
        :param cv_img: image to encode; it is converted with ``np.asarray``,
            so both OpenCV arrays and PIL images are accepted

        :rtype: CompressedImage
        :returns: CompressedImage ROS message in jpeg format, stamped with the
            current ROS time
        """
        img_msg = CompressedImage()
        img_msg.header.stamp = rospy.Time.now()
        img_msg.format = "jpeg"
        _, encoded = cv2.imencode('.jpg', np.asarray(cv_img))
        # tobytes() replaces the deprecated ndarray.tostring()
        img_msg.data = np.asarray(encoded).tobytes()

        return img_msg

    def pub_lbl_img(self, cv_img):
        """
        Publishes the labelled (segmented) images.

        :type cv_img: array-like
        :param cv_img: label image to encode and publish
        """
        img_msg = self.re_pack_image_msg(cv_img)
        self.labelled_img_pub.publish(img_msg)

    def pub_overlay_img(self, cv_img):
        """
        Publishes the overlaid images.

        :type cv_img: array-like
        :param cv_img: overlay image to encode and publish
        """
        img_msg = self.re_pack_image_msg(cv_img)
        self.overlaid_img_pub.publish(img_msg)

    def image_callback(self, msg):
        """
        Receives sensor_msgs/CompressedImage and publishes labelled images.

        Frames that cannot be decoded are skipped with a warning instead of
        being passed on to inference.

        :type msg: CompressedImage
        :param msg: CompressedImage ROS message
        """
        cv_img = self.unpack_image_msg(msg)
        if cv_img is None:
            rospy.logwarn("Could not decode incoming compressed image; skipping frame")
            return

        lbl_img, overlay_img = self.segment_image(cv_img)

        self.pub_lbl_img(lbl_img)
        self.pub_overlay_img(overlay_img)

    def run(self):
        """
        Enters the main loop for processing messages.
        """
        rospy.spin()

def main():
    """Create the bonnetal ROS node and hand control to its message loop."""
    BonnetalNode().run()

# Start the node only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()

Do you know what the issue could be?

tano297 commented 4 years ago

Hi, this is likely a hardware problem. I would suggest 1) checking all of your power supply cables, and 2) checking dmesg for hardware problems. This is, however, NVIDIA-related, so I will close it here.