microsoft / onnxruntime

ONNX Runtime: cross-platform, high performance ML inferencing and training accelerator
https://onnxruntime.ai
MIT License
14.52k stars 2.91k forks source link

Session's inner variables not refreshed between 2 runs #18742

Open RobinGRAPIN opened 10 months ago

RobinGRAPIN commented 10 months ago

Describe the issue

Running an Ort Session in python two times leads to an error, which is always about an index out of range somewhere in the operations. It makes me think that it is caused by a variable in a "for loop" inside the graph that is not reset between the two runs.

ort_session = ort.InferenceSession(model_path, providers=['CPUExecutionProvider'])
outputs = ort_session.run(None, inputs) # OK 
outputs = ort_session.run(None, inputs) # Error

I tried with several networks and I obtain this kind of error during the second run() : InvalidArgument: [ONNXRuntimeError] : 2 : INVALID_ARGUMENT : Non-zero status code returned while running Gather node. Name:'/sa1/Gather_136' Status Message: indices element out of data bounds, idx=100 must be within the inclusive range [-100,99]

The input for inference can even be the same than the one used for tracing.

For some networks, an interesting thing that I mentioned is that exporting it using dynamic_axes removes this problem, as if beeing exported this way allow to empty a kind of 'cache' in the inner variables of the model.

To reproduce

Dockerfile

FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel RUN pip install onnx RUN pip install onnxruntime

Minimal code

Architecture
import torch
import torch.nn as nn
import torch.nn.functional as F

def index_points(points, idx):
    device = points.device
    B,N,C = points.shape
    view_shape = list(idx.shape)
    view_shape = [view_shape[0]]+[1] * (len(view_shape) - 1) # je mets ca pour le scriptage
    repeat_shape = list(idx.shape)
    repeat_shape[0] = 1
    batch_indices = torch.arange(B, dtype=torch.long).to(device).view(view_shape).repeat(repeat_shape)
    new_points = points[batch_indices, idx, :]
    return new_points

def farthest_point_sample(xyz, npoint : int):
    device = xyz.device
    B, N, C = xyz.shape
    centroids = torch.zeros(B, npoint, dtype=torch.long).to(device)
    distance = torch.ones(B, N).to(device) * 1e10
    farthest = torch.zeros((B), dtype=torch.long).to(device)
    batch_indices = torch.arange(B, dtype=torch.long).to(device)
    for i in range(npoint):
        centroids[:, i] = farthest
        centroid = xyz[batch_indices, farthest, :].view(B, 1, 3)
        dist = torch.sum((xyz - centroid) ** 2, -1)
        distance = torch.where(dist < distance, dist, distance) 
        farthest = torch.max(distance, -1)[1]
    return centroids

def square_distance(src, dst):
    B, N, _ = src.shape
    _, M, _ = dst.shape
    dist = -2 * torch.matmul(src, dst.permute(0, 2, 1))
    dist = dist + torch.sum(src ** 2, -1).view(B, N, 1)
    dist = dist + torch.sum(dst ** 2, -1).view(B, 1, M)
    return dist

def query_ball_point(radius : float, nsample : int, xyz, new_xyz):
    device = xyz.device
    B, N, C = xyz.shape
    _, S, _ = new_xyz.shape
    group_idx = torch.arange(N, dtype=torch.long).to(device).view(1, 1, N).repeat([B, S, 1])
    sqrdists = square_distance(new_xyz, xyz)
    group_idx = torch.where(sqrdists > radius**2, torch.tensor(N).to(device), group_idx)# mien
    group_idx = group_idx.sort(dim=-1)[0][:, :, :nsample]
    group_first = group_idx[:, :, 0].view(B, S, 1).repeat([1, 1, nsample])
    group_idx = torch.where(group_idx == N, group_first, group_idx)# mien
    return group_idx

class PointNetSetAbstractionMs(nn.Module):
    def __init__(self, npoint :int , radius_list, nsample_list, in_channel, mlp_list):
        super().__init__()
        self.npoint = npoint
        self.radius_list = radius_list
        self.nsample_list = nsample_list
        self.conv_blocks = nn.ModuleList()
        self.bn_blocks = nn.ModuleList()
        for i in range(len(mlp_list)):
            convs = nn.ModuleList()
            bns = nn.ModuleList()
            last_channel = in_channel + 3
            for out_channel in mlp_list[i]:
                convs.append(nn.Conv2d(last_channel, out_channel, 1))
                bns.append(nn.BatchNorm2d(out_channel))
                last_channel = out_channel
            self.conv_blocks.append(convs)
            self.bn_blocks.append(bns)

    def forward(self, xyz):
        B, N, C = xyz.shape
        new_xyz = index_points(xyz, farthest_point_sample(xyz, self.npoint))
        new_points_list = []
        for i, (convi, bnormi) in enumerate(zip(self.conv_blocks, self.bn_blocks)):
            group_idx = query_ball_point(self.radius_list[i], self.nsample_list[i], xyz, new_xyz)
            grouped_xyz = index_points(xyz, group_idx)
            grouped_xyz -= new_xyz.view(B, self.npoint, 1, C)
            grouped_points = grouped_xyz
            grouped_points = grouped_points.permute(0, 3, 2, 1)  # [B, feature_size+3, nsample, S=npoint]

            for (convij, bnormij) in zip(convi, bnormi):
                grouped_points =  F.relu(bnormij(convij(grouped_points)))
            new_points = torch.max(grouped_points, 2)[0]  # [B, feature_size+3, S]
            new_points_list.append(new_points)

        new_points_concat = torch.cat(new_points_list, dim=1)
        return new_xyz, new_points_concat

class pointnetpp_encoder(nn.Module):
    """Small pointcloud size for quick test"""
    def __init__(self):
        super().__init__()
        self.sa1 = PointNetSetAbstractionMs(32, [0.1, 0.5], [4,8], 0, [[16, 16, 32], [32, 32, 64]])
        self.sa2 = PointNetSetAbstractionMs(16, [0.5, 1.0], [4,8], 96, [[64, 64, 128], [64, 96, 128]])
        self.sa3 = PointNetSetAbstractionMs(8 , [1.0, 2.0], [4,8], 256, [[128, 196, 256], [128, 196, 256]])
        self.sa4 = PointNetSetAbstractionMs(8 , [2.0, 4.0], [4,8], 512, [[256, 256, 512], [256, 384, 512]])

    def forward(self, xyz):
        l1_xyz, l1_points = self.sa1(xyz)
        l2_xyz, l2_points = self.sa2(l1_xyz, l1_points)
        l3_xyz, l3_points = self.sa3(l2_xyz, l2_points)
        l4_xyz, l4_points = self.sa4(l3_xyz, l3_points)
        return l4_xyz, l4_points

export / import code

import onnxruntime as ort
model = pointnetpp_encoder().eval()
input = 10*torch.randn(1,100,3)
torch.onnx.export(model , input, "test.onnx")
ort_session = ort.InferenceSession("test.onnx", providers=['CPUExecutionProvider'])
names = [input.name  for input in ort_session.get_inputs()]
inputs = {n : input[i].cpu().numpy() for i,n in enumerate(names)} if len(names)>1 else {names[0]: input.cpu().numpy()} 
outputs = ort_session.run(None, inputs)
outputs = ort_session.run(None, inputs) # error
"inner variables reset" with dynamic axes
torch.onnx.export(model , input, "test.onnx", input_names = ["xyz"], dynamic_axes = {'xyz' : {0 : 'batch_size', 1:"n_points"}})
ort_session = ort.InferenceSession("test.onnx", providers=['CPUExecutionProvider'])
names = [input.name  for input in ort_session.get_inputs()]
inputs = {n : input[i].cpu().numpy() for i,n in enumerate(names)} if len(names)>1 else {names[0]: input.cpu().numpy()} 
outputs = ort_session.run(None, inputs)
outputs = ort_session.run(None, inputs) # ok

Urgency

No response

Platform

Linux

OS Version

1 SMP Thu Aug 31 10:29:22 EDT 2023

ONNX Runtime Installation

Built from Source

ONNX Runtime Version or Commit ID

1.16.1

ONNX Runtime API

Python

Architecture

X64

Execution Provider

Default CPU

Execution Provider Library Version

No response

YUNQIUGUO commented 10 months ago

related error message: Non-zero status code returned while running Gather node. Name:'/sa1/Gather_136' Status Message: indices element out of data bounds, idx=100 must be within the inclusive range [-100,99]

RobinGRAPIN commented 10 months ago

Yes but the error message happens only during the second Run, as if idx hadn't been reset to 0 at the end of the first inference.

github-actions[bot] commented 9 months ago

This issue has been automatically marked as stale due to inactivity and will be closed in 30 days if no further activity occurs. If further support is needed, please provide an update and/or more details.

RobinGRAPIN commented 9 months ago

Still not solved

github-actions[bot] commented 8 months ago

This issue has been automatically marked as stale due to inactivity and will be closed in 30 days if no further activity occurs. If further support is needed, please provide an update and/or more details.

GrigoryEvko commented 8 months ago

It's not solved, I'm running onnxruntime 1.18.0 dev with TRT backend, and this error is still here, unfortunately. For the first time everything goes OK, but for the second time it throws an error.

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[34], line 7
      5 face_image = input_image
      6 app.prepare(ctx_id=0, det_size=(640, 640))
----> 7 face_info = app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR))
      8 face_info = sorted(
      9     face_info,
     10     key=lambda x:
     11     (x['bbox'][2] - x['bbox'][0]) * (x['bbox'][3] - x['bbox'][1]))[
     12         -1]  # only use the maximum face
     13 face_emb = face_info['embedding']

File ~/anaconda3/envs/diffusion/lib/python3.11/site-packages/insightface/app/face_analysis.py:59, in FaceAnalysis.get(self, img, max_num)
     58 def get(self, img, max_num=0):
---> 59     bboxes, kpss = self.det_model.detect(img,
     60                                          max_num=max_num,
     61                                          metric='default')
     62     if bboxes.shape[0] == 0:
     63         return []

File ~/anaconda3/envs/diffusion/lib/python3.11/site-packages/insightface/model_zoo/retinaface.py:224, in RetinaFace.detect(self, img, input_size, max_num, metric)
    221 det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 )
    222 det_img[:new_height, :new_width, :] = resized_img
--> 224 scores_list, bboxes_list, kpss_list = self.forward(det_img, self.det_thresh)
    226 scores = np.vstack(scores_list)
    227 scores_ravel = scores.ravel()

File ~/anaconda3/envs/diffusion/lib/python3.11/site-packages/insightface/model_zoo/retinaface.py:158, in RetinaFace.forward(self, img, threshold)
    156 fmc = self.fmc
    157 for idx, stride in enumerate(self._feat_stride_fpn):
--> 158     scores = net_outs[idx]
    159     bbox_preds = net_outs[idx+fmc]
    160     bbox_preds = bbox_preds * stride

IndexError: list index out of range
GrigoryEvko commented 8 months ago

I traced down this error to happen exclusively with TensorrtExecutionProvider with trt_cuda_graph_enable on 1.18.0dev version. Everything without cuda graph works fine, including CPUExecutionProvider, CUDAExecutionProvider (with no options though) and TensorrtExecutionProvider with fp16, engine and timing cache. Pretty strange behaviour.