nelson1425 / EfficientAD

Unofficial implementation of EfficientAD https://arxiv.org/abs/2303.14535
Apache License 2.0

Inference is too slow, is something wrong? #9

Closed kindle-the-life closed 1 year ago

kindle-the-life commented 1 year ago

I wrote an inference interface following the logic of the predict function in efficientad.py. However, in actual testing I found that the average time was close to 1 second (excluding model loading time), and it was still very slow after converting to an ONNX model.

Device information:
CPU: Intel(R) Core(TM) i7-10700K CPU @ 3.80GHz
GPU: NVIDIA GeForce RTX 3080 16G

This is the test code for inference with the PTH models:

import time
import numpy as np
import torch
from tqdm import tqdm

def pth_predict(image, teacher_model, student_model, ae_model, teacher_mean, teacher_std, out_channels,
                q_st_start=None, q_st_end=None, q_ae_start=None, q_ae_end=None):
    teacher_output = teacher_model(image)
    teacher_output = (teacher_output - teacher_mean) / teacher_std
    student_output = student_model(image)
    autoencoder_output = ae_model(image)
    map_st = torch.mean((teacher_output - student_output[:, :out_channels]) ** 2,
                        dim=1, keepdim=True)
    map_ae = torch.mean((autoencoder_output -
                         student_output[:, out_channels:]) ** 2,
                        dim=1, keepdim=True)
    if q_st_start is not None:
        map_st = 0.1 * (map_st - q_st_start) / (q_st_end - q_st_start)
    if q_ae_start is not None:
        map_ae = 0.1 * (map_ae - q_ae_start) / (q_ae_end - q_ae_start)
    map_combined = 0.5 * map_st + 0.5 * map_ae
    return map_combined, map_st, map_ae

if __name__ == '__main__':
    # Load the PTH model
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    teacher_net = torch.load('./output/ad_small/trainings/mvtec_ad/rain/teacher_final.pth', map_location=device)
    student_net = torch.load('./output/ad_small/trainings/mvtec_ad/rain/student_final.pth', map_location=device)
    ae_net = torch.load('./output/ad_small/trainings/mvtec_ad/rain/autoencoder_final.pth', map_location=device)

    # Construct the input data
    fake_img_tensor = torch.rand((1, 3, 256, 256))

    output_channels_num = 384
    # Model prediction
    teacher_mean_tensor = torch.rand((1, output_channels_num, 1, 1))
    teacher_std_tensor = torch.rand((1, output_channels_num, 1, 1))

    time_range = 100
    time_cost_list = []
    for i in tqdm(range(time_range)):
        s1 = time.time()
        pth_predict(fake_img_tensor, teacher_net, student_net, ae_net, teacher_mean_tensor, teacher_std_tensor,
                    output_channels_num,
                    q_st_start=None, q_st_end=None, q_ae_start=None, q_ae_end=None)
        s2 = time.time()
        time_cost_list.append(s2 - s1)
    print(f'average time cost:{np.mean(time_cost_list):.6f}s')
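
As a quick sanity check of where the computation actually runs, one can print the device of the model weights and of the input tensor (a small addition to the script above):

# Quick check: which device do the model weights and the input tensor live on?
print(next(teacher_net.parameters()).device)
print(fake_img_tensor.device)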

This is the test code for inference with the ONNX model:

import time
import numpy as np
import onnxruntime
from tqdm import tqdm

def onnx_predict(img_arr, teacher_session, student_session, ae_session, teacher_mean, teacher_std, out_channels,
                 q_st_start=None, q_st_end=None, q_ae_start=None, q_ae_end=None):
    ort_inputs1 = {teacher_session.get_inputs()[0].name: img_arr}
    teacher_output = teacher_session.run(None, ort_inputs1)
    teacher_output = teacher_output[0]
    teacher_output = (teacher_output - teacher_mean) / teacher_std

    ort_inputs2 = {student_session.get_inputs()[0].name: img_arr}
    student_output = student_session.run(None, ort_inputs2)
    student_output = student_output[0]

    ort_inputs3 = {ae_session.get_inputs()[0].name: img_arr}
    autoencoder_output = ae_session.run(None, ort_inputs3)
    autoencoder_output = autoencoder_output[0]

    map_st = np.mean((teacher_output - student_output[:, :out_channels]) ** 2, axis=1)
    map_ae = np.mean((autoencoder_output -
                      student_output[:, out_channels:]) ** 2, axis=1)
    if q_st_start is not None:
        map_st = 0.1 * (map_st - q_st_start) / (q_st_end - q_st_start)
    if q_ae_start is not None:
        map_ae = 0.1 * (map_ae - q_ae_start) / (q_ae_end - q_ae_start)
    map_combined = 0.5 * map_st + 0.5 * map_ae

    return map_combined, map_st, map_ae

if __name__ == '__main__':
    # Load the ONNX model
    teacher_ort_session = onnxruntime.InferenceSession('./output/onnx_path/teacher.onnx')
    student_ort_session = onnxruntime.InferenceSession('./output/onnx_path/student.onnx')
    ae_ort_session = onnxruntime.InferenceSession('./output/onnx_path/autoencoder.onnx')

    # Construct the input data
    fake_img_arr = np.random.rand(1, 3, 256, 256)
    fake_img_arr = fake_img_arr.astype(np.float32)
    output_channels_num = 384
    # Model prediction
    teacher_mean_arr = np.random.rand(1, output_channels_num, 1, 1)
    teacher_std_arr = np.random.rand(1, output_channels_num, 1, 1)

    time_range = 100
    time_cost_list = []
    for i in tqdm(range(time_range)):
        s1 = time.time()
        onnx_predict(fake_img_arr, teacher_ort_session, student_ort_session, ae_ort_session,
                     teacher_mean=teacher_mean_arr,
                     teacher_std=teacher_std_arr, out_channels=output_channels_num,
                     q_st_start=None, q_st_end=None, q_ae_start=None, q_ae_end=None)
        s2 = time.time()
        time_cost_list.append(s2 - s1)
    print(f'average time cost:{np.mean(time_cost_list):.6f}s')
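
Note that onnxruntime.InferenceSession falls back to the CPU execution provider unless a provider is requested explicitly. With the onnxruntime-gpu package installed, the sessions could be created roughly like this (a sketch, not verified against this repo):

providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
teacher_ort_session = onnxruntime.InferenceSession('./output/onnx_path/teacher.onnx', providers=providers)
student_ort_session = onnxruntime.InferenceSession('./output/onnx_path/student.onnx', providers=providers)
ae_ort_session = onnxruntime.InferenceSession('./output/onnx_path/autoencoder.onnx', providers=providers)
# Confirm which provider was actually selected
print(teacher_ort_session.get_providers())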

This is the function for converting the PTH models to ONNX models:

def convert_to_onnx_with_dynamic_img_shape(model, input_size, onnx_path):
    model.eval()

    dummy_input = torch.randn(1, *input_size, requires_grad=True)

    torch.onnx.export(model, 
                      dummy_input, 
                      onnx_path,  
                      export_params=True,  
                      opset_version=11,  
                      do_constant_folding=True,  
                      input_names=['modelInput'],  
                      output_names=['modelOutput'],  
                      dynamic_axes={'modelInput': {0: 'batch_size', 2: 'img_height', 3: 'img_width'},
                                    'modelOutput': {0: 'batch_size', 2: 'img_height', 3: 'img_width'}})
    print('Model has been converted to ONNX')

if __name__ == '__main__':
    out_channels = 384

    # teacher = get_pdn_small(out_channels)
    # student = get_pdn_small(2 * out_channels)
    # autoencoder = get_autoencoder(out_channels)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    teacher_model = torch.load('./output/ad_small/trainings/mvtec_ad/rain/teacher_final.pth',
                               map_location=device)
    student_model = torch.load('./output/ad_small/trainings/mvtec_ad/rain/student_final.pth',
                               map_location=device)
    autoencoder_model = torch.load('./output/ad_small/trainings/mvtec_ad/rain/autoencoder_final.pth',
                                   map_location=device)

    teacher_onnx_path = './output/onnx_path/teacher.onnx'
    convert_to_onnx_with_dynamic_img_shape(teacher_model, input_size=(3, 256, 256), onnx_path=teacher_onnx_path)

    student_onnx_path = './output/onnx_path/student.onnx'
    convert_to_onnx_with_dynamic_img_shape(student_model, input_size=(3, 256, 256), onnx_path=student_onnx_path)

    autoencoder_onnx_path = './output/onnx_path/autoencoder.onnx'
    convert_to_onnx_with_dynamic_img_shape(autoencoder_model, input_size=(3, 256, 256), onnx_path=autoencoder_onnx_path)
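
After exporting, it can be worth sanity-checking the ONNX files and comparing their outputs against the PyTorch models. A minimal sketch, reusing the paths and input names from above and assuming the onnx package is installed:

import numpy as np
import onnx
import onnxruntime
import torch

# Structural validity check of the exported graph
onnx.checker.check_model(onnx.load(teacher_onnx_path))

# Compare ONNX and PyTorch outputs on the same dummy input
dummy = torch.randn(1, 3, 256, 256)
session = onnxruntime.InferenceSession(teacher_onnx_path)
onnx_out = session.run(None, {'modelInput': dummy.numpy()})[0]
with torch.no_grad():
    torch_out = teacher_model.cpu().eval()(dummy).numpy()
np.testing.assert_allclose(torch_out, onnx_out, rtol=1e-3, atol=1e-5)
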
kywish commented 1 year ago

My test result is 30 ms (image preprocess + model inference + resize to original size), and onnxruntime took 32 ms. Environment: notebook with i9-12900H, RTX 2050 4G.

kindle-the-life commented 1 year ago

Thank you, your message has been received; I will reply to you as soon as possible. Zhu Lei

hengyanchen commented 10 months ago

@kindle-the-life I found that your code only runs on the CPU, because the input data you created was not transferred to the GPU. I hope this helps you.
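
A minimal sketch of that fix, reusing pth_predict and the variables from the original script and assuming the .pth files contain full nn.Module objects, would be to move the models and all input tensors to the same device and synchronize before timing:

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Move the models to the GPU (if available) and switch to inference mode
teacher_net = teacher_net.to(device).eval()
student_net = student_net.to(device).eval()
ae_net = ae_net.to(device).eval()

# The input and normalization tensors must live on the same device as the models
fake_img_tensor = fake_img_tensor.to(device)
teacher_mean_tensor = teacher_mean_tensor.to(device)
teacher_std_tensor = teacher_std_tensor.to(device)

with torch.no_grad():  # no gradients are needed for inference
    for i in range(100):
        if device.type == 'cuda':
            torch.cuda.synchronize()  # CUDA kernels run asynchronously
        s1 = time.time()
        pth_predict(fake_img_tensor, teacher_net, student_net, ae_net,
                    teacher_mean_tensor, teacher_std_tensor, output_channels_num)
        if device.type == 'cuda':
            torch.cuda.synchronize()
        s2 = time.time()
        time_cost_list.append(s2 - s1)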

kindle-the-life commented 10 months ago

Thank you, your message has been received; I will reply to you as soon as possible. Zhu Lei