apache / tvm

Open deep learning compiler stack for cpu, gpu and specialized accelerators
https://tvm.apache.org/
Apache License 2.0

[Bug] Trying to import BERT Transformer onto TVM #17480

Open vignes-12 opened 1 month ago

vignes-12 commented 1 month ago

Expected behavior

I'm trying to import a BERT Transformer model into TVM. relay.frontend.from_pytorch should convert the traced model into a Relay module and return mod and params without errors.

Actual behavior

Instead, the conversion fails with a TVMError (full traceback below). Line of code causing the issue:

mod, params = relay.frontend.from_pytorch(script_module, input_infos)

Environment

Linux OS, TVM version 0.19.dev (built from source), PyTorch version 2.4.0+cu121, Transformers version 4.39.3, CUDA version 12.4
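
To confirm the versions above, I print them from the same Python environment with a quick check like this (torch.version.cuda reports the CUDA version PyTorch was built against, not the 12.4 system toolkit):

import torch
import transformers
import tvm

print("TVM:", tvm.__version__)                      # 0.19.dev, built from source
print("PyTorch:", torch.__version__)                # 2.4.0+cu121
print("Transformers:", transformers.__version__)    # 4.39.3
print("CUDA (PyTorch build):", torch.version.cuda)  # 12.1; system CUDA toolkit is 12.4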

Steps to reproduce

Here's the code I'm using, adapted from an example I found online:

from transformers import BertModel, BertTokenizer, BertConfig
import torch
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor

###############################
# Change your config here
n_trials = 2000             # number of autotuning trials; higher is better
n_early_stopping = 600      # stop tuning a task after this many trials without improvement; higher is better
set_seqlen_myself = False   # if True, the model uses the seq_len set below
seq_len = 512               # only takes effect when set_seqlen_myself = True
target = tvm.target.cuda()
##############################

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
device = torch.device("cuda")

# Tokenizing input text
if set_seqlen_myself:
    input_ids = list(np.random.randint(0, 25000, seq_len))
    input_ids[0] = 102
    input_ids[-1] = 103
    atten_mask = list(np.ones(seq_len, dtype=int))
    token_type_ids = list(np.zeros(seq_len, dtype=int))
else:
    sentence_a = "Who was Jim Henson ?"
    sentence_b = "Jim Henson was a puppeteer."
    tokenized_text = tokenizer(sentence_a, sentence_b, padding='max_length')  # pads to the model's 512-token max length
    input_ids = tokenized_text['input_ids']
    atten_mask = tokenized_text['attention_mask']
    token_type_ids = tokenized_text['token_type_ids']

seq_len = len(input_ids)

# Creating a dummy input
input_ids_tensor = torch.tensor([input_ids])
atten_mask_tensors = torch.tensor([atten_mask])
token_type_ids_tensors = torch.tensor([token_type_ids])

dummy_input = [input_ids_tensor, atten_mask_tensors, token_type_ids_tensors]

# Instantiate the pretrained model with the torchscript flag set
# (set to True even though it is not strictly necessary, as this model does not have an LM head)
model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)

# The model needs to be in evaluation mode before tracing
model.eval()

# Creating the trace
traced_model = torch.jit.trace(model, dummy_input)
traced_model.eval()
# torch.jit.save(traced_model, "traced_bert.pt")

script_module = traced_model
input_infos = [
    ("input_ids", ((1, seq_len), "int")),
    ("attention_mask", ((1, seq_len), "int")),
    ("token_type_ids", ((1, seq_len), "int")),
]
mod, params = relay.frontend.from_pytorch(script_module, input_infos)

#######################################
# compile on cuda
print("############################")
print("Deploy on CUDA, build the relay.")
target = tvm.target.cuda()

with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target, params=params)

dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))

######################################
# TVM runtime 
print("#############################")
print("TVM runtime")
dtype = "float32"

input_ids = np.array(input_ids)
input_ids = input_ids[np.newaxis,...]
atten_mask = np.array(atten_mask)
atten_mask = atten_mask[np.newaxis,...]
token_type_ids = np.array(token_type_ids)
token_type_ids = token_type_ids[np.newaxis,...]

module.set_input("input_ids", input_ids)
module.set_input("attention_mask", atten_mask)
module.set_input("token_type_ids", token_type_ids)

module.run()
output_shape1 = (1, seq_len, 768)
output_shape2 = (1, 768)
tvm_output1 = module.get_output(0, tvm.nd.empty(output_shape1)).numpy()
tvm_output2 = module.get_output(1, tvm.nd.empty(output_shape2)).numpy()

######################################
# Collect Basic Performance Data
# ------------------------------
# We want to collect some basic performance data associated with this
# unoptimized model and compare it to a tuned model later. 
import timeit
print("############################")
print("measure unoptimized performance in ms")

timing_number = 10
timing_repeat = 10
unoptimized = (
    np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
    * 1000
    / timing_number
)
unoptimized = {
    "mean": np.mean(unoptimized),
    "median": np.median(unoptimized),
    "std": np.std(unoptimized),
}

print(unoptimized)

######################################
# Tune the model
# In the simplest form, tuning requires three things:
#
# - the target specification of the device you intend to run this model on
# - the path to an output file in which the tuning records will be stored
# - a path to the model to be tuned.
#

import tvm.auto_scheduler as auto_scheduler
from tvm.autotvm.tuner import XGBTuner
from tvm import autotvm

print("##############################")
print("Auto Tuning cuda")

target = tvm.target.cuda()

number = 4
repeat = 3
min_repeat_ms = 150  # min_repeat_ms dynamically adjusts `number`, so it is recommended; 150 ms is a typical value for NVIDIA GPUs.
timeout = 4  # in seconds

# create a TVM runner
runner = autotvm.LocalRunner(
    number=number,
    repeat=repeat,
    timeout=timeout,
    min_repeat_ms=min_repeat_ms,
)

tuning_option = {
    "tuner": "xgb",
    "trials": n_trails,
    "early_stopping": n_early_stopping,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(build_func="default"), runner=runner
    ),
    "tuning_records": "bert-cuda-autotuning.json",
}

tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)

# Tune the extracted tasks sequentially.
for i, task in enumerate(tasks):
    prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
    tuner_obj = XGBTuner(task, loss_type="rank")
    tuner_obj.tune(
        n_trial=min(tuning_option["trials"], len(task.config_space)),
        early_stopping=tuning_option["early_stopping"],
        measure_option=tuning_option["measure_option"],
        callbacks=[
            autotvm.callback.progress_bar(tuning_option["trials"], prefix=prefix),
            autotvm.callback.log_to_file(tuning_option["tuning_records"]),
        ],
    )

#######################################
# Compiling an Optimized Model with Tuning Data
# ----------------------------------------------
#
# As an output of the tuning process above, we obtained the tuning records
# stored in json. The compiler will use the results to
# generate high performance code for the model on your specified target.
#
# Now that tuning data for the model has been collected, we can re-compile the
# model using optimized operators to speed up our computations.

with autotvm.apply_history_best(tuning_option["tuning_records"]):
    with tvm.transform.PassContext(opt_level=3, config={}):
        lib = relay.build(mod, target=target, params=params)

dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))

#####################################
# Comparing the Tuned and Untuned Models
print("###############################")
print("measure optimized performance in ms.")

timing_number = 10
timing_repeat = 10
optimized = (
    np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number))
    * 1000
    / timing_number
)
optimized = {"mean": np.mean(optimized), "median": np.median(optimized), "std": np.std(optimized)}

print("optimized: %s" % (optimized))
print("unoptimized: %s" % (unoptimized))

with open("result.txt","a") as f:
    f.write("bert cuda:\n")
    f.write("optimized: %s\n" % (optimized))
    f.write("unoptimized: %s\n" % (unoptimized))

And here is the error message I'm receiving:

/home/okancha/csc-591-genai/tvm/python/tvm/target/target.py:446: UserWarning: Try specifying cuda arch by adding 'arch=sm_xx' to your target.
  warnings.warn("Try specifying cuda arch by adding 'arch=sm_xx' to your target.")
[18:54:33] /home/okancha/csc-591-genai/tvm/src/target/target_kind.cc:171: Warning: Unable to detect CUDA version, default to "-arch=sm_50" instead
/home/okancha/.local/lib/python3.10/site-packages/transformers/modeling_utils.py:4225: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead
  warnings.warn(
Traceback (most recent call last):
  File "/home/okancha/csc-591-genai/tvm_bert.py", line 67, in <module>
    mod, params = relay.frontend.from_pytorch(script_module, input_infos)
  File "/home/okancha/csc-591-genai/tvm/python/tvm/relay/frontend/pytorch.py", line 5436, in from_pytorch
    outputs = converter.convert_operators(operator_nodes, outputs, ret_name)
  File "/home/okancha/csc-591-genai/tvm/python/tvm/relay/frontend/pytorch.py", line 4576, in convert_operators
    relay_out = relay_op(
  File "/home/okancha/csc-591-genai/tvm/python/tvm/relay/frontend/pytorch.py", line 1819, in linear
    mm_out = self.matmul(
  File "/home/okancha/csc-591-genai/tvm/python/tvm/relay/frontend/pytorch.py", line 2048, in matmul
    a = _op.broadcast_to(a, batch_shape + list(a_shape[-2:]))
  File "/home/okancha/csc-591-genai/tvm/python/tvm/relay/op/transform.py", line 868, in broadcast_to
    return _make.broadcast_to(data, shape)
  File "/home/okancha/csc-591-genai/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 245, in __call__
    raise_last_ffi_error()
  File "/home/okancha/csc-591-genai/tvm/python/tvm/_ffi/base.py", line 481, in raise_last_ffi_error
    raise py_err
tvm._ffi.base.TVMError: Traceback (most recent call last):
  2: _ZN3tvm7runtime13PackedFuncObj
  1: tvm::runtime::TypedPackedFunc<tvm::RelayExpr (tvm::RelayExpr, tvm::runtime::Array<tvm::Integer, void>)>::AssignTypedLambda<tvm::RelayExpr (*)(tvm::RelayExpr, tvm::runtime::Array<tvm::Integer, void>)>(tvm::RelayExpr (*)(tvm::RelayExpr, tvm::runtime::Array<tvm::Integer, void>), std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}::operator()(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*) const
  0: tvm::runtime::TVMMovableArgValueWithContext_::operator tvm::runtime::Array<tvm::Integer, void><tvm::runtime::Array<tvm::Integer, void> >() const
  5: _ZN3tvm7runtime13PackedFuncObj
  4: tvm::runtime::TypedPackedFunc<tvm::RelayExpr (tvm::RelayExpr, tvm::runtime::Array<tvm::Integer, void>)>::AssignTypedLambda<tvm::RelayExpr (*)(tvm::RelayExpr, tvm::runtime::Array<tvm::Integer, void>)>(tvm::RelayExpr (*)(tvm::RelayExpr, tvm::runtime::Array<tvm::Integer, void>), std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}::operator()(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*) const
  3: tvm::runtime::TVMMovableArgValueWithContext_::operator tvm::runtime::Array<tvm::Integer, void><tvm::runtime::Array<tvm::Integer, void> >() const
  2: tvm::runtime::ObjectPtr<tvm::runtime::Object> tvm::runtime::Array<tvm::runtime::ObjectRef, void>::MapHelper<tvm::runtime::PackedFuncValueConverter<tvm::runtime::Array<tvm::Integer, void> >::From(tvm::runtime::TVMArgValue const&)::{lambda(tvm::runtime::ObjectRef)#1}, tvm::Integer>(tvm::runtime::ObjectPtr<tvm::runtime::Object>, tvm::runtime::PackedFuncValueConverter<tvm::runtime::Array<tvm::Integer, void> >::From(tvm::runtime::TVMArgValue const&)::{lambda(tvm::runtime::ObjectRef)#1})
  1: tvm::Integer tvm::runtime::PackedFuncValueConverter<tvm::Integer>::From<tvm::runtime::TVMArgValue>(tvm::runtime::TVMArgValue const&)
  0: tvm::Integer tvm::runtime::TVMPODValue_CRTP_<tvm::runtime::TVMArgValue>::AsObjectRef<tvm::Integer>() const
  File "/home/okancha/csc-591-genai/tvm/include/tvm/runtime/packed_func.h", line 924
TVMError: In function relay.op._make.broadcast_to(0: RelayExpr, 1: Array<IntImm>) -> RelayExpr: error while converting argument 1: [18:54:36] /home/okancha/csc-591-genai/tvm/include/tvm/runtime/packed_func.h:2282: InternalError: Check failed: (!checked_type.defined()) is false: Expected IntImm, but got tir.Any
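
For convenience, the failure does not depend on any of the tuning code; this trimmed-down sketch (just the pieces of the script above, stopping at the failing call) should hit the same error:

from transformers import BertModel, BertTokenizer
import torch
from tvm import relay

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer("Who was Jim Henson ?", "Jim Henson was a puppeteer.", padding='max_length')
seq_len = len(tokens['input_ids'])

dummy_input = [
    torch.tensor([tokens['input_ids']]),
    torch.tensor([tokens['attention_mask']]),
    torch.tensor([tokens['token_type_ids']]),
]

model = BertModel.from_pretrained("bert-base-uncased", torchscript=True).eval()
traced_model = torch.jit.trace(model, dummy_input)

input_infos = [
    ("input_ids", ((1, seq_len), "int")),
    ("attention_mask", ((1, seq_len), "int")),
    ("token_type_ids", ((1, seq_len), "int")),
]
# Raises: InternalError: Check failed: (!checked_type.defined()) is false: Expected IntImm, but got tir.Any
mod, params = relay.frontend.from_pytorch(traced_model, input_infos)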

Any help would be greatly appreciated. Thanks!

Triage

Please refer to the list of label tags here to find the relevant tags and add them below in a bullet format (example below).

I think that's the right tag, but I'm not sure.