aws / sagemaker-python-sdk

A library for training and deploying machine learning models on Amazon SageMaker
https://sagemaker.readthedocs.io/
Apache License 2.0

DJLModel inference error - Allocation larger than expected: tag 'qkv' #4020

Closed · yapweiyih closed this 1 year ago

yapweiyih commented 1 year ago

Describe the bug
Invoking a DJLModel endpoint hosting EleutherAI/gpt-j-6b (DeepSpeed engine, fp16, 4 partitions) fails with a 424 ModelError, "Allocation larger than expected: tag 'qkv'", whenever the input prompt is longer than 1024 tokens, even though the request sets max_length=2048.

To reproduce

Deploy the model first:

from sagemaker.djl_inference import DJLModel

djl_model = DJLModel(
    "EleutherAI/gpt-j-6b",      # Hugging Face Hub model ID
    "my_sagemaker_role",        # IAM role used by the endpoint
    dtype="fp16",
    task="text-generation",
    number_of_partitions=4,     # tensor-parallel degree (4 GPUs on g4dn.12xlarge)
)
predictor = djl_model.deploy(
    "ml.g4dn.12xlarge",
    initial_instance_count=1,
)
endpoint_name = predictor.endpoint_name  # used by the client code below

1) When the input is fewer than 1024 tokens (OK)

from sagemaker.huggingface.model import HuggingFacePredictor
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b")

def get_token_length(text, tokenizer):
    """Count the tokens in a given text."""
    inputs = tokenizer(text)
    return len(inputs.input_ids)

def call_endpoint(text, endpoint_name, parameters):
    """Invoke the SageMaker endpoint and return the generated text."""
    llm = HuggingFacePredictor(endpoint_name=endpoint_name)
    payload = {"inputs": text, "parameters": parameters}
    response = llm.predict(payload)
    return response[0]["generated_text"]

parameters = {
    "max_new_tokens": 300,
    "max_length": 2048,     # total (prompt + generated) length requested
    "do_sample": False,
    "temperature": 0.1,
    "top_p": 0.1,
    "pad_token_id": 50256,  # GPT-J's eos token id, reused for padding
    "use_cache": True,
}

text1 = """This is a test""" * 200
print("input tokens", get_token_length(text1, tokenizer))
temp = call_endpoint(text1, endpoint_name, parameters)
print(temp)
print("input tokens", get_token_length(temp, tokenizer))

Output:

input tokens 800
output text...
input tokens 1025

2) When the input is more than 1024 tokens (ERROR)

text1 = """This is a test""" * 300
print("input tokens", get_token_length(text1, tokenizer))
temp = call_endpoint(text1, endpoint_name, parameters)
print(temp)
print("input tokens", get_token_length(temp, tokenizer))

Output:

input tokens 1200

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (424) from primary with message "{
  "code":424,
  "message":"prediction failure",
  "error":"Allocation larger than expected: tag 'qkv', requested size: 7372800, expected max size:  '6291456'"
}"

CloudWatch:

INFO PyProcess [1,0]<stdout>:DeepSpeed inference failed
INFO PyProcess [1,0]<stdout>:Traceback (most recent call last):
INFO PyProcess [1,0]<stdout>:  File "/tmp/.djl.ai/python/0.22.1/djl_python/deepspeed.py", line 300, in inference
INFO PyProcess [1,0]<stdout>:    output_tokens = self.model.generate(
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/deepspeed/inference/engine.py", line 645, in _generate
INFO PyProcess [1,0]<stdout>:    return self.module.generate(*inputs, **kwargs)
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
INFO PyProcess [1,0]<stdout>:    return func(*args, **kwargs)
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/transformers/generation/utils.py", line 1515, in generate
INFO PyProcess [1,0]<stdout>:    return self.greedy_search(
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/transformers/generation/utils.py", line 2332, in greedy_search
INFO PyProcess [1,0]<stdout>:    outputs = self(
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
INFO PyProcess [1,0]<stdout>:    return forward_call(*args, **kwargs)
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/transformers/models/gptj/modeling_gptj.py", line 853, in forward
INFO PyProcess [1,0]<stdout>:    transformer_outputs = self.transformer(
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
INFO PyProcess [1,0]<stdout>:    return forward_call(*args, **kwargs)
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/transformers/models/gptj/modeling_gptj.py", line 688, in forward
INFO PyProcess [1,0]<stdout>:    outputs = block(
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
INFO PyProcess [1,0]<stdout>:    return forward_call(*args, **kwargs)
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/deepspeed/model_implementations/transformers/ds_transformer.py", line 268, in forward
INFO PyProcess [1,0]<stdout>:    output, presents = self.compute_transformer_block(input, input_mask, head_mask, layer_past, get_present,
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/deepspeed/model_implementations/transformers/ds_transformer.py", line 100, in compute_transformer_block
INFO PyProcess [1,0]<stdout>:    self.attention(input,
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
INFO PyProcess [1,0]<stdout>:    return forward_call(*args, **kwargs)
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/deepspeed/ops/transformer/inference/ds_attention.py", line 153, in forward
INFO PyProcess [1,0]<stdout>:    qkv_out = self.qkv_func(input=input,
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
INFO PyProcess [1,0]<stdout>:    return forward_call(*args, **kwargs)
INFO PyProcess [1,0]<stdout>:  File "/usr/local/lib/python3.9/dist-packages/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py", line 41, in forward
INFO PyProcess [1,0]<stdout>:    output, norm = self.qkv_gemm_func(input, weight, q_scale, bias, gamma, beta, self.config.epsilon,
INFO PyProcess [1,0]<stdout>:RuntimeError: Allocation larger than expected: tag 'qkv', requested size: 7372800, expected max size:  '6291456'
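The two byte counts in the error line up exactly with token counts. A rough sketch of the arithmetic (my assumptions: GPT-J's hidden size of 4096, a fused 3x-hidden QKV projection, fp16 at 2 bytes per element, and the 4-way partitioning used above), so each token needs 3 * 4096 / 4 * 2 = 6144 bytes of qkv buffer per partition:

# Hypothetical back-of-the-envelope check of the sizes in the error message.
# Assumptions: hidden size 4096 (GPT-J-6B), fused QKV = 3 * hidden,
# fp16 = 2 bytes/element, tensor parallelism over 4 partitions.
bytes_per_token = 3 * 4096 * 2 // 4   # 6144
print(7372800 // bytes_per_token)     # 1200 -> the failing 1200-token prompt
print(6291456 // bytes_per_token)     # 1024 -> the buffer's token capacity

So the pre-allocated qkv buffer holds exactly 1024 tokens, which matches DeepSpeed's default token budget (max_out_tokens is 1024 unless overridden) rather than the max_length=2048 passed at request time. This would also explain why the first run succeeded despite producing 1025 total tokens: the cap seems to bite on the prompt (prefill) allocation, and the 800-token prompt fit.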

Expected behavior
The endpoint should honor the requested max_length=2048 instead of failing at an implicit 1024-token limit.

Screenshots or logs
See the CloudWatch traceback above.

System information
- sagemaker.djl_inference.DJLModel with the DeepSpeed engine (DJL container 0.22.1, per the traceback paths)
- Endpoint: ml.g4dn.12xlarge, initial_instance_count=1, number_of_partitions=4, dtype=fp16

yapweiyih commented 1 year ago

I found the cause: you need to set the max_tokens parameter on DJLModel. It appears to default to 1024, which sizes the DeepSpeed inference buffers, so prompts longer than that fail regardless of the max_length sent with the request.

djl_model = DJLModel(
    "EleutherAI/gpt-j-6b",
    "my_sagemaker_role",
    dtype="fp16",
    task="text-generation",
    number_of_partitions=4,
    max_tokens=2048,  # total token budget (prompt + generated) for the inference engine
)
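
With that in place the earlier 1200-token request should fit. A quick re-test sketch, reusing the helpers defined above (this assumes the endpoint is redeployed with the new configuration first):

predictor = djl_model.deploy("ml.g4dn.12xlarge", initial_instance_count=1)
endpoint_name = predictor.endpoint_name

# The 1200-token input that previously triggered the 'qkv' allocation error
# should now fit within the 2048-token budget.
text1 = """This is a test""" * 300
print("input tokens", get_token_length(text1, tokenizer))
print(call_endpoint(text1, endpoint_name, parameters))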