aws-neuron / aws-neuron-sdk

Powering AWS purpose-built machine learning chips. Blazing fast and cost effective, natively integrated into PyTorch and TensorFlow and integrated with your favorite AWS services
https://aws.amazon.com/machine-learning/neuron/

Dynamic batching in inference doesn't work when embedding layers are included and input is two tensors #862

Closed fabiozappo closed 4 months ago

fabiozappo commented 5 months ago
import torch
import torch_neuron
from transformers import CLIPModel

# Torch wrapper to isolate the text tower
class ClipTextEncoder(torch.nn.Module):
    def __init__(self, model):
        super(ClipTextEncoder, self).__init__()
        self.model = model

    def forward(self, x: torch.Tensor, y: torch.Tensor):
        x = self.model.get_text_features(input_ids=x, attention_mask=y)
        return x

def export_to_neuron(dummy_input, model):
    if not isinstance(dummy_input, list):
        dummy_input = [dummy_input]

    torch.neuron.analyze_model(model, example_inputs=dummy_input)
    model_neuron = torch.neuron.trace(model, example_inputs=dummy_input, separate_weights=True, dynamic_batch_size=True, fallback=False)

    return model_neuron

if __name__ == "__main__":

    # Load from pretrained clip
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

    # Input information
    text_num_tokens = 77
    device = "cpu"

    # create our model to convert
    text_encoder = ClipTextEncoder(model)

    # Let's define inputs. The Torch model expects torch tensors as input
    dummy_text = torch.randint(low=0, high=512, size=(1, text_num_tokens)).to(device)
    dummy_mask = torch.randint(low=0, high=2, size=dummy_text.size()).to(device)

    # Converting the text encoder to Neuron
    neuron_text_encoder = export_to_neuron([dummy_text, dummy_mask], text_encoder)

    dummy_text = torch.randint(low=0, high=512, size=(1, text_num_tokens)).to(device)
    dummy_mask = torch.randint(low=0, high=2, size=dummy_text.size()).to(device)
    text_embeddings = neuron_text_encoder(dummy_text, dummy_mask)
    print(f"Computing text embeddings of shape {text_embeddings.shape[1]} with neuron, batch size {text_embeddings.shape[0]}")

    dummy_text = torch.randint(low=0, high=512, size=(2, text_num_tokens)).to(device)
    dummy_mask = torch.randint(low=0, high=2, size=dummy_text.size()).to(device)
    text_embeddings = neuron_text_encoder(dummy_text, dummy_mask)
    print(f"Computing text embeddings of shape {text_embeddings.shape[1]} with neuron, batch size {text_embeddings.shape[0]}")

Hello, I am the same person from https://github.com/aws-neuron/aws-neuron-sdk/issues/861. This time I am trying to export the text tower of a CLIP model to Neuron. This is a more particular case because the model receives two tensors as input (one is the input_ids and the other is an attention_mask). As suggested in https://github.com/aws-neuron/aws-neuron-sdk/issues/609 I've set dynamic_batch_size=True, fallback=False, but this raises the following error:

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
INFO:Neuron:PyTorch to TF conversion failed to resolve function on aten::index with inputs [<tf.Tensor 'CLIPTextTransformer_4/LayerNorm_117/aten_layer_norm/batchnorm/add_1:0' shape=(1, 77, 512) dtype=float32>, [<tf.Tensor 'CLIPTextTransformer_4/aten_arange_1/range:0' shape=(1,) dtype=int32>, <tf.Tensor 'CLIPTextTransformer_4/aten_argmax/ArgMax:0' shape=(1,) dtype=int32>]]
INFO:Neuron:Exception = Don't support multiple lists of indices in a row in the beginning
Traceback (most recent call last):
  File "dummy_test_text.py", line 43, in <module>
    neuron_text_encoder = export_to_neuron([dummy_text, dummy_mask], text_encoder)
  File "dummy_test_text.py", line 21, in export_to_neuron
    model_neuron = torch.neuron.trace(model, example_inputs=dummy_input, separate_weights=True, dynamic_batch_size=True, fallback=False)
  File "/home/ubuntu/my_repo/venv/lib/python3.8/site-packages/torch_neuron/convert.py", line 154, in trace
    runnable = cu.subgraph_compiler(converted, example_inputs, **compile_kwargs)
  File "/home/ubuntu/my_repo/venv/lib/python3.8/site-packages/torch_neuron/decorators.py", line 81, in trace
    transform_torch_graph_to_tensorflow(jit_trace, example_inputs, separate_weights=separate_weights, neuron_graph=func, **kwargs)
  File "/home/ubuntu/my_repo/venv/lib/python3.8/site-packages/torch_neuron/decorators.py", line 634, in transform_torch_graph_to_tensorflow
    raise e
  File "/home/ubuntu/my_repo/venv/lib/python3.8/site-packages/torch_neuron/decorators.py", line 628, in transform_torch_graph_to_tensorflow
    tensor_outputs = local_func(op, *tensor_inputs)
  File "/home/ubuntu/my_repo/venv/lib/python3.8/site-packages/torch_neuron/ops/aten.py", line 472, in index
    output = tf_index_multidim_gather_nd(tensor, index_lists)
  File "/home/ubuntu/my_repo/venv/lib/python3.8/site-packages/torch_neuron/ops/aten.py", line 392, in tf_index_multidim_gather_nd
    raise NotImplementedError(
NotImplementedError: Don't support multiple lists of indices in a row in the beginning

The compilation works fine if I change to dynamic_batch_size=False or fallback=True, but then an error is raised at inference time saying that the Embedding layer collides with dynamic_batch_size.
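
For reference, the flag combinations I mean look like the following minimal sketch (only the trace call changes with respect to the script above; everything else stays the same). Both variants compile, but inference then fails as described:

# Sketch of the two flag combinations mentioned above; both compile,
# but an error is then raised at inference time as described in the text.
model_neuron = torch.neuron.trace(
    text_encoder, example_inputs=[dummy_text, dummy_mask],
    separate_weights=True, dynamic_batch_size=False, fallback=False)
# or
model_neuron = torch.neuron.trace(
    text_encoder, example_inputs=[dummy_text, dummy_mask],
    separate_weights=True, dynamic_batch_size=True, fallback=True)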

Do you have any solution for this? I can't find anything similar in previous issues.

jluntamazon commented 5 months ago

Hi @fabiozappo,

This is an issue with the graph partitioner being unable to track how a scalar value should be partitioned. This occurs because certain operations which produce scalar values are partitioned to CPU. The dynamic batching logic then inspects every input tensor on dim=0 and only slices the batch up if all of the dimensions are equal in size. If a scalar is encountered, then it violates the dim=0 constraint and the dynamic batching fails.
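
As an illustration only (this is not the actual Neuron runtime code, just a sketch of the constraint described above), the dim=0 inspection behaves roughly like this:

import torch

# Illustration of the dim=0 constraint: dynamic batching can only slice inputs
# whose first dimension is a common batch dimension, so a scalar (0-d tensor)
# coming from a CPU-partitioned operation breaks the check.
def can_slice_batch(inputs):
    batch_sizes = []
    for tensor in inputs:
        if tensor.dim() == 0:        # a scalar has no dim=0 to slice on
            return False
        batch_sizes.append(tensor.size(0))
    return len(set(batch_sizes)) == 1

print(can_slice_batch([torch.zeros(4, 77), torch.zeros(4, 77)]))  # True
print(can_slice_batch([torch.zeros(4, 77), torch.tensor(1.0)]))   # False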

One option is to move the dynamic batching outside of the partitioned graph to avoid the partitioning issue. The dynamic batching logic inside the Neuron runtime is similar to the following application-level dynamic batching, but it has some optimizations that allow concurrent transfers of batch chunks from CPU to Neuron hardware. That optimization should be relatively minor, so TorchScript-level dynamic batching should still be performant and allow for dynamic batch sizes at inference.

import torch
import torch_neuron
from transformers import CLIPModel
import torch.nn.functional as F

# Torch wrapper to isolate the text tower
class ClipTextEncoder(torch.nn.Module):
    def __init__(self, model):
        super(ClipTextEncoder, self).__init__()
        self.model = model

    def forward(self, x: torch.Tensor, y: torch.Tensor):
        x = self.model.get_text_features(input_ids=x, attention_mask=y)
        return x

class DynamicBatcher(torch.nn.Module):

    def __init__(self, model, batch_size):
        super().__init__()
        self.model = model
        self.batch_size = batch_size

    def forward(self, x: torch.Tensor, y: torch.Tensor):
        results = list()
        # Process the inputs in chunks that match the traced (fixed) batch size,
        # padding the last chunk up to that size when it is smaller.
        for x_chunk, y_chunk in zip(x.split(self.batch_size), y.split(self.batch_size)):
            chunk_size = x_chunk.size(0)
            x_chunk = F.pad(x_chunk, (0, 0, 0, self.batch_size - chunk_size))
            y_chunk = F.pad(y_chunk, (0, 0, 0, self.batch_size - chunk_size))
            result = self.model(x_chunk, y_chunk)
            # Drop the padded rows before collecting the results
            result = result[:chunk_size]
            results.append(result)
        return torch.cat(results, dim=0)

def export_to_neuron(dummy_input, model, batch_size=1):
    if not isinstance(dummy_input, list):
        dummy_input = [dummy_input]

    torch.neuron.analyze_model(model, example_inputs=dummy_input)
    model_neuron = torch.neuron.trace(model, example_inputs=dummy_input, separate_weights=True, dynamic_batch_size=False, fallback=True)

    # Wrap model with dynamic batching logic and convert to torchscript
    model_neuron = DynamicBatcher(model_neuron, batch_size=batch_size)
    model_neuron = torch.jit.script(model_neuron)
    return model_neuron

if __name__ == "__main__":

    # Load from pretrained clip
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

    # Input information
    text_num_tokens = 77
    batch_size = 1
    device = "cpu"

    # create our model to convert
    text_encoder = ClipTextEncoder(model)

    # Let's define inputs. The Torch model expects torch tensors as input
    dummy_text = torch.randint(low=0, high=512, size=(batch_size, text_num_tokens)).to(device)
    dummy_mask = torch.randint(low=0, high=2, size=dummy_text.size()).to(device)

    # Converting the text encoder to Neuron
    neuron_text_encoder = export_to_neuron([dummy_text, dummy_mask], text_encoder, batch_size=batch_size)

    dummy_text = torch.randint(low=0, high=512, size=(1, text_num_tokens)).to(device)
    dummy_mask = torch.randint(low=0, high=2, size=dummy_text.size()).to(device)
    text_embeddings = neuron_text_encoder(dummy_text, dummy_mask)
    print(f"Computing text embeddings of shape {text_embeddings.shape[1]} with neuron, batch size {text_embeddings.shape[0]}")

    dummy_text = torch.randint(low=0, high=512, size=(2, text_num_tokens)).to(device)
    dummy_mask = torch.randint(low=0, high=2, size=dummy_text.size()).to(device)
    text_embeddings = neuron_text_encoder(dummy_text, dummy_mask)
    print(f"Computing text embeddings of shape {text_embeddings.shape[1]} with neuron, batch size {text_embeddings.shape[0]}")

It's not a perfectly generalized solution, but it should have very similar performance characteristics to the built-in dynamic batching.
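
As a purely hypothetical usage sketch (the batch size of 8 below is an arbitrary choice, not something fixed by the example above), you would typically trace at the largest per-chunk batch size you want and let the wrapper pad the remainder:

# Hypothetical usage: trace with a fixed batch size of 8; DynamicBatcher then
# splits any incoming batch into chunks of 8 and pads the final, smaller chunk.
dummy_text = torch.randint(low=0, high=512, size=(8, text_num_tokens))
dummy_mask = torch.randint(low=0, high=2, size=dummy_text.size())
neuron_text_encoder = export_to_neuron([dummy_text, dummy_mask], text_encoder, batch_size=8)

# Arbitrary batch sizes now work, e.g. 21 inputs run as chunks of 8, 8 and 5.
text = torch.randint(low=0, high=512, size=(21, text_num_tokens))
mask = torch.randint(low=0, high=2, size=text.size())
embeddings = neuron_text_encoder(text, mask)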

aws-taylor commented 4 months ago

Hello @fabiozappo,

We haven't heard from you in a while, so I'm going to go ahead and close this issue. Feel free to re-open if the proposed solution does not work for you.