Hello. I'm trying to convert a PyTorch model to a stateful Core ML model.
I wrote this code with reference to the Mistral-7B example from the WWDC 2024 session.
The Core ML file is created after the run, but a "Failed to build the model execution plan using a model architecture file" error appears when the Core ML class is initialized.
Stack Trace
/opt/homebrew/lib/python3.11/site-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead
warnings.warn(
/opt/homebrew/lib/python3.11/site-packages/torch/jit/_trace.py:1116: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error:
Tensor-likes are not close!
Mismatched elements: 12 / 90000 (0.0%)
Greatest absolute difference: 1.6361474990844727e-05 at index (0, 11, 1251) (up to 1e-05 allowed)
Greatest relative difference: 0.000991315116234805 at index (0, 12, 1660) (up to 1e-05 allowed)
_check_trace(
Torch var valueCache is added again.
Torch var keyCache is added again.
Converting PyTorch Frontend ==> MIL Ops: 100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 1516/1516 [00:00<00:00, 2464.91 ops/s]
Running MIL frontend_pytorch pipeline: 100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 5/5 [00:00<00:00, 27.54 passes/s]
Running MIL default pipeline: 60%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 52/86 [00:02<00:01, 27.17 passes/s]/opt/homebrew/lib/python3.11/site-packages/coremltools/converters/mil/mil/ops/defs/iOS15/elementwise_unary.py:894: RuntimeWarning: overflow encountered in cast
return input_var.val.astype(dtype=string_to_nptype(dtype_val))
Running MIL default pipeline: 100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 86/86 [00:06<00:00, 13.08 passes/s]
Running MIL backend_mlprogram pipeline: 100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 12/12 [00:00<00:00, 64.39 passes/s]
/opt/homebrew/lib/python3.11/site-packages/coremltools/models/model.py:441: RuntimeWarning: You will not be able to run predict() on this Core ML model. Underlying exception message was: {
NSLocalizedDescription = "Failed to build the model execution plan using a model architecture file '/private/var/folders/pz/rmstwmls5ls_0hrn5_jj01kh0000gn/T/tmplybl8sp_.mlmodelc/model.mil' with error code: 14.";
}
_warnings.warn(
Model successfully converted and saved as: zenz_v1_cached.mlpackage
To Reproduce
import torch
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel, GPT2Attention, GPT2_ATTENTION_CLASSES
from transformers import AutoTokenizer
import coremltools as ct
from typing import Optional, Tuple
import numpy as np
from transformers.cache_utils import Cache
import os
# Set TOKENIZERS_PARALLELISM to "false" in this process's environment before
# the tokenizer is loaded further down.
# NOTE(review): presumably this silences the Hugging Face tokenizers
# parallelism/fork warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
class SliceUpdateKeyValueCache(Cache):
    """Pre-allocated key/value cache that is written via slice assignment.

    Two zero-filled tensors of identical shape are allocated up front; the
    original author documents that shape as
    (#layers, batch_size, #kv_heads, context_size, head_dim).
    """

    def __init__(self, shape: Tuple[int, ...], device="cpu", dtype=torch.float32) -> None:
        """Allocate the zero-initialised key and value tensors.

        Args:
            shape: Full shape used for both cache tensors.
            device: Device passed to ``torch.zeros``.
            dtype: Element type passed to ``torch.zeros``.
        """
        super().__init__()
        self.past_seen_tokens: int = 0
        alloc_options = {"dtype": dtype, "device": device}
        self.k_cache: torch.Tensor = torch.zeros(shape, **alloc_options)
        self.v_cache: torch.Tensor = torch.zeros(shape, **alloc_options)

    def update(
        self,
        k_state: torch.Tensor,
        v_state: torch.Tensor,
        layer_idx: int,
        slice_indices: torch.LongTensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Write new key/value states into the half-open slice [start, end).

        Args:
            k_state: Key tensor stored at ``layer_idx``.
            v_state: Value tensor stored at ``layer_idx``.
            layer_idx: Index into the first (layer) dimension of the cache.
            slice_indices: Two-element sequence ``(start, end)``.

        Returns:
            The key and value cache of ``layer_idx`` truncated to ``end`` along
            the context dimension.

        Raises:
            ValueError: If ``slice_indices`` does not hold exactly two items.
        """
        if len(slice_indices) != 2:
            raise ValueError(f"Expect tuple of integers [start, end), got {slice_indices=}.")
        start, stop = slice_indices

        # Only the leading heads covered by the incoming state are overwritten.
        key_heads = k_state.shape[1]
        value_heads = v_state.shape[1]
        self.k_cache[layer_idx, :, :key_heads, start:stop, :] = k_state
        self.v_cache[layer_idx, :, :value_heads, start:stop, :] = v_state

        visible_keys = self.k_cache[layer_idx, :, :, :stop, :]
        visible_values = self.v_cache[layer_idx, :, :, :stop, :]
        return visible_keys, visible_values

    def get_seq_length(self, _: int = 0) -> int:
        """Return the number of tokens recorded in ``past_seen_tokens``."""
        return self.past_seen_tokens

    def to_past_key_values(self):
        """Return one ``(key, value)`` tuple per layer, indexed from the cache tensors."""
        per_layer = []
        for layer in range(self.k_cache.size(0)):
            per_layer.append((self.k_cache[layer], self.v_cache[layer]))
        return per_layer
class SliceUpdateGPT2Attention(GPT2Attention):
    """GPT-2 self-attention that prepends cached keys/values before attending.

    Despite the name, ``forward`` does not write into a cache: it concatenates
    ``layer_past`` with the freshly computed key/value tensors via ``torch.cat``
    and returns the concatenated pair as ``present``.
    """

    def __init__(self, config, layer_idx: Optional[int] = None):
        """Forward ``config`` and ``layer_idx`` to ``GPT2Attention.__init__``."""
        super().__init__(config=config, layer_idx=layer_idx)

    @torch.no_grad()
    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        use_cache: bool = False,
        output_attentions: bool = False,
    ) -> Tuple:
        """Run self-attention over ``hidden_states`` plus any cached keys/values.

        Args:
            hidden_states: Input to the fused query/key/value projection.
            layer_past: Optional ``(past_key, past_value)`` pair prepended along
                dim ``-2`` when its length on that dim is non-zero.
            attention_mask: Optional 4-D mask; its last dimension is cropped to
                the final key length.
            head_mask: Optional head mask handed to ``_attn`` unchanged.
            use_cache: When true, the second return element is ``(key, value)``;
                otherwise it is ``None``.
            output_attentions: When true, the attention weights returned by
                ``_attn`` are appended as a third element.
                NOTE(review): Hugging Face's ``GPT2Block`` is expected to pass
                this as a keyword argument; without the parameter that call
                raises ``TypeError`` — confirm against the installed version.

        Returns:
            ``(attn_output, present)``, or ``(attn_output, present, attn_weights)``
            when ``output_attentions`` is true.
        """
        # Compute query, key, and value tensors from the fused projection.
        query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
        query = self._split_heads(query, self.num_heads, self.head_dim)
        key = self._split_heads(key, self.num_heads, self.head_dim)
        value = self._split_heads(value, self.num_heads, self.head_dim)

        # Prepend past key/value tensors. The length check is an ordinary
        # Python comparison on a size, evaluated when the method is called.
        if layer_past is not None:
            past_key, past_value = layer_past
            if past_key.size(-2) > 0:
                key = torch.cat([past_key, key], dim=-2)
                value = torch.cat([past_value, value], dim=-2)

        # Keep only the trailing mask columns that line up with the keys.
        if attention_mask is not None:
            attention_mask = attention_mask[:, :, :, -key.size(-2):]

        # Calculate the attention output and project it back.
        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.c_proj(attn_output)

        # Return the concatenated key/value pair only if use_cache is set.
        present = (key, value) if use_cache else None
        outputs = (attn_output, present)
        if output_attentions:
            outputs += (attn_weights,)
        return outputs
# Load the model and tokenizer
model_name = "Miwa-Keita/zenz-v1-checkpoints"
# Register the custom attention class under the "sdpa" key before the model is
# constructed by from_pretrained().
# NOTE(review): this override only matters if the loaded model actually picks
# the "sdpa" entry of GPT2_ATTENTION_CLASSES — confirm which attention
# implementation the installed transformers version selects for this model.
GPT2_ATTENTION_CLASSES["sdpa"] = SliceUpdateGPT2Attention
model = GPT2LMHeadModel.from_pretrained(model_name).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Prepare input data: a single example sentence tokenized to PyTorch tensors,
# used later as the example input for tracing.
text = "Example sentence"
inputs = tokenizer(text, return_tensors="pt")
# Model tracing
class StatefulZenz(torch.nn.Module):
    """Wrap a causal LM together with key/value cache buffers for tracing.

    The cache tensors are registered as the module buffers ``keyCache`` and
    ``valueCache``.
    """

    def __init__(self, model, max_context_size: int = 256, batch_size: int = 1):
        """Store ``model`` and allocate the key/value cache buffers.

        Args:
            model: Model exposing ``config`` and returning an object with
                ``logits`` when called.
            max_context_size: Size of the cache's context dimension.
            batch_size: Size of the cache's batch dimension.
        """
        super().__init__()
        self.model = model
        config = self.model.config
        # Read the head count once so the head dimension and the per-head size
        # are derived from the same config attribute.
        num_heads = config.num_attention_heads
        self.kv_cache_shape: Tuple[int, ...] = (
            config.num_hidden_layers,
            batch_size,
            num_heads,
            max_context_size,
            config.hidden_size // num_heads,
        )
        self.kv_cache = SliceUpdateKeyValueCache(shape=self.kv_cache_shape)
        self.register_buffer("keyCache", self.kv_cache.k_cache)
        self.register_buffer("valueCache", self.kv_cache.v_cache)

    @torch.no_grad()
    def forward(self, input_ids, attention_mask):
        """Run the wrapped model with the cache as ``past_key_values``.

        Args:
            input_ids: Token ids for the current step.
            attention_mask: Mask whose last dimension is at least as long as
                ``input_ids``' last dimension.

        Returns:
            The ``logits`` attribute of the wrapped model's output.

        NOTE(review): this method only reads the cache through
        ``to_past_key_values()``; it never calls ``kv_cache.update``, so nothing
        here writes new keys/values back into ``keyCache``/``valueCache`` —
        confirm that is intended for a stateful export.
        """
        self.kv_cache.past_seen_tokens = attention_mask.shape[-1] - input_ids.shape[-1]
        past_key_values = self.kv_cache.to_past_key_values()
        # Pad the mask on the left so it also covers the cached positions.
        attention_mask = self._extend_attention_mask(attention_mask, past_key_values)
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=True,
        )
        return outputs.logits

    def _extend_attention_mask(self, attention_mask, past_key_values):
        """Left-pad ``attention_mask`` with ones for the cached positions.

        Args:
            attention_mask: Mask of shape ``(batch, seq)`` (a ``(batch, 1, 1, seq)``
                mask is accepted as well).
            past_key_values: Sequence of ``(key, value)`` pairs, or ``None``.

        Returns:
            ``attention_mask`` unchanged when ``past_key_values`` is ``None``;
            otherwise a ``(batch, 1, 1, past_length + seq)`` tensor of the same
            dtype/device whose trailing ``seq`` columns hold the given mask and
            whose leading columns are ones.
        """
        if past_key_values is None:
            return attention_mask

        batch = attention_mask.size(0)
        mask_length = attention_mask.size(-1)
        past_length = past_key_values[0][0].size(-2)
        extended_attention_mask = torch.ones(
            (batch, 1, 1, past_length + mask_length),
            dtype=attention_mask.dtype,
            device=attention_mask.device,
        )
        # Reshape explicitly: assigning a (batch, seq) tensor into a
        # (batch, 1, 1, seq) slice only broadcasts when batch == 1.
        extended_attention_mask[:, :, :, -mask_length:] = attention_mask.reshape(batch, 1, 1, -1)
        return extended_attention_mask
# Build the stateful wrapper and trace it once with the example tokenizer output.
stateful_zenz = StatefulZenz(model).eval()
traced_model = torch.jit.trace(
    stateful_zenz,
    (inputs["input_ids"], inputs["attention_mask"]),
)

# Convert the traced module to Core ML.
# NOTE(review): no dtype is given for the inputs or for the state tensors —
# check the coremltools stateful-model guide for the dtypes it expects.
mlmodel = ct.convert(
    traced_model,
    inputs=[
        # Both inputs share the same layout: batch 1, length from 1 up to 256.
        ct.TensorType(name=input_name, shape=(1, ct.RangeDim(1, 256)))
        for input_name in ("input_ids", "attention_mask")
    ],
    outputs=[ct.TensorType(dtype=np.float32, name="output")],
    states=[
        # State names match the buffer names registered by StatefulZenz.
        ct.StateType(
            wrapped_type=ct.TensorType(shape=stateful_zenz.kv_cache_shape),
            name=state_name,
        )
        for state_name in ("keyCache", "valueCache")
    ],
    minimum_deployment_target=ct.target.iOS18,
)

# Write the converted package to disk and report the file name.
mlmodel.save("zenz_v1_cached.mlpackage")
print("Model successfully converted and saved as: zenz_v1_cached.mlpackage")
System environment (please complete the following information):
coremltools version: 8.0b2
OS (e.g. MacOS version or Linux type): Mac OS Version 15.1 Beta (24B5024e)
Any other relevant version information (e.g. PyTorch or TensorFlow version):
🐞Describing the bug
Hello. I'm trying to convert a PyTorch model to a stateful Core ML model.
I wrote this code with reference to the Mistral-7B example from the WWDC 2024 session. The Core ML file is created after the run, but a "Failed to build the model execution plan using a model architecture file" error appears when the Core ML class is initialized.
Stack Trace
To Reproduce
System environment (please complete the following information):