Ok, FWIW, the minifier generated three files; two of them reproduce the crash (the last one, copied into repro.py, does not). @desertfire @eellison do you want me to open a second issue regarding the minifier?
Here is the code:
from math import inf
import torch
from torch import tensor, device
import torch.fx as fx
import torch._dynamo
from torch._dynamo.testing import rand_strided
from torch._dynamo.debug_utils import run_fwd_maybe_bwd
from torch._dynamo.debug_utils import same_two_models
# REPLACEABLE COMMENT FOR TESTING PURPOSES
args = [((2, 1), (1, 1), torch.int64, 'cuda', False), ((2, 1500, 384), (576000, 384, 1), torch.float32, 'cuda', False), ((2, 1, 1536), (1536, 1536, 1), torch.float16, 'cuda', True)]
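# Note (added): each tuple above is (shape, stride, dtype, device, requires_grad);
# rand_strided below materializes a random tensor with exactly that layout.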
args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args]
from torch.nn import *
class Repro(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.self_self_embed_tokens = Embedding(51865, 384, padding_idx=50257).cuda()
        self.self_self_layers_0_self_attn_layer_norm = LayerNorm((384,), eps=1e-05, elementwise_affine=True).cuda()
        self.self_self_layers_0_self_attn_q_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_0_self_attn_k_proj = Linear(in_features=384, out_features=384, bias=False).cuda()
        self.self_self_layers_0_self_attn_v_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_0_self_attn_out_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_0_encoder_attn_layer_norm = LayerNorm((384,), eps=1e-05, elementwise_affine=True).cuda()
        self.self_self_layers_0_encoder_attn_q_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_0_encoder_attn_k_proj = Linear(in_features=384, out_features=384, bias=False).cuda()
        self.self_self_layers_0_encoder_attn_v_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_0_encoder_attn_out_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_0_final_layer_norm = LayerNorm((384,), eps=1e-05, elementwise_affine=True).cuda()
        self.self_self_layers_0_fc1 = Linear(in_features=384, out_features=1536, bias=True).cuda()
        self.self_self_layers_0_fc2 = Linear(in_features=1536, out_features=384, bias=True).cuda()
        self.self_self_layers_1_self_attn_layer_norm = LayerNorm((384,), eps=1e-05, elementwise_affine=True).cuda()
        self.self_self_layers_1_self_attn_q_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_1_self_attn_k_proj = Linear(in_features=384, out_features=384, bias=False).cuda()
        self.self_self_layers_1_self_attn_v_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_1_self_attn_out_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_1_encoder_attn_layer_norm = LayerNorm((384,), eps=1e-05, elementwise_affine=True).cuda()
        self.self_self_layers_1_encoder_attn_q_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_1_encoder_attn_k_proj = Linear(in_features=384, out_features=384, bias=False).cuda()
        self.self_self_layers_1_encoder_attn_v_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_1_encoder_attn_out_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_1_final_layer_norm = LayerNorm((384,), eps=1e-05, elementwise_affine=True).cuda()
        self.self_self_layers_1_fc1 = Linear(in_features=384, out_features=1536, bias=True).cuda()
        self.self_self_layers_1_fc2 = Linear(in_features=1536, out_features=384, bias=True).cuda()
        self.self_self_layers_2_self_attn_layer_norm = LayerNorm((384,), eps=1e-05, elementwise_affine=True).cuda()
        self.self_self_layers_2_self_attn_q_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_2_self_attn_k_proj = Linear(in_features=384, out_features=384, bias=False).cuda()
        self.self_self_layers_2_self_attn_v_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_2_self_attn_out_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_2_encoder_attn_layer_norm = LayerNorm((384,), eps=1e-05, elementwise_affine=True).cuda()
        self.self_self_layers_2_encoder_attn_q_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_2_encoder_attn_k_proj = Linear(in_features=384, out_features=384, bias=False).cuda()
        self.self_self_layers_2_encoder_attn_v_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_2_encoder_attn_out_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_2_final_layer_norm = LayerNorm((384,), eps=1e-05, elementwise_affine=True).cuda()
        self.self_self_layers_2_fc1 = Linear(in_features=384, out_features=1536, bias=True).cuda()
        self.self_self_layers_2_fc2 = Linear(in_features=1536, out_features=384, bias=True).cuda()
        self.self_self_layers_3_self_attn_layer_norm = LayerNorm((384,), eps=1e-05, elementwise_affine=True).cuda()
        self.self_self_layers_3_self_attn_q_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_3_self_attn_k_proj = Linear(in_features=384, out_features=384, bias=False).cuda()
        self.self_self_layers_3_self_attn_v_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_3_self_attn_out_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_3_encoder_attn_layer_norm = LayerNorm((384,), eps=1e-05, elementwise_affine=True).cuda()
        self.self_self_layers_3_encoder_attn_q_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_3_encoder_attn_k_proj = Linear(in_features=384, out_features=384, bias=False).cuda()
        self.self_self_layers_3_encoder_attn_v_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_3_encoder_attn_out_proj = Linear(in_features=384, out_features=384, bias=True).cuda()
        self.self_self_layers_3_final_layer_norm = LayerNorm((384,), eps=1e-05, elementwise_affine=True).cuda()
        self.self_self_layers_3_fc1 = Linear(in_features=384, out_features=1536, bias=True).cuda()
        self.register_buffer('self_self_embed_positions_weight', torch.randn([448, 384], dtype=torch.float32).cuda())
    def forward(self, input_ids : torch.Tensor, encoder_hidden_states : torch.Tensor, dropout_23):
        view = input_ids.view(-1, 1); input_ids = None
        self_self_embed_tokens = self.self_self_embed_tokens(view); view = None
        self_self_embed_positions_weight = self.self_self_embed_positions_weight
        getitem = self_self_embed_positions_weight[slice(0, 1, None)]; self_self_embed_positions_weight = None
        add = self_self_embed_tokens + getitem; self_self_embed_tokens = getitem = None
        dropout = torch.nn.functional.dropout(add, p = 0.0, training = False); add = None
        self_self_layers_0_self_attn_layer_norm = self.self_self_layers_0_self_attn_layer_norm(dropout)
        self_self_layers_0_self_attn_q_proj = self.self_self_layers_0_self_attn_q_proj(self_self_layers_0_self_attn_layer_norm)
        mul = self_self_layers_0_self_attn_q_proj * 0.125; self_self_layers_0_self_attn_q_proj = None
        self_self_layers_0_self_attn_k_proj = self.self_self_layers_0_self_attn_k_proj(self_self_layers_0_self_attn_layer_norm)
        view_1 = self_self_layers_0_self_attn_k_proj.view(2, -1, 6, 64); self_self_layers_0_self_attn_k_proj = None
        transpose = view_1.transpose(1, 2); view_1 = None
        contiguous = transpose.contiguous(); transpose = None
        self_self_layers_0_self_attn_v_proj = self.self_self_layers_0_self_attn_v_proj(self_self_layers_0_self_attn_layer_norm); self_self_layers_0_self_attn_layer_norm = None
        view_2 = self_self_layers_0_self_attn_v_proj.view(2, -1, 6, 64); self_self_layers_0_self_attn_v_proj = None
        transpose_1 = view_2.transpose(1, 2); view_2 = None
        contiguous_1 = transpose_1.contiguous(); transpose_1 = None
        view_3 = mul.view(2, 1, 6, 64); mul = None
        transpose_2 = view_3.transpose(1, 2); view_3 = None
        contiguous_2 = transpose_2.contiguous(); transpose_2 = None
        view_4 = contiguous_2.view(12, -1, 64); contiguous_2 = None
        view_5 = contiguous.view(12, -1, 64); contiguous = None
        view_6 = contiguous_1.view(12, -1, 64); contiguous_1 = None
        transpose_3 = view_5.transpose(1, 2); view_5 = None
        bmm = torch.bmm(view_4, transpose_3); view_4 = transpose_3 = None
        softmax = torch.nn.functional.softmax(bmm, dim = -1); bmm = None
        dropout_1 = torch.nn.functional.dropout(softmax, p = 0.0, training = False); softmax = None
        bmm_1 = torch.bmm(dropout_1, view_6); dropout_1 = view_6 = None
        view_7 = bmm_1.view(2, 6, 1, 64); bmm_1 = None
        transpose_4 = view_7.transpose(1, 2); view_7 = None
        reshape = transpose_4.reshape(2, 1, 384); transpose_4 = None
        self_self_layers_0_self_attn_out_proj = self.self_self_layers_0_self_attn_out_proj(reshape); reshape = None
        dropout_2 = torch.nn.functional.dropout(self_self_layers_0_self_attn_out_proj, p = 0.0, training = False); self_self_layers_0_self_attn_out_proj = None
        add_1 = dropout + dropout_2; dropout = dropout_2 = None
        self_self_layers_0_encoder_attn_layer_norm = self.self_self_layers_0_encoder_attn_layer_norm(add_1)
        self_self_layers_0_encoder_attn_q_proj = self.self_self_layers_0_encoder_attn_q_proj(self_self_layers_0_encoder_attn_layer_norm); self_self_layers_0_encoder_attn_layer_norm = None
        mul_1 = self_self_layers_0_encoder_attn_q_proj * 0.125; self_self_layers_0_encoder_attn_q_proj = None
        self_self_layers_0_encoder_attn_k_proj = self.self_self_layers_0_encoder_attn_k_proj(encoder_hidden_states)
        view_8 = self_self_layers_0_encoder_attn_k_proj.view(2, -1, 6, 64); self_self_layers_0_encoder_attn_k_proj = None
        transpose_5 = view_8.transpose(1, 2); view_8 = None
        contiguous_3 = transpose_5.contiguous(); transpose_5 = None
        self_self_layers_0_encoder_attn_v_proj = self.self_self_layers_0_encoder_attn_v_proj(encoder_hidden_states)
        view_9 = self_self_layers_0_encoder_attn_v_proj.view(2, -1, 6, 64); self_self_layers_0_encoder_attn_v_proj = None
        transpose_6 = view_9.transpose(1, 2); view_9 = None
        contiguous_4 = transpose_6.contiguous(); transpose_6 = None
        view_10 = mul_1.view(2, 1, 6, 64); mul_1 = None
        transpose_7 = view_10.transpose(1, 2); view_10 = None
        contiguous_5 = transpose_7.contiguous(); transpose_7 = None
        view_11 = contiguous_5.view(12, -1, 64); contiguous_5 = None
        view_12 = contiguous_3.view(12, -1, 64); contiguous_3 = None
        view_13 = contiguous_4.view(12, -1, 64); contiguous_4 = None
        transpose_8 = view_12.transpose(1, 2); view_12 = None
        bmm_2 = torch.bmm(view_11, transpose_8); view_11 = transpose_8 = None
        softmax_1 = torch.nn.functional.softmax(bmm_2, dim = -1); bmm_2 = None
        dropout_3 = torch.nn.functional.dropout(softmax_1, p = 0.0, training = False); softmax_1 = None
        bmm_3 = torch.bmm(dropout_3, view_13); dropout_3 = view_13 = None
        view_14 = bmm_3.view(2, 6, 1, 64); bmm_3 = None
        transpose_9 = view_14.transpose(1, 2); view_14 = None
        reshape_1 = transpose_9.reshape(2, 1, 384); transpose_9 = None
        self_self_layers_0_encoder_attn_out_proj = self.self_self_layers_0_encoder_attn_out_proj(reshape_1); reshape_1 = None
        dropout_4 = torch.nn.functional.dropout(self_self_layers_0_encoder_attn_out_proj, p = 0.0, training = False); self_self_layers_0_encoder_attn_out_proj = None
        add_2 = add_1 + dropout_4; add_1 = dropout_4 = None
        self_self_layers_0_final_layer_norm = self.self_self_layers_0_final_layer_norm(add_2)
        self_self_layers_0_fc1 = self.self_self_layers_0_fc1(self_self_layers_0_final_layer_norm); self_self_layers_0_final_layer_norm = None
        gelu = torch._C._nn.gelu(self_self_layers_0_fc1); self_self_layers_0_fc1 = None
        dropout_5 = torch.nn.functional.dropout(gelu, p = 0.0, training = False); gelu = None
        self_self_layers_0_fc2 = self.self_self_layers_0_fc2(dropout_5); dropout_5 = None
        dropout_6 = torch.nn.functional.dropout(self_self_layers_0_fc2, p = 0.0, training = False); self_self_layers_0_fc2 = None
        add_3 = add_2 + dropout_6; add_2 = dropout_6 = None
        self_self_layers_1_self_attn_layer_norm = self.self_self_layers_1_self_attn_layer_norm(add_3)
        self_self_layers_1_self_attn_q_proj = self.self_self_layers_1_self_attn_q_proj(self_self_layers_1_self_attn_layer_norm)
        mul_2 = self_self_layers_1_self_attn_q_proj * 0.125; self_self_layers_1_self_attn_q_proj = None
        self_self_layers_1_self_attn_k_proj = self.self_self_layers_1_self_attn_k_proj(self_self_layers_1_self_attn_layer_norm)
        view_15 = self_self_layers_1_self_attn_k_proj.view(2, -1, 6, 64); self_self_layers_1_self_attn_k_proj = None
        transpose_10 = view_15.transpose(1, 2); view_15 = None
        contiguous_6 = transpose_10.contiguous(); transpose_10 = None
        self_self_layers_1_self_attn_v_proj = self.self_self_layers_1_self_attn_v_proj(self_self_layers_1_self_attn_layer_norm); self_self_layers_1_self_attn_layer_norm = None
        view_16 = self_self_layers_1_self_attn_v_proj.view(2, -1, 6, 64); self_self_layers_1_self_attn_v_proj = None
        transpose_11 = view_16.transpose(1, 2); view_16 = None
        contiguous_7 = transpose_11.contiguous(); transpose_11 = None
        view_17 = mul_2.view(2, 1, 6, 64); mul_2 = None
        transpose_12 = view_17.transpose(1, 2); view_17 = None
        contiguous_8 = transpose_12.contiguous(); transpose_12 = None
        view_18 = contiguous_8.view(12, -1, 64); contiguous_8 = None
        view_19 = contiguous_6.view(12, -1, 64); contiguous_6 = None
        view_20 = contiguous_7.view(12, -1, 64); contiguous_7 = None
        transpose_13 = view_19.transpose(1, 2); view_19 = None
        bmm_4 = torch.bmm(view_18, transpose_13); view_18 = transpose_13 = None
        softmax_2 = torch.nn.functional.softmax(bmm_4, dim = -1); bmm_4 = None
        dropout_7 = torch.nn.functional.dropout(softmax_2, p = 0.0, training = False); softmax_2 = None
        bmm_5 = torch.bmm(dropout_7, view_20); dropout_7 = view_20 = None
        view_21 = bmm_5.view(2, 6, 1, 64); bmm_5 = None
        transpose_14 = view_21.transpose(1, 2); view_21 = None
        reshape_2 = transpose_14.reshape(2, 1, 384); transpose_14 = None
        self_self_layers_1_self_attn_out_proj = self.self_self_layers_1_self_attn_out_proj(reshape_2); reshape_2 = None
        dropout_8 = torch.nn.functional.dropout(self_self_layers_1_self_attn_out_proj, p = 0.0, training = False); self_self_layers_1_self_attn_out_proj = None
        add_4 = add_3 + dropout_8; add_3 = dropout_8 = None
        self_self_layers_1_encoder_attn_layer_norm = self.self_self_layers_1_encoder_attn_layer_norm(add_4)
        self_self_layers_1_encoder_attn_q_proj = self.self_self_layers_1_encoder_attn_q_proj(self_self_layers_1_encoder_attn_layer_norm); self_self_layers_1_encoder_attn_layer_norm = None
        mul_3 = self_self_layers_1_encoder_attn_q_proj * 0.125; self_self_layers_1_encoder_attn_q_proj = None
        self_self_layers_1_encoder_attn_k_proj = self.self_self_layers_1_encoder_attn_k_proj(encoder_hidden_states)
        view_22 = self_self_layers_1_encoder_attn_k_proj.view(2, -1, 6, 64); self_self_layers_1_encoder_attn_k_proj = None
        transpose_15 = view_22.transpose(1, 2); view_22 = None
        contiguous_9 = transpose_15.contiguous(); transpose_15 = None
        self_self_layers_1_encoder_attn_v_proj = self.self_self_layers_1_encoder_attn_v_proj(encoder_hidden_states)
        view_23 = self_self_layers_1_encoder_attn_v_proj.view(2, -1, 6, 64); self_self_layers_1_encoder_attn_v_proj = None
        transpose_16 = view_23.transpose(1, 2); view_23 = None
        contiguous_10 = transpose_16.contiguous(); transpose_16 = None
        view_24 = mul_3.view(2, 1, 6, 64); mul_3 = None
        transpose_17 = view_24.transpose(1, 2); view_24 = None
        contiguous_11 = transpose_17.contiguous(); transpose_17 = None
        view_25 = contiguous_11.view(12, -1, 64); contiguous_11 = None
        view_26 = contiguous_9.view(12, -1, 64); contiguous_9 = None
        view_27 = contiguous_10.view(12, -1, 64); contiguous_10 = None
        transpose_18 = view_26.transpose(1, 2); view_26 = None
        bmm_6 = torch.bmm(view_25, transpose_18); view_25 = transpose_18 = None
        softmax_3 = torch.nn.functional.softmax(bmm_6, dim = -1); bmm_6 = None
        dropout_9 = torch.nn.functional.dropout(softmax_3, p = 0.0, training = False); softmax_3 = None
        bmm_7 = torch.bmm(dropout_9, view_27); dropout_9 = view_27 = None
        view_28 = bmm_7.view(2, 6, 1, 64); bmm_7 = None
        transpose_19 = view_28.transpose(1, 2); view_28 = None
        reshape_3 = transpose_19.reshape(2, 1, 384); transpose_19 = None
        self_self_layers_1_encoder_attn_out_proj = self.self_self_layers_1_encoder_attn_out_proj(reshape_3); reshape_3 = None
        dropout_10 = torch.nn.functional.dropout(self_self_layers_1_encoder_attn_out_proj, p = 0.0, training = False); self_self_layers_1_encoder_attn_out_proj = None
        add_5 = add_4 + dropout_10; add_4 = dropout_10 = None
        self_self_layers_1_final_layer_norm = self.self_self_layers_1_final_layer_norm(add_5)
        self_self_layers_1_fc1 = self.self_self_layers_1_fc1(self_self_layers_1_final_layer_norm); self_self_layers_1_final_layer_norm = None
        gelu_1 = torch._C._nn.gelu(self_self_layers_1_fc1); self_self_layers_1_fc1 = None
        dropout_11 = torch.nn.functional.dropout(gelu_1, p = 0.0, training = False); gelu_1 = None
        self_self_layers_1_fc2 = self.self_self_layers_1_fc2(dropout_11); dropout_11 = None
        dropout_12 = torch.nn.functional.dropout(self_self_layers_1_fc2, p = 0.0, training = False); self_self_layers_1_fc2 = None
        add_6 = add_5 + dropout_12; add_5 = dropout_12 = None
        self_self_layers_2_self_attn_layer_norm = self.self_self_layers_2_self_attn_layer_norm(add_6)
        self_self_layers_2_self_attn_q_proj = self.self_self_layers_2_self_attn_q_proj(self_self_layers_2_self_attn_layer_norm)
        mul_4 = self_self_layers_2_self_attn_q_proj * 0.125; self_self_layers_2_self_attn_q_proj = None
        self_self_layers_2_self_attn_k_proj = self.self_self_layers_2_self_attn_k_proj(self_self_layers_2_self_attn_layer_norm)
        view_29 = self_self_layers_2_self_attn_k_proj.view(2, -1, 6, 64); self_self_layers_2_self_attn_k_proj = None
        transpose_20 = view_29.transpose(1, 2); view_29 = None
        contiguous_12 = transpose_20.contiguous(); transpose_20 = None
        self_self_layers_2_self_attn_v_proj = self.self_self_layers_2_self_attn_v_proj(self_self_layers_2_self_attn_layer_norm); self_self_layers_2_self_attn_layer_norm = None
        view_30 = self_self_layers_2_self_attn_v_proj.view(2, -1, 6, 64); self_self_layers_2_self_attn_v_proj = None
        transpose_21 = view_30.transpose(1, 2); view_30 = None
        contiguous_13 = transpose_21.contiguous(); transpose_21 = None
        view_31 = mul_4.view(2, 1, 6, 64); mul_4 = None
        transpose_22 = view_31.transpose(1, 2); view_31 = None
        contiguous_14 = transpose_22.contiguous(); transpose_22 = None
        view_32 = contiguous_14.view(12, -1, 64); contiguous_14 = None
        view_33 = contiguous_12.view(12, -1, 64); contiguous_12 = None
        view_34 = contiguous_13.view(12, -1, 64); contiguous_13 = None
        transpose_23 = view_33.transpose(1, 2); view_33 = None
        bmm_8 = torch.bmm(view_32, transpose_23); view_32 = transpose_23 = None
        softmax_4 = torch.nn.functional.softmax(bmm_8, dim = -1); bmm_8 = None
        dropout_13 = torch.nn.functional.dropout(softmax_4, p = 0.0, training = False); softmax_4 = None
        bmm_9 = torch.bmm(dropout_13, view_34); dropout_13 = view_34 = None
        view_35 = bmm_9.view(2, 6, 1, 64); bmm_9 = None
        transpose_24 = view_35.transpose(1, 2); view_35 = None
        reshape_4 = transpose_24.reshape(2, 1, 384); transpose_24 = None
        self_self_layers_2_self_attn_out_proj = self.self_self_layers_2_self_attn_out_proj(reshape_4); reshape_4 = None
        dropout_14 = torch.nn.functional.dropout(self_self_layers_2_self_attn_out_proj, p = 0.0, training = False); self_self_layers_2_self_attn_out_proj = None
        add_7 = add_6 + dropout_14; add_6 = dropout_14 = None
        self_self_layers_2_encoder_attn_layer_norm = self.self_self_layers_2_encoder_attn_layer_norm(add_7)
        self_self_layers_2_encoder_attn_q_proj = self.self_self_layers_2_encoder_attn_q_proj(self_self_layers_2_encoder_attn_layer_norm); self_self_layers_2_encoder_attn_layer_norm = None
        mul_5 = self_self_layers_2_encoder_attn_q_proj * 0.125; self_self_layers_2_encoder_attn_q_proj = None
        self_self_layers_2_encoder_attn_k_proj = self.self_self_layers_2_encoder_attn_k_proj(encoder_hidden_states)
        view_36 = self_self_layers_2_encoder_attn_k_proj.view(2, -1, 6, 64); self_self_layers_2_encoder_attn_k_proj = None
        transpose_25 = view_36.transpose(1, 2); view_36 = None
        contiguous_15 = transpose_25.contiguous(); transpose_25 = None
        self_self_layers_2_encoder_attn_v_proj = self.self_self_layers_2_encoder_attn_v_proj(encoder_hidden_states)
        view_37 = self_self_layers_2_encoder_attn_v_proj.view(2, -1, 6, 64); self_self_layers_2_encoder_attn_v_proj = None
        transpose_26 = view_37.transpose(1, 2); view_37 = None
        contiguous_16 = transpose_26.contiguous(); transpose_26 = None
        view_38 = mul_5.view(2, 1, 6, 64); mul_5 = None
        transpose_27 = view_38.transpose(1, 2); view_38 = None
        contiguous_17 = transpose_27.contiguous(); transpose_27 = None
        view_39 = contiguous_17.view(12, -1, 64); contiguous_17 = None
        view_40 = contiguous_15.view(12, -1, 64); contiguous_15 = None
        view_41 = contiguous_16.view(12, -1, 64); contiguous_16 = None
        transpose_28 = view_40.transpose(1, 2); view_40 = None
        bmm_10 = torch.bmm(view_39, transpose_28); view_39 = transpose_28 = None
        softmax_5 = torch.nn.functional.softmax(bmm_10, dim = -1); bmm_10 = None
        dropout_15 = torch.nn.functional.dropout(softmax_5, p = 0.0, training = False); softmax_5 = None
        bmm_11 = torch.bmm(dropout_15, view_41); dropout_15 = view_41 = None
        view_42 = bmm_11.view(2, 6, 1, 64); bmm_11 = None
        transpose_29 = view_42.transpose(1, 2); view_42 = None
        reshape_5 = transpose_29.reshape(2, 1, 384); transpose_29 = None
        self_self_layers_2_encoder_attn_out_proj = self.self_self_layers_2_encoder_attn_out_proj(reshape_5); reshape_5 = None
        dropout_16 = torch.nn.functional.dropout(self_self_layers_2_encoder_attn_out_proj, p = 0.0, training = False); self_self_layers_2_encoder_attn_out_proj = None
        add_8 = add_7 + dropout_16; add_7 = dropout_16 = None
        self_self_layers_2_final_layer_norm = self.self_self_layers_2_final_layer_norm(add_8)
        self_self_layers_2_fc1 = self.self_self_layers_2_fc1(self_self_layers_2_final_layer_norm); self_self_layers_2_final_layer_norm = None
        gelu_2 = torch._C._nn.gelu(self_self_layers_2_fc1); self_self_layers_2_fc1 = None
        dropout_17 = torch.nn.functional.dropout(gelu_2, p = 0.0, training = False); gelu_2 = None
        self_self_layers_2_fc2 = self.self_self_layers_2_fc2(dropout_17); dropout_17 = None
        dropout_18 = torch.nn.functional.dropout(self_self_layers_2_fc2, p = 0.0, training = False); self_self_layers_2_fc2 = None
        add_9 = add_8 + dropout_18; add_8 = dropout_18 = None
        self_self_layers_3_self_attn_layer_norm = self.self_self_layers_3_self_attn_layer_norm(add_9)
        self_self_layers_3_self_attn_q_proj = self.self_self_layers_3_self_attn_q_proj(self_self_layers_3_self_attn_layer_norm)
        mul_6 = self_self_layers_3_self_attn_q_proj * 0.125; self_self_layers_3_self_attn_q_proj = None
        self_self_layers_3_self_attn_k_proj = self.self_self_layers_3_self_attn_k_proj(self_self_layers_3_self_attn_layer_norm)
        view_43 = self_self_layers_3_self_attn_k_proj.view(2, -1, 6, 64); self_self_layers_3_self_attn_k_proj = None
        transpose_30 = view_43.transpose(1, 2); view_43 = None
        contiguous_18 = transpose_30.contiguous(); transpose_30 = None
        self_self_layers_3_self_attn_v_proj = self.self_self_layers_3_self_attn_v_proj(self_self_layers_3_self_attn_layer_norm); self_self_layers_3_self_attn_layer_norm = None
        view_44 = self_self_layers_3_self_attn_v_proj.view(2, -1, 6, 64); self_self_layers_3_self_attn_v_proj = None
        transpose_31 = view_44.transpose(1, 2); view_44 = None
        contiguous_19 = transpose_31.contiguous(); transpose_31 = None
        view_45 = mul_6.view(2, 1, 6, 64); mul_6 = None
        transpose_32 = view_45.transpose(1, 2); view_45 = None
        contiguous_20 = transpose_32.contiguous(); transpose_32 = None
        view_46 = contiguous_20.view(12, -1, 64); contiguous_20 = None
        view_47 = contiguous_18.view(12, -1, 64); contiguous_18 = None
        view_48 = contiguous_19.view(12, -1, 64); contiguous_19 = None
        transpose_33 = view_47.transpose(1, 2); view_47 = None
        bmm_12 = torch.bmm(view_46, transpose_33); view_46 = transpose_33 = None
        softmax_6 = torch.nn.functional.softmax(bmm_12, dim = -1); bmm_12 = None
        dropout_19 = torch.nn.functional.dropout(softmax_6, p = 0.0, training = False); softmax_6 = None
        bmm_13 = torch.bmm(dropout_19, view_48); dropout_19 = view_48 = None
        view_49 = bmm_13.view(2, 6, 1, 64); bmm_13 = None
        transpose_34 = view_49.transpose(1, 2); view_49 = None
        reshape_6 = transpose_34.reshape(2, 1, 384); transpose_34 = None
        self_self_layers_3_self_attn_out_proj = self.self_self_layers_3_self_attn_out_proj(reshape_6); reshape_6 = None
        dropout_20 = torch.nn.functional.dropout(self_self_layers_3_self_attn_out_proj, p = 0.0, training = False); self_self_layers_3_self_attn_out_proj = None
        add_10 = add_9 + dropout_20; add_9 = dropout_20 = None
        self_self_layers_3_encoder_attn_layer_norm = self.self_self_layers_3_encoder_attn_layer_norm(add_10)
        self_self_layers_3_encoder_attn_q_proj = self.self_self_layers_3_encoder_attn_q_proj(self_self_layers_3_encoder_attn_layer_norm); self_self_layers_3_encoder_attn_layer_norm = None
        mul_7 = self_self_layers_3_encoder_attn_q_proj * 0.125; self_self_layers_3_encoder_attn_q_proj = None
        self_self_layers_3_encoder_attn_k_proj = self.self_self_layers_3_encoder_attn_k_proj(encoder_hidden_states)
        view_50 = self_self_layers_3_encoder_attn_k_proj.view(2, -1, 6, 64); self_self_layers_3_encoder_attn_k_proj = None
        transpose_35 = view_50.transpose(1, 2); view_50 = None
        contiguous_21 = transpose_35.contiguous(); transpose_35 = None
        self_self_layers_3_encoder_attn_v_proj = self.self_self_layers_3_encoder_attn_v_proj(encoder_hidden_states); encoder_hidden_states = None
        view_51 = self_self_layers_3_encoder_attn_v_proj.view(2, -1, 6, 64); self_self_layers_3_encoder_attn_v_proj = None
        transpose_36 = view_51.transpose(1, 2); view_51 = None
        contiguous_22 = transpose_36.contiguous(); transpose_36 = None
        view_52 = mul_7.view(2, 1, 6, 64); mul_7 = None
        transpose_37 = view_52.transpose(1, 2); view_52 = None
        contiguous_23 = transpose_37.contiguous(); transpose_37 = None
        view_53 = contiguous_23.view(12, -1, 64); contiguous_23 = None
        view_54 = contiguous_21.view(12, -1, 64); contiguous_21 = None
        view_55 = contiguous_22.view(12, -1, 64); contiguous_22 = None
        transpose_38 = view_54.transpose(1, 2); view_54 = None
        bmm_14 = torch.bmm(view_53, transpose_38); view_53 = transpose_38 = None
        softmax_7 = torch.nn.functional.softmax(bmm_14, dim = -1); bmm_14 = None
        dropout_21 = torch.nn.functional.dropout(softmax_7, p = 0.0, training = False); softmax_7 = None
        bmm_15 = torch.bmm(dropout_21, view_55); dropout_21 = view_55 = None
        view_56 = bmm_15.view(2, 6, 1, 64); bmm_15 = None
        transpose_39 = view_56.transpose(1, 2); view_56 = None
        reshape_7 = transpose_39.reshape(2, 1, 384); transpose_39 = None
        self_self_layers_3_encoder_attn_out_proj = self.self_self_layers_3_encoder_attn_out_proj(reshape_7); reshape_7 = None
        dropout_22 = torch.nn.functional.dropout(self_self_layers_3_encoder_attn_out_proj, p = 0.0, training = False); self_self_layers_3_encoder_attn_out_proj = None
        add_11 = add_10 + dropout_22; add_10 = dropout_22 = None
        self_self_layers_3_final_layer_norm = self.self_self_layers_3_final_layer_norm(add_11); add_11 = None
        self_self_layers_3_fc1 = self.self_self_layers_3_fc1(self_self_layers_3_final_layer_norm); self_self_layers_3_final_layer_norm = None
        gelu_3 = torch._C._nn.gelu(self_self_layers_3_fc1); self_self_layers_3_fc1 = None
        return (dropout_23,)
mod = Repro()
opt_mod = torch._dynamo.optimize("cudagraphs")(mod)
with torch.cuda.amp.autocast(enabled=True):
    ref = run_fwd_maybe_bwd(mod, args)
    res = run_fwd_maybe_bwd(opt_mod, args)
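# Note (added, hedged): same_two_models is imported above but never called; once
# the crash is fixed it could verify eager-vs-compiled parity. It is an internal
# torch._dynamo debug helper, and the only_fwd keyword is assumed for these nightlies.
# assert same_two_models(mod, opt_mod, args, only_fwd=True)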
Log:
Traceback (most recent call last):
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 637, in call_user_compiler
compiled_fn = compiler_fn(gm, self.fake_example_inputs())
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/debug_utils.py", line 913, in debug_wrapper
compiled_gm = compiler_fn(gm, example_inputs, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/optimizations/backends.py", line 51, in inner
return inner(SubGraph(model, example_inputs, tmp), **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/optimizations/backends.py", line 56, in inner
return fn(model, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/optimizations/backends.py", line 457, in cudagraphs
return subgraph.wrap_returns(cudagraphs_inner(model, inputs))
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/optimizations/backends.py", line 495, in cudagraphs_inner
model(*inputs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/fx/graph_module.py", line 660, in call_wrapped
return self._wrapped_call(self, *args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/fx/graph_module.py", line 279, in __call__
raise e
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/fx/graph_module.py", line 269, in __call__
return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc]
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1482, in _call_impl
return forward_call(*args, **kwargs)
File "<eval_with_key>.1", line 6, in forward
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1482, in _call_impl
return forward_call(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/nn/modules/sparse.py", line 162, in forward
return F.embedding(
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/nn/functional.py", line 2210, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_subclasses/fake_tensor.py", line 636, in __torch_dispatch__
return func(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_ops.py", line 285, in __call__
return self._op(*args, **kwargs or {})
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_subclasses/fake_tensor.py", line 812, in __torch_dispatch__
raise Exception(
Exception: Invoking operators with non-Fake Tensor inputs in FakeTensorMode is not yet supported. Please convert all Tensors to FakeTensors first. Found in aten.embedding.default(*(Parameter containing:
tensor([[-1.2773, -0.9429, 0.3348, ..., -1.2888, -0.5345, -1.2228],
[-1.6299, -1.6601, -0.5042, ..., -0.8806, 0.1033, 2.4577],
[ 1.7567, 1.7152, -0.6244, ..., 1.2275, 0.1909, 0.2460],
...,
[-0.3398, 0.2831, -0.4621, ..., -0.5620, -1.1571, -1.9791],
[ 0.2088, -0.7936, -0.3359, ..., -0.5050, -0.2274, -0.0351],
[-1.0065, 0.7551, -1.2023, ..., -0.2943, -0.1852, -0.4113]],
device='cuda:0', requires_grad=True), FakeTensor(FakeTensor(..., device='meta', size=(2, 1), dtype=torch.int64), cuda:0), 50257), **{})
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/mnt/workspace/kernl/checkpoints/minified_258_nodes.py", line 342, in <module>
res = run_fwd_maybe_bwd(opt_mod, args)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/debug_utils.py", line 522, in run_fwd_maybe_bwd
out = gm(args)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 887, in g
return f(*args)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1482, in _call_impl
return forward_call(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/eval_frame.py", line 82, in forward
return self.dynamo_ctx(self._orig_mod.forward)(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/eval_frame.py", line 211, in _fn
return fn(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/eval_frame.py", line 332, in catch_errors
return callback(frame, cache_size, hooks)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 479, in _convert_frame
result = inner_convert(frame, cache_size, hooks)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 103, in _fn
return fn(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/utils.py", line 90, in time_wrapper
r = func(*args, **kwargs)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 339, in _convert_frame_assert
return _compile(
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 398, in _compile
out_code = transform_code_object(code, transform)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/bytecode_transformation.py", line 341, in transform_code_object
transformations(instructions, code_options)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 385, in transform
tracer.run()
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 1686, in run
super().run()
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 537, in run
and self.step()
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 500, in step
getattr(self, inst.opname)(inst)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 1752, in RETURN_VALUE
self.output.compile_subgraph(self)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 514, in compile_subgraph
self.compile_and_call_fx_graph(tx, pass2.graph_output_vars(), root)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 561, in compile_and_call_fx_graph
compiled_fn = self.call_user_compiler(gm)
File "/home/geantvert/.local/share/virtualenvs/kernl/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 642, in call_user_compiler
raise BackendCompilerFailed(self.compiler_fn, e) from e
torch._dynamo.exc.BackendCompilerFailed: cudagraphs raised Exception: Invoking operators with non-Fake Tensor inputs in FakeTensorMode is not yet supported. Please convert all Tensors to FakeTensors first. Found in aten.embedding.default(*(Parameter containing:
tensor([[-1.2773, -0.9429, 0.3348, ..., -1.2888, -0.5345, -1.2228],
[-1.6299, -1.6601, -0.5042, ..., -0.8806, 0.1033, 2.4577],
[ 1.7567, 1.7152, -0.6244, ..., 1.2275, 0.1909, 0.2460],
...,
[-0.3398, 0.2831, -0.4621, ..., -0.5620, -1.1571, -1.9791],
[ 0.2088, -0.7936, -0.3359, ..., -0.5050, -0.2274, -0.0351],
[-1.0065, 0.7551, -1.2023, ..., -0.2943, -0.1852, -0.4113]],
device='cuda:0', requires_grad=True), FakeTensor(FakeTensor(..., device='meta', size=(2, 1), dtype=torch.int64), cuda:0), 50257), **{})
Set torch._dynamo.config.verbose=True for more information
You can suppress this exception and fall back to eager by setting:
torch._dynamo.config.suppress_errors = True
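For context, the failure class in this trace can be shown in isolation (a minimal sketch, not from the issue): inside an active FakeTensorMode, an op that receives a mix of real and fake tensors raises exactly this exception.

import torch
from torch._subclasses.fake_tensor import FakeTensorMode

fake_mode = FakeTensorMode()
weight = torch.randn(51865, 384)  # a real tensor, like the Embedding weight above
fake_ids = fake_mode.from_tensor(torch.zeros(2, 1, dtype=torch.int64))  # a FakeTensor
with fake_mode:
    # real weight + fake input -> "Invoking operators with non-Fake Tensor
    # inputs in FakeTensorMode is not yet supported"
    torch.nn.functional.embedding(fake_ids, weight, padding_idx=50257)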
@pommedeterresautee Note that this is a backend that just runs CUDA Graphs, instead of Inductor with CUDA Graphs. Do you have any reason to use the former instead of the latter?
The original issue was with a custom compiler leveraging hand-made custom Triton kernels combined with CUDA Graphs, so not just CUDA Graphs.
Above, the CUDA Graphs PyTorch backend is used only for reproduction.
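(For reference, a minimal sketch of the warmup-then-capture pattern such a backend relies on; illustrative only, not the author's compiler nor the exact cudagraphs_inner from backends.py. The hypothetical graphed helper warms up with real CUDA tensors, which is the step a default FakeTensor mode objects to.)

import torch

def graphed(fn, sample_inputs):
    # Warmup on a side stream, with real CUDA tensors (the step this issue is about).
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        fn(*sample_inputs)
    torch.cuda.current_stream().wait_stream(s)
    # Capture once into a CUDA graph; replays reuse the same memory addresses.
    static_inputs = [x.clone() for x in sample_inputs]
    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        static_outputs = fn(*static_inputs)
    def run(*new_inputs):
        for dst, src in zip(static_inputs, new_inputs):
            dst.copy_(src)  # copy into the captured buffers before replaying
        g.replay()
        return static_outputs
    return run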
I forgot I had opened this issue (sorry). The problem is fixed for my custom compiler (by adapting code from Inductor).
Do you want me to close this issue, or keep it open so the CUDA Graphs PyTorch backend gets fixed?
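(Re the former-vs-latter question above, the two choices look roughly like this; a hedged sketch, where triton.cudagraphs is the assumed name of the Inductor knob in these nightlies.)

import torch._dynamo
import torch._inductor.config

# 1/ the standalone CUDA Graphs backend (what the repro above uses):
opt_mod = torch._dynamo.optimize("cudagraphs")(mod)

# 2/ Inductor with CUDA Graphs layered on top (assumed flag name):
torch._inductor.config.triton.cudagraphs = True
opt_mod = torch._dynamo.optimize("inductor")(mod)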
🐛 Describe the bug
For the past few days, all the nightlies have crashed when the CUDA Graphs compiler is used. It seems there is now a default FakeTensor mode, and it does not like that the warmup in the CUDA Graphs compiler is not done with FakeTensors.
Error logs
Minified repro
THE CODE PRODUCED BY THE MINIFIER WAS NOT CRASHING, so below is the original code.
FWIW, I tried:
1/ env TORCHDYNAMO_REPRO_AFTER="dynamo" python hf_whisper.py (from https://pytorch.org/docs/master/dynamo/troubleshooting.html#minifying-backend-compiler-errors)
2/ python torch_compile_debug/run_2022_12_13_12_41_11_444847/minifier/minifier_launcher.py (didn't output any error message like in https://gist.github.com/mlazos/244e3d5b53667e44078e194762c0c92b)
3/ python repro.py -> no crash when run :(
Interested in where I made a mistake in the minifier usage? (See the config note below.)
Original Python code:
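(Aside on the minifier steps above, hedged: the TORCHDYNAMO_REPRO_AFTER environment variable can also be set from code; the config name is assumed from these nightlies.)

import torch._dynamo
# "dynamo" minifies backend failures right after TorchDynamo tracing; "aot"
# would instead minify after AOTAutograd.
torch._dynamo.config.repro_after = "dynamo"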