tenstorrent / tt-mlir

Tenstorrent MLIR compiler
https://tenstorrent.github.io/tt-mlir/
Apache License 2.0

Tensor layout issues with change-rank ops ((un)squeeze) when they aren't tile-dim aligned #669

Open nvukobratTT opened 6 days ago

nvukobratTT commented 6 days ago

Summary

The Llama 3B MLP block uses non-tile-aligned shapes for change-rank ops ((un)squeeze). The error occurs during the metal runtime.

Sample of the assert we're hitting:

E       RuntimeError: TT_THROW @ /localdev/nvukobrat/tt-forge-fe/third_party/tt-mlir/third_party/tt-metal/src/tt-metal/ttnn/cpp/ttnn/operations/core/core.cpp:49: tt::exception
E       info:
E       Unable to reshape a tensor in TILE_LAYOUT to non-tile height and width! Please convert the tensor to ROW_MAJOR_LAYOUT first.
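For context, here is a minimal sketch of the alignment constraint implied by the error message, assuming the default 32x32 tile size from supported_tile_sizes in the system desc below: a tensor can only be reshaped while in TILE_LAYOUT if the target's last two dims are tile multiples, which the 12-high shapes in both repros are not.

TILE_H, TILE_W = 32, 32  # assumed default tile size

def is_tile_aligned(shape):
    # TILE_LAYOUT requires the last two dims to be multiples of the tile size.
    return len(shape) >= 2 and shape[-2] % TILE_H == 0 and shape[-1] % TILE_W == 0

# Squeeze repro: 1x12x3200 -> 12x3200; height 12 is not a multiple of 32,
# so the runtime refuses to reshape while the tensor is in TILE_LAYOUT.
print(is_tile_aligned((1, 12, 3200)))  # False
print(is_tile_aligned((12, 3200)))     # False
print(is_tile_aligned((1, 32, 3200)))  # True: a tile-aligned counterexample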

Repro

Squeeze IR:

module @Squeeze attributes {tt.system_desc = #tt.system_desc<[{arch = <wormhole_b0>, grid = 8x8, l1_size = 1499136, num_dram_channels = 12, dram_channel_size = 1073741824, noc_l1_address_align_bytes = 16, pcie_address_align_bytes = 32, noc_dram_address_align_bytes = 32, l1_unreserved_base = 1024, erisc_l1_unreserved_base = 1024, dram_unreserved_base = 1024, dram_unreserved_end = 1073741824, physical_cores = {worker = [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (2, 0), (2, 1), (2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (3, 0), (3, 1), (3, 2), (3, 3), (3, 4), (3, 5), (3, 6), (3, 7), (4, 0), (4, 1), (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (5, 0), (5, 1), (5, 2), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (6, 0), (6, 1), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (7, 0), (7, 1), (7, 2), (7, 3), (7, 4), (7, 5), (7, 6), (7, 7)] dram = [(8, 0), (9, 0), (10, 0), (8, 1), (9, 1), (10, 1), (8, 2), (9, 2), (10, 2), (8, 3), (9, 3), (10, 3)]}, supported_data_types = [<f32>, <f16>, <bf16>, <bfp_f8>, <bfp_bf8>, <bfp_f4>, <bfp_bf4>, <bfp_f2>, <bfp_bf2>, <u32>, <u16>, <u8>], supported_tile_sizes = [(4 x 16), (16 x 16), (32 x 16), (4 x 32), (16 x 32), (32 x 32)]}], [0], [3 : i32], [<0, 0, 0, 0>]>} {
  func.func @forward(%arg0: tensor<1x12x3200xf32> {ttir.name = "a"}) -> tensor<12x3200xf32> {
    %0 = tensor.empty() : tensor<12x3200xf32>
    %1 = "ttir.squeeze"(%arg0, %0) <{dim = 0 : si32, operand_constraints = [#tt.operand_constraint<dram|l1|scalar|tile|none|interleaved|single_bank|height_sharded|width_sharded|block_sharded|any_layout|any_device|any_device_tile|l1_block_sharded>, #tt.operand_constraint<dram|l1|scalar|tile|none|interleaved|single_bank|height_sharded|width_sharded|block_sharded|any_layout|any_device|any_device_tile|l1_block_sharded>]}> : (tensor<1x12x3200xf32>, tensor<12x3200xf32>) -> tensor<12x3200xf32>
    return %1 : tensor<12x3200xf32>
  }
}

Unsqueeze IR:

module @Unsqueeze attributes {tt.system_desc = #tt.system_desc<[{arch = <wormhole_b0>, grid = 8x8, l1_size = 1499136, num_dram_channels = 12, dram_channel_size = 1073741824, noc_l1_address_align_bytes = 16, pcie_address_align_bytes = 32, noc_dram_address_align_bytes = 32, l1_unreserved_base = 1024, erisc_l1_unreserved_base = 1024, dram_unreserved_base = 1024, dram_unreserved_end = 1073741824, physical_cores = {worker = [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (2, 0), (2, 1), (2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (3, 0), (3, 1), (3, 2), (3, 3), (3, 4), (3, 5), (3, 6), (3, 7), (4, 0), (4, 1), (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (5, 0), (5, 1), (5, 2), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (6, 0), (6, 1), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (7, 0), (7, 1), (7, 2), (7, 3), (7, 4), (7, 5), (7, 6), (7, 7)] dram = [(8, 0), (9, 0), (10, 0), (8, 1), (9, 1), (10, 1), (8, 2), (9, 2), (10, 2), (8, 3), (9, 3), (10, 3)]}, supported_data_types = [<f32>, <f16>, <bf16>, <bfp_f8>, <bfp_bf8>, <bfp_f4>, <bfp_bf4>, <bfp_f2>, <bfp_bf2>, <u32>, <u16>, <u8>], supported_tile_sizes = [(4 x 16), (16 x 16), (32 x 16), (4 x 32), (16 x 32), (32 x 32)]}], [0], [3 : i32], [<0, 0, 0, 0>]>} {
  func.func @forward(%arg0: tensor<12x8640xf32> {ttir.name = "a"}) -> tensor<1x12x8640xf32> {
    %0 = tensor.empty() : tensor<1x12x8640xf32>
    %1 = "ttir.unsqueeze"(%arg0, %0) <{dim = 0 : si32, operand_constraints = [#tt.operand_constraint<dram|l1|scalar|tile|none|interleaved|single_bank|height_sharded|width_sharded|block_sharded|any_layout|any_device|any_device_tile|l1_block_sharded>, #tt.operand_constraint<dram|l1|scalar|tile|none|interleaved|single_bank|height_sharded|width_sharded|block_sharded|any_layout|any_device|any_device_tile|l1_block_sharded>]}> : (tensor<12x8640xf32>, tensor<1x12x8640xf32>) -> tensor<1x12x8640xf32>
    return %1 : tensor<1x12x8640xf32>
  }
}
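For reference, a hedged workaround sketch at the ttnn level, following the suggestion in the error message: untilize, reshape, re-tilize. The API names (ttnn.open_device, ttnn.from_torch, ttnn.to_layout, ttnn.reshape) are from the ttnn Python bindings in tt-metal; this only illustrates the layout round-trip the compiler would need to emit around these ops, not the actual fix.

import torch
import ttnn

device = ttnn.open_device(device_id=0)

# Squeeze repro shape: 1x12x3200 -> 12x3200 (unsqueeze is the same in reverse).
t = ttnn.from_torch(
    torch.randn(1, 12, 3200),
    dtype=ttnn.bfloat16,
    layout=ttnn.TILE_LAYOUT,
    device=device,
)

# Reshaping here would hit the TT_THROW above: 12 is not a tile-height multiple.
t = ttnn.to_layout(t, ttnn.ROW_MAJOR_LAYOUT)  # drop the tile padding
t = ttnn.reshape(t, (12, 3200))               # the rank-changing squeeze
t = ttnn.to_layout(t, ttnn.TILE_LAYOUT)       # re-tilize (pads back to 32 rows)

ttnn.close_device(device)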
sdjordjevicTT commented 6 days ago

@nvukobratTT can you please provide the generated MLIR?

nvukobratTT commented 6 days ago

> @nvukobratTT can you please provide the generated MLIR?

I just updated the issue description with more details. Let me know if you need anything else :))