tenstorrent / tt-metal

:metal: TT-NN operator library, and TT-Metalium low level kernel programming model.
https://docs.tenstorrent.com/ttnn/latest/index.html
Apache License 2.0

ttnn.relu low PCC in some cases when using tensor height and width as shard shape and column major shard orientation #15423

Open amalbasaTT opened 1 day ago

amalbasaTT commented 1 day ago

To Reproduce

Steps to reproduce the behavior:

  1. Check out branch `amalbasaTT/unary_sharded-sweeps-2` (soon to be merged to main)
  2. Copy the unit test below to `test_relu_sharded.py`:

```python
import torch
import random
import ttnn
import itertools
import pytest
import traceback
import math
from loguru import logger
from functools import partial

from tests.sweep_framework.sweep_utils.utils import gen_shapes, get_device_grid_size, get_sharded_config
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt, _gen_reshape_args_from_volume
from tests.ttnn.utils_for_testing import check_with_pcc
from models.utility_functions import torch_random

Y, X = get_device_grid_size()
DEVICE_GRID_SIZE = ttnn.CoreGrid(y=Y, x=X)


def run_relu_sharded_tests(
    input_shape,
    dtype,
    dlayout,
    core_size,
    tensor_height_width_as_shard_shape,
    shard_orientation,
    data_seed,
    device,
):
    torch.manual_seed(data_seed)

    x = gen_func_with_cast_tt(
        partial(torch_random, low=-100, high=100, dtype=torch.bfloat16), dtype
    )(input_shape)

    try:
        ref_value = torch.nn.functional.relu(x)

        mem_cfg = ttnn.create_sharded_memory_config(
            shape=input_shape,
            core_grid=ttnn.CoreGrid(y=core_size[0], x=core_size[1]),
            strategy=ttnn.ShardStrategy.BLOCK,
            orientation=shard_orientation,
            use_height_and_width_as_shard_shape=tensor_height_width_as_shard_shape,
        )

        tt_x = ttnn.from_torch(
            x,
            dtype=dtype,
            layout=dlayout,
            device=device,
            memory_config=mem_cfg,
        )

        tt_result = ttnn.relu(tt_x, memory_config=mem_cfg)
        tt_result = ttnn.to_torch(tt_result)

    except Exception as e:
        logger.warning(f"Test execution crashed: {e}")
        print(traceback.format_exc())
        raise e

    passed, output_str = check_with_pcc(x, ttnn.to_torch(tt_x), 1.0)
    assert passed, f"Failed before ttnn.relu {output_str}, {data_seed}, {input_shape}, {dtype}, {mem_cfg.shard_spec}"
    passed, output_str = check_with_pcc(ref_value, tt_result, 0.999)
    assert passed, f"Failed at ttnn.relu, {output_str}, {data_seed}, {input_shape}, {dtype}, {mem_cfg.shard_spec}"


test_sweep_args = [
    ([3, 2, 256, 320], ttnn.bfloat16, ttnn.TILE_LAYOUT, (8, 1), True, ttnn.ShardOrientation.COL_MAJOR, 5863207),
    ([3, 2, 256, 320], ttnn.bfloat8_b, ttnn.TILE_LAYOUT, (8, 1), True, ttnn.ShardOrientation.COL_MAJOR, 8320078),
    ([288, 32], ttnn.bfloat16, ttnn.TILE_LAYOUT, (8, 1), True, ttnn.ShardOrientation.COL_MAJOR, 11924152),
    ([2, 3, 224, 64], ttnn.bfloat16, ttnn.TILE_LAYOUT, (8, 1), True, ttnn.ShardOrientation.COL_MAJOR, 14234094),
    ([2, 3, 224, 64], ttnn.bfloat8_b, ttnn.TILE_LAYOUT, (8, 1), True, ttnn.ShardOrientation.COL_MAJOR, 15818731),
    ([16, 256, 128], ttnn.bfloat16, ttnn.TILE_LAYOUT, (2, 8), True, ttnn.ShardOrientation.COL_MAJOR, 3965624),
    ([16, 256, 128], ttnn.bfloat8_b, ttnn.TILE_LAYOUT, (2, 8), True, ttnn.ShardOrientation.COL_MAJOR, 17790071),
]


@pytest.mark.parametrize(
    "input_shape, dtype, dlayout, core_size, tensor_height_width_as_shard_shape, shard_orientation, data_seed",
    test_sweep_args,
)
def test_relu_sharded(
    input_shape, dtype, dlayout, core_size, tensor_height_width_as_shard_shape, shard_orientation, data_seed, device
):
    run_relu_sharded_tests(
        input_shape, dtype, dlayout, core_size, tensor_height_width_as_shard_shape, shard_orientation, data_seed, device
    )
```

  3. Run it with the command:

```
pytest test_relu_sharded.py
```

Expected behavior

All test cases should fail with low PCC at the `ttnn.relu` check.
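For quicker triage without checking out the sweep branch, here is a minimal standalone sketch of the first failing configuration. It is an illustration rather than part of the original repro: it assumes a single device opened via `ttnn.open_device`, replaces `gen_func_with_cast_tt` with a plain uniform generator over [-100, 100], and computes the PCC with `torch.corrcoef`, so exact values may differ from the sweep harness.

```python
import torch
import ttnn

# Sketch of the first failing case above: bfloat16, 8x1 core grid, COL_MAJOR shard
# orientation, tensor height/width used as the shard shape.
device = ttnn.open_device(device_id=0)

torch.manual_seed(5863207)
input_shape = [3, 2, 256, 320]
x = (torch.rand(input_shape) * 200 - 100).to(torch.bfloat16)  # uniform in [-100, 100)

mem_cfg = ttnn.create_sharded_memory_config(
    shape=input_shape,
    core_grid=ttnn.CoreGrid(y=8, x=1),
    strategy=ttnn.ShardStrategy.BLOCK,
    orientation=ttnn.ShardOrientation.COL_MAJOR,
    use_height_and_width_as_shard_shape=True,
)

tt_x = ttnn.from_torch(
    x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device, memory_config=mem_cfg
)
tt_y = ttnn.to_torch(ttnn.relu(tt_x, memory_config=mem_cfg))

ref = torch.nn.functional.relu(x)
# PCC between the torch reference and the device result; values well below 0.999
# reproduce the reported low-PCC behavior.
pcc = torch.corrcoef(torch.stack([ref.flatten().float(), tt_y.flatten().float()]))[0, 1]
print("PCC vs torch.relu:", pcc.item())

ttnn.close_device(device)
```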

umadevimcw commented 13 hours ago

@amalbasaTT isn't this issue similar to #15159?

amalbasaTT commented 10 hours ago

Not exactly. The issue you are mentioning has clear conditions that, when met, always reproduce it: it occurs with sharding strategies whenever the second-to-innermost dimension is not divisible by 32. This issue only happens in some cases when using column-major shard orientation with the tensor height and width as the shard shape. I also typed the wrong title, so I'll fix that.
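As a quick sanity check (added here for illustration, assuming #15159's trigger is a second-to-innermost dimension that is not divisible by 32), none of the failing shapes in this report meet that condition, so these failures cannot be explained by that issue alone:

```python
# Assumption: #15159 triggers when the second-to-innermost dimension is not divisible by 32.
failing_shapes = [[3, 2, 256, 320], [288, 32], [2, 3, 224, 64], [16, 256, 128]]
for shape in failing_shapes:
    # Prints 0 for every shape: 256, 288, 224, and 256 are all multiples of 32.
    print(shape, "second-to-innermost % 32 ==", shape[-2] % 32)
```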