Adding another Python example for Thunder einsum that fails when my simple allocation domain propagation is enabled. Note that this is only for my own reference as a repro.
import torch
from nvfuser import FusionDefinition, DataType
def nvfuser_fusion_id33(fd : FusionDefinition) -> None :
    T0 = fd.define_tensor(shape=[1, -1, -1, -1], contiguity=[None, True, True, True], dtype=DataType.Float, is_cpu=False, stride_order=[3, 2, 1, 0])
    T1 = fd.define_tensor(shape=[1, 1, -1], contiguity=[None, None, True], dtype=DataType.Float, is_cpu=False, stride_order=[2, 1, 0])
    T2 = fd.define_tensor(shape=[-1, 1, -1], contiguity=[True, None, True], dtype=DataType.Float, is_cpu=False, stride_order=[2, 1, 0])
    T3 = fd.ops.sum(T2, dims=[1], keepdim=False, dtype=DataType.Null)
    T4 = fd.ops.permute(T3, dims=[1, 0])
    S5 = fd.define_scalar(1, dtype=DataType.Int)
    S6 = fd.define_scalar(1, dtype=DataType.Int)
    S7 = fd.define_scalar(1, dtype=DataType.Int)
    S8 = fd.define_scalar(2, dtype=DataType.Int)
    S9 = fd.define_scalar(2, dtype=DataType.Int)
    V10 = fd.define_vector([S5, S6, S7, S8, S9], dtype=DataType.Int)
    T11 = fd.ops.reshape(T4, new_shape=V10)
    S12 = fd.define_scalar(1, dtype=DataType.Int)
    S13 = fd.define_scalar(4, dtype=DataType.Int)
    S14 = fd.define_scalar(5, dtype=DataType.Int)
    S15 = fd.define_scalar(2, dtype=DataType.Int)
    S16 = fd.define_scalar(2, dtype=DataType.Int)
    V17 = fd.define_vector([S12, S13, S14, S15, S16], dtype=DataType.Int)
    T18 = fd.ops.broadcast_in_dim(T11, shape=V17, broadcast_dims=[0, 1, 2, 3, 4])
    S19 = fd.define_scalar(1.00000, dtype=DataType.Double)
    S20 = fd.define_scalar(1, dtype=DataType.Int)
    S21 = fd.define_scalar(4, dtype=DataType.Int)
    S22 = fd.define_scalar(5, dtype=DataType.Int)
    S23 = fd.define_scalar(2, dtype=DataType.Int)
    S24 = fd.define_scalar(2, dtype=DataType.Int)
    V25 = fd.define_vector([S20, S21, S22, S23, S24], dtype=DataType.Int)
    T26 = fd.ops.full(shape=V25, fill_value=S19, dtype=DataType.Float)
    T27 = fd.ops.permute(T26, dims=[0, 1, 2, 4, 3])
    T28 = fd.ops.mul(T18, T27)
    T29 = fd.ops.sum(T28, dims=[0, 3, 4], keepdim=False, dtype=DataType.Null)
    S30 = fd.define_scalar(1, dtype=DataType.Int)
    S31 = fd.define_scalar(4, dtype=DataType.Int)
    S32 = fd.define_scalar(5, dtype=DataType.Int)
    S33 = fd.define_scalar(1, dtype=DataType.Int)
    S34 = fd.define_scalar(1, dtype=DataType.Int)
    V35 = fd.define_vector([S30, S31, S32, S33, S34], dtype=DataType.Int)
    T36 = fd.ops.broadcast_in_dim(T29, shape=V35, broadcast_dims=[1, 2])
    S37 = fd.define_scalar(1, dtype=DataType.Int)
    S38 = fd.define_scalar(4, dtype=DataType.Int)
    S39 = fd.define_scalar(5, dtype=DataType.Int)
    V40 = fd.define_vector([S37, S38, S39], dtype=DataType.Int)
    T41 = fd.ops.reshape(T36, new_shape=V40)
    T42 = fd.ops.permute(T41, dims=[2, 0, 1])
    S43 = fd.define_scalar(3, dtype=DataType.Int)
    S44 = fd.define_scalar(5, dtype=DataType.Int)
    S45 = fd.define_scalar(1, dtype=DataType.Int)
    S46 = fd.define_scalar(4, dtype=DataType.Int)
    V47 = fd.define_vector([S43, S44, S45, S46], dtype=DataType.Int)
    T48 = fd.ops.broadcast_in_dim(T42, shape=V47, broadcast_dims=[1, 2, 3])
    T49 = fd.ops.sum(T1, dims=[0], keepdim=False, dtype=DataType.Null)
    S50 = fd.define_scalar(1, dtype=DataType.Int)
    S51 = fd.define_scalar(5, dtype=DataType.Int)
    S52 = fd.define_scalar(1, dtype=DataType.Int)
    S53 = fd.define_scalar(1, dtype=DataType.Int)
    V54 = fd.define_vector([S50, S51, S52, S53], dtype=DataType.Int)
    T55 = fd.ops.reshape(T49, new_shape=V54)
    S56 = fd.define_scalar(3, dtype=DataType.Int)
    S57 = fd.define_scalar(5, dtype=DataType.Int)
    S58 = fd.define_scalar(1, dtype=DataType.Int)
    S59 = fd.define_scalar(4, dtype=DataType.Int)
    V60 = fd.define_vector([S56, S57, S58, S59], dtype=DataType.Int)
    T61 = fd.ops.broadcast_in_dim(T55, shape=V60, broadcast_dims=[0, 1, 2, 3])
    T62 = fd.ops.mul(T61, T48)
    T63 = fd.ops.sum(T62, dims=[1, 2], keepdim=False, dtype=DataType.Null)
    T64 = fd.ops.sum(T0, dims=[1], keepdim=False, dtype=DataType.Null)
    T65 = fd.ops.permute(T64, dims=[1, 0, 2])
    S66 = fd.define_scalar(3, dtype=DataType.Int)
    S67 = fd.define_scalar(1, dtype=DataType.Int)
    S68 = fd.define_scalar(1, dtype=DataType.Int)
    S69 = fd.define_scalar(4, dtype=DataType.Int)
    V70 = fd.define_vector([S66, S67, S68, S69], dtype=DataType.Int)
    T71 = fd.ops.reshape(T65, new_shape=V70)
    S72 = fd.define_scalar(3, dtype=DataType.Int)
    S73 = fd.define_scalar(5, dtype=DataType.Int)
    S74 = fd.define_scalar(1, dtype=DataType.Int)
    S75 = fd.define_scalar(4, dtype=DataType.Int)
    V76 = fd.define_vector([S72, S73, S74, S75], dtype=DataType.Int)
    T77 = fd.ops.broadcast_in_dim(T71, shape=V76, broadcast_dims=[0, 1, 2, 3])
    T78 = fd.ops.mul(T77, T48)
    T79 = fd.ops.sum(T78, dims=[0, 2, 3], keepdim=False, dtype=DataType.Null)
    T80 = fd.ops.mul(T61, T77)
    T81 = fd.ops.sum(T80, dims=[0], keepdim=False, dtype=DataType.Null)
    T82 = fd.ops.permute(T81, dims=[1, 2, 0])
    S83 = fd.define_scalar(1, dtype=DataType.Int)
    S84 = fd.define_scalar(4, dtype=DataType.Int)
    S85 = fd.define_scalar(5, dtype=DataType.Int)
    S86 = fd.define_scalar(1, dtype=DataType.Int)
    S87 = fd.define_scalar(1, dtype=DataType.Int)
    V88 = fd.define_vector([S83, S84, S85, S86, S87], dtype=DataType.Int)
    T89 = fd.ops.reshape(T82, new_shape=V88)
    S90 = fd.define_scalar(1, dtype=DataType.Int)
    S91 = fd.define_scalar(4, dtype=DataType.Int)
    S92 = fd.define_scalar(5, dtype=DataType.Int)
    S93 = fd.define_scalar(2, dtype=DataType.Int)
    S94 = fd.define_scalar(2, dtype=DataType.Int)
    V95 = fd.define_vector([S90, S91, S92, S93, S94], dtype=DataType.Int)
    T96 = fd.ops.broadcast_in_dim(T89, shape=V95, broadcast_dims=[0, 1, 2, 3, 4])
    T97 = fd.ops.mul(T96, T27)
    T98 = fd.ops.sum(T97, dims=[0, 1, 2], keepdim=False, dtype=DataType.Null)
    fd.add_output(T79)
    fd.add_output(T63)
    fd.add_output(T98)
with FusionDefinition() as fd:
    nvfuser_fusion_id33(fd)
inputs = [
    torch.randn((24,), dtype=torch.float32, device='cuda:0').as_strided((1, 2, 3, 4), (24, 12, 4, 1)),
    torch.randn((5,), dtype=torch.float32, device='cuda:0').as_strided((1, 1, 5), (5, 5, 1)),
    torch.randn((4,), dtype=torch.float32, device='cuda:0').as_strided((2, 1, 2), (2, 2, 1)),
]
fd.execute(inputs)
cc'ing @liqiangxl if this is something that you might be interested in. :stuck_out_tongue_winking_eye:
We should take a couple of reduction/normalization scheduler tests and iterate through permutations of the allocation domain (from rfactor) for inputs/outputs. That should verify that the schedulers are handling allocation domains properly.
Highlighting the comment from https://github.com/NVIDIA/Fuser/pull/2200#discussion_r1591810046 regarding comprehensive test cases for allocation domain support in the reduction scheduler.
We'll want some representative end-to-end tests to validate our functional support.
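As a rough illustration of what that coverage could look like, here is a minimal sketch, assuming the AllocationDomainTest fixture and the helpers that appear in the examples later in this thread (makeContigConcreteTensor, setAllocationDomain, FusionExecutorCache, testValidate). The test name and the permutation loop are hypothetical, not code from the repo.
TEST_F(AllocationDomainTest, ReductionAllocationDomainPermutations) {
  // Sketch only: iterate over all orderings of the input's allocation domain.
  // std::next_permutation needs <algorithm>; the test name is made up.
  std::vector<int64_t> perm = {0, 1, 2};
  do {
    auto fusion = std::make_unique<Fusion>();
    FusionGuard fg(fusion.get());
    TensorView* in = makeContigConcreteTensor({16, 1024, 1024});
    // Reorder the input's allocation domain according to `perm`.
    in->setAllocationDomain(
        {in->axis(perm[0]), in->axis(perm[1]), in->axis(perm[2])}, true);
    TensorView* sum_out = sum(in, {0, 1});
    // Permuting sum_out's allocation domain here as well (as in the examples
    // further down in this thread) would extend the coverage to outputs.
    fusion->addInput(in);
    fusion->addOutput(sum_out);
    FusionExecutorCache fec(std::move(fusion));

    // Build an ATen input whose strides match the permuted allocation order.
    std::vector<int64_t> sizes = {16, 1024, 1024};
    std::vector<int64_t> strides(3);
    strides[perm[2]] = 1;
    strides[perm[1]] = sizes[perm[2]];
    strides[perm[0]] = sizes[perm[1]] * sizes[perm[2]];
    at::Tensor in_tensor =
        at::randn({16 * 1024 * 1024}).cuda().as_strided(sizes, strides);

    std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor});
    testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
  } while (std::next_permutation(perm.begin(), perm.end()));
}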
Could you add the error message here?
The old assert happens here: https://github.com/NVIDIA/Fuser/blob/5af71046e8993adcb0a36eb35aaa74d864d10df8/csrc/scheduler/utils.cpp#L1051
That one can be patched with #2201. But afterwards, our vectorization is wrong, since the input tensor cannot be vectorized. This is not surprising: looking at the reduction output as the reference, the scheduler mistakenly thought this is an inner reduction.
So, with #2201, the issue is now with the vectorization analysis?
That's what is failing right now; admittedly it is a problem.
Though I don't think that's the root cause. #2201 just patched the assert, but the scheduler is still wrong... I think the reduction scheduler should be looking at the reduction's inputs instead of its output to decide which reduction variant to use (inner vs outer). Does that sound right to you as well?
Admittedly, I need to refresh my memory about the reduction scheduler, but why does it matter? Because the input and output may have different orderings of allocation domains?
Exactly, and that would change what the read pattern is, which I think dictates which variant of the reduction scheduler we should use.
I think it would be nice to have a simple, concrete example.
Sounds good. I'll get that in place to help with the discussion.
TEST_F(AllocationDomainTest, PlayGround) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  TensorView* in = makeContigConcreteTensor({16, 1024, 1024});
  TensorView* sum_out = sum(in, {0, 1});
  sum_out->setAllocationDomain(
      {sum_out->axis(2), sum_out->axis(0), sum_out->axis(1)}, true);
  fusion->addInput(in);
  fusion->addOutput(sum_out);
  FusionExecutorCache fec(std::move(fusion));
  at::Tensor in_tensor =
      at::randn({16 * 1024 * 1024})
          .cuda()
          .as_strided({16, 1024, 1024}, {1024 * 1024, 1024, 1});
  std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor});
  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
}
Does this simpler example look better?
TensorView* sum_out = sum(in, {0, 1}); is an outer reduction on the input tensor, but it is mistakenly treated as an inner reduction when analyzing sum_out.
sum_out->setAllocationDomain({sum_out->axis(2), sum_out->axis(0), sum_out->axis(1)}, true);
When would this happen in practice? sum_out doesn't actually allocate the reduction domains, so I wonder what would cause this ordering.
^^^ That's a valid argument. Do we want to have reduction domains in the allocation domain at all, and what meaning do they carry?
To keep the question simpler, we can assume that the order is propagated from the inputs.
How about this example:
TEST_F(AllocationDomainTest, PlayGround) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  TensorView* in = makeContigConcreteTensor({16, 1024, 1024});
  in->setAllocationDomain({in->axis(2), in->axis(0), in->axis(1)}, true);
  TensorView* sum_out = sum(in, {0, 1});
  sum_out->setAllocationDomain(
      {sum_out->axis(2), sum_out->axis(0), sum_out->axis(1)}, true);
  fusion->addInput(in);
  fusion->addOutput(sum_out);
  FusionExecutorCache fec(std::move(fusion));
  at::Tensor in_tensor =
      at::randn({16 * 1024 * 1024})
          .cuda()
          .as_strided({16, 1024, 1024}, {1024 * 1024, 1024, 1});
  std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor});
  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
}
We can see here that the allocation order is propagated from input to output, so maybe it makes more sense?!
On ToT, this example triggers the same issue in the merge_3d(tv) == 3 assert, but it is patched with #2201. I hope it carries the idea across.
That's a valid argument. Do we want to have reduction domains in the allocation domain at all, and what meaning do they carry?
Yeah, we don't allocate those domains, so it seems to make more sense to drop them from allocation domains.
In this example, IIUC, both the input and the output just look like an inner reduction, so nothing is inconsistent?
You are right, the second example does not have the inconsistency. The problem with this one is that our reduction scheduler isn't handling the allocation domain properly, and that's where the assert happens.
So really there are two problems: a functional one (the scheduler asserts and mis-schedules) and a performance one.
For the performance issue, we can look at the example below; these two reductions are identical in concept.
TEST_F(AllocationDomainTest, PlayGround) {
  preseg_passes::OptimizationPassGuard<preseg_passes::AllocationDomainPass> guard(false);
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  TensorView* in = makeContigConcreteTensor({16 * 1024, 1024 * 36});
  in->setAllocationDomain({in->axis(1), in->axis(0)}, true);
  TensorView* sum_out = sum(in, {1});
  fusion->addInput(in);
  fusion->addOutput(sum_out);
  FusionExecutorCache fec(std::move(fusion));
  at::Tensor in_tensor =
      at::randn({16 * 1024 * 1024 * 36})
          .cuda()
          .as_strided({16 * 1024, 1024 * 36}, {1, 1024 * 16});
  std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor});
  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
}

TEST_F(AllocationDomainTest, PlayGround_ref) {
  preseg_passes::OptimizationPassGuard<preseg_passes::AllocationDomainPass> guard(false);
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());
  TensorView* in = makeContigConcreteTensor({1024 * 36, 16 * 1024});
  TensorView* sum_out = sum(in, {0});
  fusion->addInput(in);
  fusion->addOutput(sum_out);
  FusionExecutorCache fec(std::move(fusion));
  at::Tensor in_tensor =
      at::randn({16 * 1024 * 1024 * 36})
          .cuda()
          .as_strided({1024 * 36, 16 * 1024}, {1024 * 16, 1});
  std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor});
  testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
}
But PlayGround runs through an inner reduction with
kernelreduction_f0_c1_r0_g0 run in 11.7279 ms, achieved: 206.004 GB/s
Meanwhile, PlayGround_ref runs through an outer reduction with
kernelreduction_f0_c1_r0_g0 run in 1.65069 ms, achieved: 1463.62 GB/s
This is the simple logic our reduction scheduler uses to choose between inner vs outer reduction: https://github.com/NVIDIA/Fuser/blob/66b2a0a586c6910fcc761037660a1ac2e06bae04/csrc/scheduler/utils.cpp#L2224-L2239.
FYI, the reference tv in the function above is the output of a reduction.
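To make the input-vs-output disagreement concrete, here is a small self-contained sketch in plain C++ (not nvFuser code; the function name and the layout encoding are invented for illustration). It classifies a reduction as inner when the innermost allocated dimension of the chosen reference tensor is a reduced dimension, and shows how the PlayGround case above flips between inner and outer depending on whether the reduction output or the input's actual allocation order is used as the reference.
// Conceptual sketch only, plain C++ (not nvFuser code); the function name and
// the layout encoding are invented for illustration.
#include <cstdio>
#include <vector>

// alloc_order lists logical axes from outermost to innermost allocation;
// is_reduced_axis[i] says whether logical axis i is being reduced.
bool isInnerReduction(
    const std::vector<int>& alloc_order,
    const std::vector<bool>& is_reduced_axis) {
  return is_reduced_axis[alloc_order.back()];
}

int main() {
  // PlayGround above: 2D input, sum over axis 1, input allocation order {1, 0}.
  std::vector<bool> reduced = {false, true};
  // Reference = reduction output with default ordering {0, 1}: looks inner.
  std::printf("from output: %s\n",
              isInnerReduction({0, 1}, reduced) ? "inner" : "outer");
  // Reference = the input's actual allocation order {1, 0}: it is outer.
  std::printf("from input:  %s\n",
              isInnerReduction({1, 0}, reduced) ? "inner" : "outer");
  return 0;
}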
The examples make sense, but isn't the performance problem fixed by the allocation order propagation?
Touché. We can discuss whether we want to prioritize the performance issue; maybe it's something that can wait until the issue shows up. I'm wondering how channels-last vision models would behave with this (convolution followed by normalization).
Allocation order propagation doesn't always fix the performance problem in practice. There could be a user-specified output format, which is not something the propagation can change; also, the propagation is currently done on the whole fusion segment, so it could struggle to figure out the right TVs to propagate from/to.
I'll follow up on the performance issue in a separate thread.
So, back to the original issue this one is about: the scheduler should at least be functional, and we need to patch that.
That sounds like the case where a user specifies different allocation orderings between inputs and outputs. There doesn't seem to be any obviously right decision here, i.e., whether the input order or the output order should be adopted. I hope this is uncommon.
Scheduler should at least be functional and we need to patch that.
Is it fixed by #2201?
It's not properly fixed. The cpp example in that PR still has wrong vectorization after the patch.
Note from an offline discussion with @jjsjann123: the plan to solve this issue is organized into the following steps: (1) revise #2201 to ensure the correct reduction scheduler is selected based on the allocation domain; (2) ensure the vectorization analysis is correct; (3) more discussion is needed on how to move forward if the user specifies a different allocation domain on the output, which may need reduction + transpose.
Just as a note, we did improve some allocation domain support in reduction, but the original issue still repros.
Found a repro here:
I tried to patch it by just fixing the reduction properties: #2201, but that hit another issue with vectorization.
I realized that the reduction scheduler assumes the inputs to the reduction operation have the same allocation domain as its output, which is not true. IIUC, an inner reduction means that the reduction dimension contains the innermost iter domain of the inputs, rather than of its output.