csarofeen / pytorch

Tensors and Dynamic neural networks in Python with strong GPU acceleration
http://pytorch.org
Other
26 stars 7 forks source link

Codegen bug in GPT2ForSequenceClassification #2074

Closed kevinstephano closed 2 years ago

kevinstephano commented 2 years ago

šŸ› Describe the bug

import torch
from torch._C._nvfuser import FusionDefinition, Fusion, DataType

def nvfuser_fusion_id0(fd : FusionDefinition) -> None :
    """Populate ``fd`` with the fusion used to reproduce the reported error.

    The fusion declares a single 2-D Int tensor input and registers two
    outputs: ``T1``, a view of the input, and ``T7``, the axis-1 sum of the
    Int-cast result of ``ne(T0, 0)`` with 1 subtracted.

    Args:
        fd: The fusion definition that receives the tensor/op declarations.
    """
    # Input: 2-D tensor with symbolic (-1) sizes, both dims marked contiguous.
    T0 = fd.define_tensor(symbolic_sizes=[-1, -1], contiguous=[True, True], dtype=DataType.Int)
    # View of the input from shape [4, 1024] to [-1, 1024]; this is the first output.
    T1 = fd.ops.view(T0, original_shape=[4, 1024], new_shape=[-1, 1024])
    S2 = fd.define_constant(0)
    # Compare the input against the constant 0, then cast the result to Int.
    T3 = fd.ops.ne(T0, S2)
    T4 = fd.ops.cast(T3, dtype=DataType.Int)
    # Reduce over axis 1 without keeping the reduced dimension.
    # NOTE(review): DataType.Null presumably means "no explicit output dtype" — confirm.
    T5 = fd.ops.sum(T4, axes=[1], keepdim=False, dtype=DataType.Null)
    S6 = fd.define_constant(1)
    T7 = fd.ops.sub(T5, S6)
    # Both the view (T1) and the reduction result (T7) are fusion outputs.
    fd.add_output(T1)
    fd.add_output(T7)

# Single 4 x 1024 input tensor allocated on the CUDA device.
# NOTE(review): torch.randn returns a floating-point tensor, whereas
# nvfuser_fusion_id0 declares its input with DataType.Int — confirm the
# mismatch is intentional for this reproducer.
inputs = [
    torch.randn(4, 1024, device='cuda'),
]

# Build the fusion by running the definition inside a FusionDefinition context.
fs = Fusion()
with FusionDefinition(fs) as fd:
    nvfuser_fusion_id0(fd)

# Execute the same fusion five times with the same inputs.
for _ in range(5) :
    fs.execute(inputs)

Error:

RuntimeError: tv->definition() == nullptr || (tv->definition()->isA<UnaryOp>() && tv->definition()->as<UnaryOp>()->getUnaryOpType() == UnaryOpType::Set) || tv->definition()->isA<LoadStoreOp>() INTERNAL ASSERT FAILED at "/opt/pytorch/pytorch/torch/csrc/jit/codegen/cuda/lower_validation.cpp":566, please report a bug to PyTorch. Vectorized accesses cannot be inline with computation, they are only supported with a Set operation.TensorView: T1_l[ iblockIdx.y60{T0.size[0]}, iS46{( ceilDiv(( ceilDiv(( ceilDiv(( ceilDiv(T0.size[1], 2) ), blockDim.x) ), 1) ), gridDim.x) )}, iblockIdx.x45{gridDim.x}, ithreadIdx.x42{blockDim.x}, iUS44{1}, iV40{2} ] ca_pos( 5 )

Versions

TOT

csarofeen commented 2 years ago

Repros on devel:

// Reproducer for issue #2074 ("Codegen bug in GPT2ForSequenceClassification").
// Builds a fusion that outputs its own input alongside a reduction derived
// from that input, runs it through FusionExecutorCache, and compares the
// reduction output against an ATen reference.
// Unlike the Python reproducer in the report, there is no view op here: the
// input tv0 is registered as an output directly.
TEST_F(NVFuserTest, FusionIssue2074_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  // Concrete sizes used for the runtime input tensor below.
  int x = 4, y = 1024;

  // Fusion definition: tv4 = sum(cast<Int32>(tv0 != 0), axis 1) - 1.
  // NOTE(review): makeContigTensor(2, ...) presumably creates a 2-D
  // contiguous symbolic tensor — confirm against the test utilities.
  auto tv0 = makeContigTensor(2, DataType::Int32);
  fusion.addInput(tv0);
  auto tv1 = ne(tv0, IrBuilder::create<Int>(0));
  auto tv2 = castOp(DataType::Int32, tv1);
  auto tv3 = sum(tv2, {1});
  auto tv4 = sub(tv3, IrBuilder::create<Int>(1));
  // The input itself is registered as an output, followed by the reduction.
  fusion.addOutput(tv0);
  fusion.addOutput(tv4);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  // Random float data converted to int, then the same op chain evaluated
  // with ATen to produce the reference result t4.
  at::Tensor t0 = at::randn({x, y}, options).to(at::kInt);
  auto t1 = t0.ne(0);
  auto t2 = t1.to(at::kInt);
  auto t3 = t2.sum({1});
  auto t4 = t3 - 1;

  FusionExecutorCache executor_cache(std::move(fusion_ptr));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
  // Only cg_outputs[1] is compared (against t4); cg_outputs[0] is not
  // checked by this test.
  ASSERT_TRUE(at::allclose(cg_outputs[1], t4));
}