willow-ahrens / Finch.jl

Sparse tensors in Julia and more! Datastructure-driven array programming language.
http://willowahrens.io/Finch.jl/
MIT License

Performance issue for row-major dense Tensors #525

Closed: mtsokol closed this issue 1 month ago

mtsokol commented 2 months ago

@willow-ahrens,

Here I share a performance issue that we found in SDDMM. tensordot on row-major inputs has a huge time overhead (I think there's a format change for the row-major case).

Column-major example:

julia> using Finch;
julia> LEN = 1000;
julia> a = lazy(swizzle(Tensor(rand(LEN, LEN)), 1, 2));
julia> b = lazy(swizzle(Tensor(rand(LEN, LEN)), 1, 2));
julia> c = tensordot(a, b, ((2,), (1,)));
julia> res = compute(c);
julia> @time compute(c);
  0.370971 seconds (209 allocations: 7.640 MiB)

Row-major example:

julia> using Finch
julia> LEN = 1000;
julia> a = lazy(swizzle(Tensor(rand(LEN, LEN)), 2, 1));
julia> b = lazy(swizzle(Tensor(rand(LEN, LEN)), 2, 1));
julia> c = tensordot(a, b, ((2,), (1,)));
julia> res = compute(c);
julia> @time compute(c);
 73.628710 seconds (447 allocations: 568.656 MiB, 0.37% gc time)
willow-ahrens commented 2 months ago

with verbose mode:

:(function var"##compute#295"(prgm)
      begin
          V = (((((((((prgm.children[1]).children[2]).children[2]).children[3]).children[2]).children[1]).children[1]).children[1]).children[2]).tns.val::Tensor{DenseLevel{Int64, DenseLevel{Int64, ElementLevel{0.0, Float64, Int64, Vector{Float64}}}}}
          V_2 = (((((((((prgm.children[1]).children[2]).children[2]).children[3]).children[3]).children[1]).children[1]).children[1]).children[2]).tns.val::Tensor{DenseLevel{Int64, DenseLevel{Int64, ElementLevel{0.0, Float64, Int64, Vector{Float64}}}}}
          A0 = V::Tensor{DenseLevel{Int64, DenseLevel{Int64, ElementLevel{0.0, Float64, Int64, Vector{Float64}}}}}
          A0_2 = Tensor(Dense(SparseDict(Element{0.0, Float64}())))::Tensor{DenseLevel{Int64, SparseLevel{Int64, Finch.DictTable{Int64, Int64, Vector{Int64}, Vector{Int64}, Vector{Int64}, Dict{Tuple{Int64, Int64}, Int64}}, ElementLevel{0.0, Float64, Int64, Vector{Float64}}}}}
          @finch mode = :fast begin
                  A0_2 .= 0.0
                  for i1 = _
                      for i0 = _
                          A0_2[i1, i0] = A0[i0, i1]
                      end
                  end
                  return A0_2
              end
          A2 = V_2::Tensor{DenseLevel{Int64, DenseLevel{Int64, ElementLevel{0.0, Float64, Int64, Vector{Float64}}}}}
          A2_2 = Tensor(Dense(SparseDict(Element{0.0, Float64}())))::Tensor{DenseLevel{Int64, SparseLevel{Int64, Finch.DictTable{Int64, Int64, Vector{Int64}, Vector{Int64}, Vector{Int64}, Dict{Tuple{Int64, Int64}, Int64}}, ElementLevel{0.0, Float64, Int64, Vector{Float64}}}}}
          @finch mode = :fast begin
                  A2_2 .= 0.0
                  for i3 = _
                      for i2 = _
                          A2_2[i3, i2] = A2[i2, i3]
                      end
                  end
                  return A2_2
              end
          A4 = Tensor(Dense(SparseDict(Element{0.0, Float64}())))::Tensor{DenseLevel{Int64, SparseLevel{Int64, Finch.DictTable{Int64, Int64, Vector{Int64}, Vector{Int64}, Vector{Int64}, Dict{Tuple{Int64, Int64}, Int64}}, ElementLevel{0.0, Float64, Int64, Vector{Float64}}}}}
          @finch mode = :fast begin
                  A4 .= 0.0
                  for i10 = _
                      for i7 = _
                          for i6 = _
                              A4[i6, i10] << + >>= (*)(A0_2[i6, i7], A2_2[i7, i10])
                          end
                      end
                  end
                  return A4
              end
          return (A4,)
      end
  end)
willow-ahrens commented 2 months ago

Finch has somehow managed to turn this into a sparse-sparse problem.
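
For reference, the relevant contrast in the dump above is the level format of the intermediates. A minimal sketch of that contrast (the first constructor is taken from the verbose dump; the all-dense variant is the analogue one would expect for these dense inputs, not what the scheduler actually emitted):

using Finch

# Format the scheduler chose for the permuted copies and the output
# (as in the dump above): a dense outer level over a SparseDict inner level.
A_sparse = Tensor(Dense(SparseDict(Element(0.0))))

# The all-dense analogue matching the Dense(Dense(Element)) format of the inputs.
A_dense = Tensor(Dense(Dense(Element(0.0))))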

willow-ahrens commented 2 months ago

it looks like two failures happened during autoscheduling:

  1. we have permuted even though we don't need to
  2. we have sparsified when we shouldn't have (see the sketch below)
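
For illustration, here is a minimal user-level sketch that sidesteps both failures by materializing the transposes eagerly, so the scheduler sees plain column-major Dense(Dense(Element)) inputs and never builds permuted SparseDict copies. The names A and B and the use of permutedims are assumptions for this sketch; it is a workaround for comparison, not a fix for the scheduler.

using Finch

LEN = 1000;
A = rand(LEN, LEN);
B = rand(LEN, LEN);

# Eagerly permute the data instead of wrapping it in swizzle, so the lazy
# contraction runs on ordinary column-major dense tensors, as in the fast
# column-major example above.
a = lazy(Tensor(permutedims(A)));
b = lazy(Tensor(permutedims(B)));
c = tensordot(a, b, ((2,), (1,)));
res = compute(c);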