willow-ahrens / Finch.jl

Sparse tensors in Julia and more! Data structure-driven array programming language.
http://willowahrens.io/Finch.jl/
MIT License

When does Finch store fill values? #605

Open kylebd99 opened 2 weeks ago

kylebd99 commented 2 weeks ago

While optimizing a BFS, I've noticed that the sparse level tends to accumulate unnecessary nonzeros. I'm not sure exactly where this is coming from, but it results in a significant slowdown over time.

using Finch
using SparseArrays  # needed for SparseMatrixCSC
using MatrixDepot

matrix = "GAP/GAP-road"
A = Tensor(Dense(SparseList(Element(0))), SparseMatrixCSC(matrixdepot(matrix)))
n = size(A)[1]
B = Tensor(SparseList(Element(false)), fsprand(Bool, n, 100000))
D1 = Tensor(Sparse(Element(false)))
@finch begin
    D1 .= 0
    for i = _
        for j = _
            D1[j] <<choose(false)>>= A[walk(j),i] & B[i]
        end
    end
end
println("D1 Non-Default Values: ", sum(D1))
println("D1 Counstored: ", countstored(D1))

D2 = Tensor(Sparse(Element(false)))
@finch begin
    D2 .= 0
    for i = _
        for j = _
            D2[j] <<choose(false)>>= A[walk(j),i] & D1[i]
        end
    end
end
println("D2 Non-Default Values: ", sum(D2))
println("D2 Counstored: ", countstored(D2))

D3 = Tensor(Sparse(Element(false)))
@finch begin
    D3 .= 0
    for i = _
        for j = _
            D3[j] <<choose(false)>>= A[walk(j),i] & D2[i]
        end
    end
end
println("D3 Non-Default Values: ", sum(D3))
println("D3 Counstored: ", countstored(D3))

D4 = Tensor(Sparse(Element(false)))
@finch begin
    D4 .= 0
    for i = _
        for j = _
            D4[j] <<choose(false)>>= A[walk(j),i] & D3[i]
        end
    end
end

println("D4 Non-Default Values: ", sum(D4))
println("D4 Counstored: ", countstored(D4))
kylebd99 commented 2 weeks ago

Never mind, I think this was coming from the Int/Bool interaction. An improved version which properly initializes A as a Bool tensor doesn't exhibit this behavior:

using Finch
using SparseArrays  # needed for SparseMatrixCSC and SparseVector
using MatrixDepot

matrix = "GAP/GAP-road"
A = Tensor(Dense(SparseList(Element(false))), SparseMatrixCSC(matrixdepot(matrix)) .!= 0 )
n = size(A)[1]
B = Tensor(SparseByteMap(Element(false)), SparseVector(fsprand(Bool, n, 100000)))
C1 = Tensor(SparseList(Element(false)), SparseVector(fsprand(Bool, n, 100000)))
C2 = Tensor(SparseList(Element(false)), SparseVector(fsprand(Bool, n, 100000)))
C3 = Tensor(Sparse(Element(false)), SparseVector(fsprand(Bool, n, 100000)))
D1 = Tensor(Sparse(Element(false)))
@finch begin
    D1 .= 0
    for i = _
        for j = _
            D1[j] <<choose(false)>>= A[walk(j),i] & B[i] & !(C1[follow(j)] | C2[follow(j)] | C3[follow(j)])
        end
    end
end
println("D1 Non-Default Values: ", sum(D1))
println("D1 Counstored: ", countstored(D1))

D2 = Tensor(Sparse(Element(false)))
@finch begin
    D2 .= 0
    for i = _
        for j = _
            D2[j] <<choose(false)>>= A[walk(j),i] & D1[i] & !(C1[follow(j)] | C2[follow(j)] | C3[follow(j)])
        end
    end
end
println("D2 Non-Default Values: ", sum(D2))
println("D2 Counstored: ", countstored(D2))

D3 = Tensor(Sparse(Element(false)))
@finch begin
    D3 .= 0
    for i = _
        for j = _
            D3[j] <<choose(false)>>= A[walk(j),i] & D2[i] & !(C1[follow(j)] | C2[follow(j)] | C3[follow(j)])
        end
    end
end
println("D3 Non-Default Values: ", sum(D3))
println("D3 Counstored: ", countstored(D3))

D4 = Tensor(Sparse(Element(false)))
@finch begin
    D4 .= 0
    for i = _
        for j = _
            D4[j] <<choose(false)>>= A[walk(j),i] & D3[i] & !(C1[follow(j)] | C2[follow(j)] | C3[follow(j)])
        end
    end
end

println("D4 Non-Default Values: ", sum(D4))
println("D4 Counstored: ", countstored(D4))
kylebd99 commented 2 weeks ago

Okay, wait, I've found the issue. I think the pruning due to the ! clause isn't being reflected in storage as implicit fill values. If this is intended behavior, then that's totally fine and you can go ahead and re-close this issue.

Here's a new example which does reproduce it, though:

using Finch
using SparseArrays  # needed for SparseMatrixCSC and SparseVector
using MatrixDepot

matrix = "GAP/GAP-road"
A = Tensor(Dense(SparseList(Element(false))), SparseMatrixCSC(matrixdepot(matrix)) .!= 0 )
n = size(A)[1]
B = Tensor(Sparse(Element(false)), SparseVector(fsprand(Bool, n, 100)) .!= 0 )
C1 = Tensor(SparseList(Element(false)), SparseVector(fsprand(Bool, n, 1000000)) .!= 0 )
C2 = Tensor(SparseList(Element(false)), SparseVector(fsprand(Bool, n, 1000000)) .!= 0 )
C3 = Tensor(Sparse(Element(false)), SparseVector(fsprand(Bool, n, 1000000)) .!= 0 )
function test_nnz()
    old_D = B
    for i in 1:100
        new_D = Tensor(Sparse(Element(false)))
        @finch begin
            new_D .= 0
            for i = _
                for j = _
                    new_D[j] <<choose(false)>>= A[walk(j),i] & old_D[i] & !(C1[follow(j)] | C2[follow(j)] | C3[follow(j)])
                end
            end
        end
        old_D = new_D
        println("new_D Non-Default Values: ", sum(new_D))
        println("new_D Counstored: ", countstored(new_D))
    end
end

function test_nnz_fix()
    old_D = B
    for i in 1:100
        new_D = Tensor(Sparse(Element(false)))
        @finch begin
            new_D .= 0
            for i = _
                for j = _
                    new_D[j] <<choose(false)>>= A[walk(j),i] & old_D[i] & !(C1[follow(j)] | C2[follow(j)] | C3[follow(j)])
                end
            end
        end
        old_D = Tensor(Sparse(Element(false)), new_D)
        println("new_D Non-Default Values: ", sum(new_D))
        println("new_D Counstored: ", countstored(new_D))
    end
end

function test_nnz_fix2()
    old_D = B
    for i in 1:100
        new_D = Tensor(Sparse(Element(false)))
        @finch begin
            new_D .= 0
            for i = _
                for j = _
                    new_D[j] <<choose(false)>>= A[walk(j),i] & old_D[i]
                end
            end
        end
        old_D = new_D
        println("new_D Non-Default Values: ", sum(new_D))
        println("new_D Counstored: ", countstored(new_D))
    end
end

#test_nnz()
#test_nnz_fix()
test_nnz_fix2()
willow-ahrens commented 2 weeks ago

Is the issue that

julia> A = Tensor(Sparse(Element(false)), pattern!(fsprand(Bool, 4, 2)))
4-Tensor
└─ SparseDict (false) [1:4]
   ├─ [1]: true
   └─ [4]: true

julia> B = Tensor(Sparse(Element(false)))
0-Tensor
└─ SparseDict (false) [1:0]

julia> @finch begin
           B .= 0
           for i = _
               B[i] <<choose(false)>>= !A[follow(i)]
           end
       end
(B = Tensor(SparseDict{Int64}(Element{false, Bool, Int64}(Bool[0, 1, 1, 0]), 4, [1, 5], [1, 2, 3, 4], [1, 2, 3, 4], Dict((1, 2) => 2, (1, 1) => 1, (1, 3) => 3, (1, 4) => 4), Int64[])),)

julia> B
4-Tensor
└─ SparseDict (false) [1:4]
   ├─ [1]: false
   ├─ [2]: true
   ├─ [3]: true
   └─ [4]: false

stores explicit false values?

The problem is that Finch can't statically guarantee that the explicitly stored values of an Element(false) level are true, and thus doesn't know that ! would produce the fill value false.

Fortunately, PatternLevel does guarantee that all stored values are true. Consider the following:

julia> A = pattern!(A)
4-Tensor
└─ SparseDict (false) [1:4]
   ├─ [1]: true
   └─ [4]: true

julia> @finch begin
           B .= 0
           for i = _
               B[i] <<choose(false)>>= !A[follow(i)]
           end
       end
(B = Tensor(SparseDict{Int64}(Element{false, Bool, Int64}(Bool[1, 1]), 4, [1, 3], [2, 3], [1, 2], Dict((1, 2) => 1, (1, 3) => 2), [3])),)

julia> B
4-Tensor
└─ SparseDict (false) [1:4]
   ├─ [2]: true
   └─ [3]: true

does that help? (Side note, ISO levels would be just like pattern levels, but for more values than true or false: https://github.com/willow-ahrens/Finch.jl/issues/341)
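
For the BFS reproducer above, a minimal, untested sketch of the same idea would be to convert the boolean operands read under ! into pattern tensors (the *_pat names are purely illustrative, and note that pattern! treats every stored position as true):

C1_pat = pattern!(C1)  # SparseList(Pattern()): stored values are statically known to be true
C2_pat = pattern!(C2)
C3_pat = pattern!(C3)
D1 = Tensor(Sparse(Element(false)))
@finch begin
    D1 .= 0
    for i = _
        for j = _
            D1[j] <<choose(false)>>= A[walk(j),i] & B[i] & !(C1_pat[follow(j)] | C2_pat[follow(j)] | C3_pat[follow(j)])
        end
    end
end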

willow-ahrens commented 2 weeks ago

We can also lift dynamic values into statically known ones using if statements.

@finch begin
    B .= 0
    for i = _
        if !A[follow(i)]
            B[i] <<choose(false)>>= true
        end
    end
end

The above program only executes the update statement when !A[follow(i)] evaluates to true, regardless of whether that value is knowable statically.
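
For the kernel in the reproducer, an untested sketch of the same lifting (using the names A, B, C1..C3, and D1 from the examples above) might look like:

@finch begin
    D1 .= 0
    for i = _
        for j = _
            if !(C1[follow(j)] | C2[follow(j)] | C3[follow(j)])
                D1[j] <<choose(false)>>= A[walk(j),i] & B[i]
            end
        end
    end
end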

Also, the dropfills! method may be helpful to you, as that is the method which Tensor(lvl, arr) calls to copy arr.
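
For example, a minimal sketch (reusing D1 from the reproducer above, and assuming the dropfills!(dst, src) argument order):

D1_compact = dropfills!(Tensor(Sparse(Element(false))), D1)  # copy D1, dropping explicitly stored fill values
println("stored before: ", countstored(D1), ", after: ", countstored(D1_compact))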

willow-ahrens commented 2 weeks ago

In short, Finch doesn't look at values before storing them to check whether they are fill. Instead, the store operation is what introduces explicit values into a tensor, regardless of what the value was. Thus, the result of countstored reflects how many stores Finch was not able to compile away by the time we reach the assignment statement.
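
For example, a minimal sketch of that behavior (small made-up data; the expected count follows from the semantics described above):

src = Tensor(Dense(Element(0)), [0, 1, 0, 2])
dst = Tensor(SparseList(Element(0)))
@finch begin
    dst .= 0
    for i = _
        dst[i] = src[i]  # the Dense input forces a visit at every i, so every iteration performs a store
    end
end
countstored(dst)  # expected 4: the explicit zeros are stored too, since the store happens regardless of value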

kylebd99 commented 2 weeks ago

Ah, that makes sense and answers my question. Thanks for the thorough response! In Galley, I've fixed this by intermittently checking whether I would gain by compacting with dropfills!
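
Roughly, a minimal sketch of that check (the threshold is just illustrative):

if countstored(new_D) > 2 * sum(new_D)  # illustrative heuristic: many stored entries are fill values
    new_D = dropfills!(Tensor(Sparse(Element(false))), new_D)
end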

In general, what is stopping Finch from automatically including the if condition "expr != default" before every assignment? Does that just introduce unacceptable overhead in general?

willow-ahrens commented 2 weeks ago

We could benchmark how expensive that would be!

julia> A = Tensor(Dense(SparseList(Element(0.0))), fsprand(10_000, 10_000, 100_000))
10000×10000-Tensor
└─ Dense [:,1:10000]
   ├─ [:, 1]: SparseList (0.0) [1:10000]
   │  ├─ [422]: 0.0418729
   │  ├─ [868]: 0.804566
   │  ├─ ⋮
   │  ├─ [9533]: 0.0671509
   │  └─ [9665]: 0.723424
   ├─ [:, 2]: SparseList (0.0) [1:10000]
   │  ├─ [1673]: 0.528857
   │  ├─ [3411]: 0.576459
   │  ├─ ⋮
   │  ├─ [9553]: 0.597155
   │  └─ [9753]: 0.726254
   ├─ ⋮
   ├─ [:, 9999]: SparseList (0.0) [1:10000]
   │  ├─ [226]: 0.870514
   │  ├─ [1489]: 0.239856
   │  ├─ ⋮
   │  ├─ [7672]: 0.221962
   │  └─ [9740]: 0.17364
   └─ [:, 10000]: SparseList (0.0) [1:10000]
      ├─ [500]: 0.361668
      ├─ [2668]: 0.232563
      ├─ ⋮
      ├─ [9119]: 0.302905
      └─ [9128]: 0.810383

julia> x = Tensor(Dense(Element(0.0)), rand(10_000))
10000-Tensor
└─ Dense [1:10000]
   ├─ [1]: 0.465509
   ├─ [2]: 0.702242
   ├─ ⋮
   ├─ [9999]: 0.321236
   └─ [10000]: 0.71483

julia> y = Tensor(Dense(Element(0.0)))
0-Tensor
└─ Dense [1:0]

julia> @benchmark begin
           (y, A, x) = ($y, $A, $x)
           @finch begin
               y .= 0
               for j = _, i = _
                   y[j] += A[i, j] * x[i]
               end
           end
       end
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  100.000 μs … 144.334 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     101.375 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   102.333 μs ±   2.792 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

   ▅▇██▇█▆▅▄▃▄▄▄▃▂▃▃▃▃▃▃▁▁▁▁▂▁▁▁ ▁                              ▃
  █████████████████████████████████▆▇▇▇███▇▇▇▇▇▇▇▇▆▅▆▅▆▇▅▅▄▅▅▄▆ █
  100 μs        Histogram: log(frequency) by time        114 μs <

 Memory estimate: 160 bytes, allocs estimate: 3.

julia> @benchmark begin
           (y, A, x) = ($y, $A, $x)
           @finch begin
               y .= 0
               for j = _, i = _
                   let a = A[i, j]
                       if a != 0.0
                           y[j] += a * x[i]
                       end
                   end
               end
           end
       end
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  133.625 μs … 264.333 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     134.542 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   136.187 μs ±   3.610 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

   ▅▇█▆▅▃▁        ▂▄▄▂       ▂▃▃▃▂▁                             ▂
  ▆███████▇▆▆▅▆▅▂▇██████▇▆▅▆▇██████████▇▇▆▆▆████▇█▇▇▇▇▇▆▅▆▆▅▆▆▅ █
  134 μs        Histogram: log(frequency) by time        148 μs <

 Memory estimate: 160 bytes, allocs estimate: 3.
kylebd99 commented 2 weeks ago

That's super interesting. Looks like too much overhead to make it automatic everywhere, but worth a benchmark for sure.