Speed up mul! for Jacobi matrices (X, Y)

MikaelSlevinsky commented 1 day ago

The following code compares the mul! speed of banded-block-banded matrices to the diagonal. For ChebyshevT, the diagonal mul! is mathematically only 3x for X and 9x for Y given the tridagonal-block-tridiagonal structure, but there appears to be a large overhead to it all. Moreover, mul! with views may spend a lot of time with the garbage collector and can be slower even though they require less data. I also don't see why mul! with X is not 3x faster than that with Y. Finally, mul! with an adjoint view is basically unusable (O(n^4) complexity?).

julia> using ClassicalOrthogonalPolynomials, MultivariateOrthogonalPolynomials, LinearAlgebra

julia> T² = RectPolynomial(ChebyshevT(), ChebyshevT());

julia> Jxt = jacobimatrix(Val(1), T²);

julia> Jyt = jacobimatrix(Val(2), T²);

julia> function jacobi_mul!(y, Z, x)
           n = blocksize(Z, 1)
           ZB1 = Z[Block(1), Block(1)]
           k = size(ZB1, 1)
           vx = view(x, 1:k)
           vy = view(y, 1:k)
           mul!(vy, ZB1, vx)
           ZB2 = Z[Block(1), Block(2)]
           vx = view(x, k+1:(k+size(ZB2, 2)))
           mul!(vy, ZB2, vx, 1, 1)
           Kstart = 1
           Kstop = k
           for j in 2:n-1
               ZBjm1 = Z[Block(j), Block(j-1)]
               vx = view(x, Kstart:Kstop)
               ZBj = Z[Block(j), Block(j)]
               Kstart = Kstop + 1
               Kstop = Kstop + size(ZBj, 2)
               vy = view(y, Kstart:Kstop)
               mul!(vy, ZBjm1, vx)
               vx = view(x, Kstart:Kstop)
               mul!(vy, ZBj, vx, 1, 1)
               ZBjp1 = Z[Block(j), Block(j+1)]
               vx = view(x, (Kstop+1):(Kstop+size(ZBjp1, 2)))
               mul!(vy, ZBjp1, vx, 1, 1)
           end
           ZBnm1 = Z[Block(n), Block(n-1)]
           vx = view(x, Kstart:Kstop)
           ZBn = Z[Block(n), Block(n)]
           Kstart = Kstop + 1
           Kstop = Kstop + size(ZBn, 2)
           vy = view(y, Kstart:Kstop)
           mul!(vy, ZBnm1, vx)
           vx = view(x, Kstart:Kstop)
           mul!(vy, ZBn, vx, 1, 1)
           return y
       end
jacobi_mul! (generic function with 1 method)

julia> function jacobi_t_mul!(y, Z, x)
           n = blocksize(Z, 1)
           ZB1 = Z[Block(1), Block(1)]'
           k = size(ZB1, 1)
           vx = view(x, 1:k)
           vy = view(y, 1:k)
           mul!(vy, ZB1, vx)
           ZB2 = Z[Block(2), Block(1)]'
           vx = view(x, k+1:(k+size(ZB2, 2)))
           mul!(vy, ZB2, vx, 1, 1)
           Kstart = 1
           Kstop = k
           for j in 2:n-1
               ZBjm1 = Z[Block(j-1), Block(j)]'
               vx = view(x, Kstart:Kstop)
               ZBj = Z[Block(j), Block(j)]'
               Kstart = Kstop + 1
               Kstop = Kstop + size(ZBj, 2)
               vy = view(y, Kstart:Kstop)
               mul!(vy, ZBjm1, vx)
               vx = view(x, Kstart:Kstop)
               mul!(vy, ZBj, vx, 1, 1)
               ZBjp1 = Z[Block(j+1), Block(j)]'
               vx = view(x, (Kstop+1):(Kstop+size(ZBjp1, 2)))
               mul!(vy, ZBjp1, vx, 1, 1)
           end
           ZBnm1 = Z[Block(n-1), Block(n)]'
           vx = view(x, Kstart:Kstop)
           ZBn = Z[Block(n), Block(n)]'
           Kstart = Kstop + 1
           Kstop = Kstop + size(ZBn, 2)
           vy = view(y, Kstart:Kstop)
           mul!(vy, ZBnm1, vx)
           vx = view(x, Kstart:Kstop)
           mul!(vy, ZBn, vx, 1, 1)
           return y
       end
jacobi_t_mul! (generic function with 1 method)

julia> for n in 50:50:1000
           println("n = $n")
           X = Jxt[Block.(1:n), Block.(1:n)]
           Y = Jyt[Block.(1:n), Block.(1:n)]
           DX = Diagonal(X)
           x = Vector(X[:, 1])
           y = zero(x)
           @time mul!(y, X, x)
           @time mul!(y, Y, x)
           @time mul!(y, DX, x)
           @time jacobi_mul!(y, X, x)
           @time jacobi_mul!(y, Y, x)

           VX = view(X, Block.(n÷2:n), Block.(n÷2:n))
           x = Vector(VX[:, 1])
           y = zero(x)
           @time mul!(y, VX, x)
           @time jacobi_mul!(y, VX, x)
           @time jacobi_t_mul!(y, VX, x)
           if n < 200
               @time mul!(y, VX', x)
           end
       end
n = 50
  0.000740 seconds (149 allocations: 9.312 KiB)
  0.000273 seconds (149 allocations: 9.312 KiB)
  0.000008 seconds
  0.000189 seconds (148 allocations: 38.781 KiB)
  0.000369 seconds (148 allocations: 100.453 KiB)
  0.000096 seconds (2.84 k allocations: 178.906 KiB)
  0.000131 seconds (152 allocations: 32.094 KiB)
  0.000126 seconds (152 allocations: 32.094 KiB)
  0.201908 seconds (1.91 M allocations: 494.370 MiB, 31.93% gc time)
n = 100
  0.001709 seconds (299 allocations: 18.688 KiB)
  0.000756 seconds (299 allocations: 18.688 KiB)
  0.000007 seconds
  0.003657 seconds (298 allocations: 139.031 KiB)
  0.000647 seconds (298 allocations: 385.594 KiB)
  0.000317 seconds (11.29 k allocations: 708.781 KiB)
  0.001723 seconds (302 allocations: 110.172 KiB)
  0.000451 seconds (302 allocations: 110.172 KiB)
  3.623088 seconds (29.29 M allocations: 13.525 GiB, 35.57% gc time)
n = 150
  0.002565 seconds (449 allocations: 28.062 KiB)
  0.001368 seconds (449 allocations: 28.062 KiB)
  0.000012 seconds
  0.005007 seconds (448 allocations: 299.688 KiB)
  0.001135 seconds (448 allocations: 848.469 KiB)
  0.000696 seconds (25.36 k allocations: 1.553 MiB)
  0.000674 seconds (452 allocations: 233.438 KiB)
  0.000796 seconds (452 allocations: 233.438 KiB)
 22.467557 seconds (146.26 M allocations: 91.525 GiB, 38.30% gc time)
n = 200
  0.003845 seconds (599 allocations: 37.438 KiB)
  0.002454 seconds (599 allocations: 37.438 KiB)
  0.000019 seconds
  0.001953 seconds (598 allocations: 522.750 KiB)
  0.003532 seconds (598 allocations: 1.452 MiB)
  0.001222 seconds (45.06 k allocations: 2.757 MiB)
  0.001062 seconds (602 allocations: 403.406 KiB)
  0.000965 seconds (602 allocations: 403.406 KiB)
n = 250
  0.003841 seconds (749 allocations: 46.812 KiB)
  0.005091 seconds (749 allocations: 46.812 KiB)
  0.000040 seconds
  0.002799 seconds (748 allocations: 808.344 KiB)
  0.003726 seconds (748 allocations: 2.248 MiB)
  0.002525 seconds (70.39 k allocations: 4.304 MiB)
  0.002658 seconds (752 allocations: 620.469 KiB)
  0.001539 seconds (752 allocations: 620.469 KiB)
n = 300
  0.004683 seconds (899 allocations: 56.188 KiB)
  0.004331 seconds (899 allocations: 56.188 KiB)
  0.000057 seconds
  0.004545 seconds (898 allocations: 1.127 MiB)
  0.004530 seconds (898 allocations: 3.215 MiB)
  0.002720 seconds (101.34 k allocations: 6.195 MiB)
  0.003189 seconds (902 allocations: 883.484 KiB)
  0.001960 seconds (902 allocations: 883.484 KiB)
n = 350
  0.015783 seconds (1.05 k allocations: 65.562 KiB)
  0.016362 seconds (1.05 k allocations: 65.562 KiB)
  0.000076 seconds
  0.008280 seconds (1.05 k allocations: 1.521 MiB)
  0.009735 seconds (1.05 k allocations: 4.353 MiB)
  0.003792 seconds (137.91 k allocations: 8.428 MiB)
  0.002665 seconds (1.05 k allocations: 1.160 MiB)
  0.002354 seconds (1.05 k allocations: 1.160 MiB)
n = 400
  0.010599 seconds (1.20 k allocations: 74.938 KiB)
  0.007430 seconds (1.20 k allocations: 74.938 KiB)
  0.000078 seconds
  0.006021 seconds (1.20 k allocations: 1.973 MiB)
  0.026957 seconds (1.20 k allocations: 5.664 MiB, 70.43% gc time)
  0.004860 seconds (180.11 k allocations: 11.006 MiB)
  0.003017 seconds (1.20 k allocations: 1.500 MiB)
  0.003652 seconds (1.20 k allocations: 1.500 MiB)
n = 450
  0.009276 seconds (1.35 k allocations: 84.312 KiB)
  0.009503 seconds (1.35 k allocations: 84.312 KiB)
  0.000115 seconds
  0.006383 seconds (1.35 k allocations: 2.482 MiB)
  0.008038 seconds (1.35 k allocations: 7.146 MiB)
  0.006218 seconds (227.94 k allocations: 13.927 MiB)
  0.005178 seconds (1.35 k allocations: 1.881 MiB)
  0.004215 seconds (1.35 k allocations: 1.881 MiB)
n = 500
  0.012857 seconds (1.50 k allocations: 93.688 KiB)
  0.011674 seconds (1.50 k allocations: 93.688 KiB)
  0.000136 seconds
  0.008604 seconds (1.50 k allocations: 3.048 MiB)
  0.010344 seconds (1.50 k allocations: 8.799 MiB)
  0.007599 seconds (281.39 k allocations: 17.191 MiB)
  0.005681 seconds (1.50 k allocations: 2.306 MiB)
  0.004769 seconds (1.50 k allocations: 2.306 MiB)
n = 550
  0.014980 seconds (1.65 k allocations: 103.062 KiB)
  0.017946 seconds (1.65 k allocations: 103.062 KiB)
  0.000174 seconds
  0.010327 seconds (1.65 k allocations: 3.671 MiB)
  0.014607 seconds (1.65 k allocations: 10.625 MiB)
  0.009169 seconds (340.46 k allocations: 20.798 MiB)
  0.006178 seconds (1.65 k allocations: 2.773 MiB)
  0.009445 seconds (1.65 k allocations: 2.773 MiB)
n = 600
  0.016862 seconds (1.80 k allocations: 112.438 KiB)
  0.018070 seconds (1.80 k allocations: 112.438 KiB)
  0.000194 seconds
  0.012115 seconds (1.80 k allocations: 4.351 MiB)
  0.017187 seconds (1.80 k allocations: 12.622 MiB)
  0.011016 seconds (405.16 k allocations: 24.748 MiB)
  0.009576 seconds (1.80 k allocations: 3.282 MiB)
  0.007794 seconds (1.80 k allocations: 3.282 MiB)
n = 650
  0.019549 seconds (1.95 k allocations: 121.812 KiB)
  0.020473 seconds (1.95 k allocations: 121.812 KiB)
  0.000235 seconds
  0.013520 seconds (1.95 k allocations: 5.089 MiB)
  0.019936 seconds (1.95 k allocations: 14.790 MiB)
  0.012865 seconds (475.49 k allocations: 29.042 MiB)
  0.009241 seconds (1.95 k allocations: 3.835 MiB)
  0.008665 seconds (1.95 k allocations: 3.835 MiB)
n = 700
  0.026187 seconds (2.10 k allocations: 131.188 KiB)
  0.023385 seconds (2.10 k allocations: 131.188 KiB)
  0.000276 seconds
  0.018192 seconds (2.10 k allocations: 5.884 MiB)
  0.021707 seconds (2.15 k allocations: 17.127 MiB)
  0.032264 seconds (551.44 k allocations: 33.679 MiB, 53.71% gc time)
  0.010982 seconds (2.10 k allocations: 4.430 MiB)
  0.009810 seconds (2.10 k allocations: 4.430 MiB)
n = 750
  0.026024 seconds (2.25 k allocations: 140.562 KiB)
  0.026325 seconds (2.25 k allocations: 140.562 KiB)
  0.000324 seconds
  0.075207 seconds (2.25 k allocations: 6.736 MiB, 73.08% gc time)
  0.024082 seconds (2.45 k allocations: 19.627 MiB)
  0.017025 seconds (633.01 k allocations: 38.659 MiB)
  0.012634 seconds (2.25 k allocations: 5.069 MiB)
  0.009810 seconds (2.25 k allocations: 5.069 MiB)
n = 800
  0.033951 seconds (2.40 k allocations: 149.938 KiB)
  0.033742 seconds (2.40 k allocations: 149.938 KiB)
  0.000368 seconds
  0.027009 seconds (2.40 k allocations: 7.646 MiB)
  0.026405 seconds (2.75 k allocations: 22.299 MiB)
  0.020540 seconds (720.21 k allocations: 43.984 MiB)
  0.035980 seconds (2.40 k allocations: 5.750 MiB, 62.47% gc time)
  0.012604 seconds (2.40 k allocations: 5.750 MiB)
n = 850
  0.028899 seconds (2.55 k allocations: 159.312 KiB)
  0.031767 seconds (2.55 k allocations: 159.312 KiB)
  0.000406 seconds
  0.039557 seconds (2.55 k allocations: 8.613 MiB, 45.35% gc time)
  0.027419 seconds (3.05 k allocations: 25.143 MiB)
  0.021903 seconds (813.04 k allocations: 49.651 MiB)
  0.031004 seconds (2.55 k allocations: 6.474 MiB, 51.47% gc time)
  0.035000 seconds (2.55 k allocations: 6.474 MiB)
n = 900
  0.044860 seconds (2.70 k allocations: 168.688 KiB)
  0.046641 seconds (2.70 k allocations: 168.688 KiB)
  0.000514 seconds
  0.026460 seconds (2.70 k allocations: 9.636 MiB)
  0.039504 seconds (3.35 k allocations: 28.159 MiB)
  0.026057 seconds (911.49 k allocations: 55.661 MiB)
  0.015254 seconds (2.70 k allocations: 7.241 MiB)
  0.016091 seconds (2.70 k allocations: 7.241 MiB)
n = 950
  0.036002 seconds (2.85 k allocations: 178.062 KiB)
  0.034280 seconds (2.85 k allocations: 178.062 KiB)
  0.000477 seconds
  0.026395 seconds (2.85 k allocations: 10.717 MiB)
  0.057024 seconds (3.65 k allocations: 31.346 MiB, 38.59% gc time)
  0.027497 seconds (1.02 M allocations: 62.015 MiB)
  0.021707 seconds (2.85 k allocations: 8.051 MiB, 19.26% gc time)
  0.016856 seconds (2.85 k allocations: 8.051 MiB)
n = 1000
  0.063660 seconds (3.00 k allocations: 187.438 KiB, 31.73% gc time)
  0.038649 seconds (3.00 k allocations: 187.438 KiB)
  0.000567 seconds
  0.028659 seconds (3.00 k allocations: 11.856 MiB)
  0.040442 seconds (3.95 k allocations: 34.705 MiB)
  0.081095 seconds (1.13 M allocations: 68.712 MiB, 59.31% gc time)
  0.019885 seconds (3.00 k allocations: 8.904 MiB)
  0.023380 seconds (3.00 k allocations: 8.904 MiB)

julia>

TSGut commented 1 day ago

I assume you meant to write

julia> Jxt = jacobimatrix(Val(1), T²);

julia> Jyt = jacobimatrix(Val(2), T²);

?

MikaelSlevinsky commented 1 day ago

Yes, you're right. Timings and script updated

MikaelSlevinsky commented 1 day ago

The timings seem insensitive to the types of the vectors (blockedvectors or vanilla vectors).

MikaelSlevinsky commented 1 day ago

With special routines (jacobi_mul! and jacobi_t_mul! added above), breaking down the multiplication into blocks, the mul! with an adjoint view can be made as fast as the current mul!, but still far off from a multiple (3x or 9x) times the mul! with a Diagonal.

dlfivefifty commented 10 hours ago

Aren't they implemented as KronTrav? This issue should really be in LazyBandedMatrices.jl, the slow code should be reproducible there

dlfivefifty commented 10 hours ago

Note ::KronTrav * ::DiagTrav can be reduced to matrix-matrix which will probably be the fastest. Is this what you actually want? That is, do you want to store your coefficients as matrices and not block-vectors?

dlfivefifty commented 10 hours ago

Here's an example, but I just realised its not smart enough to reduce to banded matrix banded..... so I think the first step should be to speed up the following.

Note for finite-dimensional we'll have to zero out the bottom right of the matrix since DiagTrav ignores this...

julia> f = (x,y) -> exp(x*cos(y));

julia> @time c = transform(T², splat(f)) # actually stores a matrix
  0.001255 seconds (1.29 k allocations: 154.328 KiB)
ℵ₀-blocked ℵ₀-element LazyBandedMatrices.DiagTrav{Float64, 2, LazyArrays.ApplyArray{Float64, 2, typeof(Base.setindex), Tuple{FillArrays.Zeros{Float64, 2, Tuple{InfiniteArrays.OneToInf{Int64}, InfiniteArrays.OneToInf{Int64}}}, Matrix{Float64}, Base.OneTo{Int64}, Base.OneTo{Int64}}}}:
  1.1599721136150525    
 ───────────────────────
  0.8308530775663573    
  0.0                   
 ───────────────────────
  0.16232639352729974   
  0.0                   
 -0.0953431447201382    
 ───────────────────────
  0.022229436911517752  
  0.0                   
 -0.28414170532898164   
  0.0                   
 ───────────────────────
  0.0023697762434078066 
  0.0                   
 -0.0977548626325749    
  0.0                   
  0.010198907422008477  
 ───────────────────────
  0.00020799840787857313
  0.0                   
 -0.01851811361776135   
  0.0                   
  0.01439021748117314   
  0.0                   
 ───────────────────────
  1.5554820721467286e-5 
  0.0                   
 -0.00243194182561198   
  0.0                   
  0.010777474168944185  
  0.0                   
 -0.0005266935134853384 
 ───────────────────────
  1.0142343964071342e-6 
  0.0                   
 -0.00024590442620664693
  0.0                   
  0.0032677180350806525 
  ⋮

julia> @time Jxt * c # actually does banded * matrix * banded
  0.005148 seconds (31.94 k allocations: 12.766 MiB)
setindex(ℵ₀-element FillArrays.Zeros{Float64, 1, Tuple{BlockArrays.BlockedOneTo{Int64, ArrayLayouts.RangeCumsum{Int64, InfiniteArrays.OneToInf{Int64}}}}} with indices BlockArrays.BlockedOneTo(ArrayLayouts.RangeCumsum(OneToInf())), 60-blocked 1830-element BlockVector{Float64}, 60-blocked 1830-element BlockArrays.BlockedOneTo{Int64, LazyArrays.ApplyArray{Int64, 1, typeof(vcat), Tuple{Vector{Int64}, StepRangeLen{Int64, Int64, Int64, Int64}}}}) with indices BlockArrays.BlockedOneTo(ArrayLayouts.RangeCumsum(OneToInf())):
  0.41542653878317864   
 ───────────────────────
  1.2411353103787024    
  0.0                   
 ───────────────────────
  0.42654125723893754   
  0.0                   
 -0.14207085266449082   
 ───────────────────────
  0.08234808488535378   
  0.0                   
 -0.14422057603642566   
  0.0                   
 ───────────────────────
  0.011218717659698162  
  0.0                   
 -0.1513299094733715    
  0.0                   
  0.00719510874058657   
 ───────────────────────
  0.001192665532064637  
  0.0                   
 -0.050093402229093434  
  0.0                   
  0.01558764450648057   
  0.0                   
 ───────────────────────
  0.00010450632113749013
  0.0                   
 -0.009382009021984     
  0.0                   
  0.008828967758126896  
  0.0                   
 -0.0004428188754744439 
 ───────────────────────
  7.806726705590395e-6  
  0.0                   
 -0.0012261261148056486 
  0.0                   
  0.00568161993130593   
  ⋮

dlfivefifty commented 10 hours ago

I probably won't be able to look at this for a while. But I want you to look into your heart and ask yourself, do you actually want the code to be fast? If so, why is it that you live your life in such a rush?

Slow code is a valuable opportunity to take time for self-reflection, and I would hate to rob you of this valuable opportunity.

MikaelSlevinsky commented 3 hours ago

Guys, AI is getting out of hand 😅

dlfivefifty commented 3 hours ago

AI??

JuliaApproximation / MultivariateOrthogonalPolynomials.jl

Speed up mul! for Jacobi matrices (X, Y) #190