JuliaML / MLUtils.jl

Utilities and abstractions for Machine Learning tasks
MIT License
107 stars 20 forks source link

DataLoader slowdown throughout iterations with parallel=true #148

Open jeremiedb opened 1 year ago

jeremiedb commented 1 year ago

DataLoader performance degrades through iterations when parallel=true.

Following is a MWE illustrating the issue:

using Images
using StatsBase: sample, shuffle
using DataAugmentation
using Flux
using TestImages
import Base: length, getindex
using BenchmarkTools

const im_size = (224, 224)
imgs = rand(["chelsea", "coffee"], 1000)

struct ImageContainer{T<:Vector}
    img::T
end

length(data::ImageContainer) = length(data.img)
tfm = DataAugmentation.compose(ScaleKeepAspect(im_size))

function getindex(data::ImageContainer, idx::Int)
    path = data.img[idx]
    _img = testimage(path)
    _img = apply(tfm, Image(_img))
    img = collect(channelview(float32.(itemdata(_img))))
    return img
end

data = ImageContainer(imgs)
deval =
    Flux.DataLoader(data, batchsize = 32, parallel = true, collate = true, partial = true)

function data_loop(data)
    count = 0
    for x in data
        count += last(size(x))
    end
    return nothing
end

for i = 1:20
    @time data_loop(deval)
end

If lauched with julia --project=@. --threads=8 .\data-test.jl:

9.125682 seconds (23.69 M allocations: 5.049 GiB, 7.29% gc time, 73.35% compilation time: 10% of which was recompilation)
  2.291844 seconds (1.80 M allocations: 3.907 GiB, 36.56% gc time)
  1.763683 seconds (1.80 M allocations: 3.907 GiB, 17.21% gc time)
  1.754396 seconds (1.80 M allocations: 3.907 GiB, 16.14% gc time)
  1.856838 seconds (1.80 M allocations: 3.907 GiB, 19.07% gc time)
  1.771068 seconds (1.80 M allocations: 3.907 GiB, 13.99% gc time)
  1.916222 seconds (1.80 M allocations: 3.907 GiB, 18.39% gc time)
  1.957732 seconds (1.80 M allocations: 3.907 GiB, 13.32% gc time)
  2.336840 seconds (1.80 M allocations: 3.907 GiB, 14.05% gc time)
  2.433143 seconds (1.80 M allocations: 3.907 GiB, 15.70% gc time)
  2.524151 seconds (1.80 M allocations: 3.907 GiB, 12.57% gc time)
  2.289293 seconds (1.80 M allocations: 3.907 GiB, 10.86% gc time)
  2.628618 seconds (1.80 M allocations: 3.907 GiB, 14.58% gc time)
  2.495031 seconds (1.80 M allocations: 3.907 GiB, 11.85% gc time)
  2.509507 seconds (1.80 M allocations: 3.907 GiB, 13.24% gc time)
  2.513830 seconds (1.80 M allocations: 3.907 GiB, 11.68% gc time)
  2.531557 seconds (1.80 M allocations: 3.907 GiB, 13.33% gc time)
  2.514478 seconds (1.80 M allocations: 3.907 GiB, 10.86% gc time)
  2.724194 seconds (1.80 M allocations: 3.907 GiB, 12.45% gc time)
  2.689008 seconds (1.80 M allocations: 3.907 GiB, 13.42% gc time)
  2.399 s (1801661 allocations: 3.91 GiB)
  2.383 s (1801654 allocations: 3.91 GiB)
  2.513 s (1801675 allocations: 3.91 GiB)

If parallel=false, then performance remains stable throughout the iterations:

 13.533416 seconds (19.77 M allocations: 4.849 GiB, 4.09% gc time, 39.94% compilation time: 12% of which was recompilation)
  7.463501 seconds (1.80 M allocations: 3.910 GiB, 4.21% gc time)
  7.458656 seconds (1.80 M allocations: 3.910 GiB, 4.08% gc time)
  7.461299 seconds (1.80 M allocations: 3.910 GiB, 4.06% gc time)
  7.568510 seconds (1.80 M allocations: 3.910 GiB, 4.06% gc time)
  7.491400 seconds (1.80 M allocations: 3.910 GiB, 4.02% gc time)
  7.683299 seconds (1.80 M allocations: 3.910 GiB, 3.97% gc time)
  7.523333 seconds (1.80 M allocations: 3.910 GiB, 4.21% gc time)
  7.472339 seconds (1.80 M allocations: 3.910 GiB, 4.00% gc time)
  7.545891 seconds (1.80 M allocations: 3.910 GiB, 4.04% gc time)
jeremiedb commented 1 year ago

Also, deterioration appears more accute with greater number of threads:

16 Threads (on a 16 threads AMD ryzen):

  9.203387 seconds (23.63 M allocations: 5.051 GiB, 6.94% gc time, 77.60% compilation time: 11% of which was recompilation)
  1.956289 seconds (1.80 M allocations: 3.913 GiB, 41.04% gc time)
  2.233191 seconds (1.80 M allocations: 3.913 GiB, 22.36% gc time)
  2.827008 seconds (1.80 M allocations: 3.913 GiB, 11.75% gc time)
  3.290497 seconds (1.80 M allocations: 3.913 GiB, 9.42% gc time)
  3.628016 seconds (1.80 M allocations: 3.913 GiB, 12.39% gc time)
  3.841822 seconds (1.80 M allocations: 3.913 GiB, 12.04% gc time)
  3.442708 seconds (1.80 M allocations: 3.913 GiB, 7.52% gc time)
  3.782556 seconds (1.80 M allocations: 3.913 GiB, 12.31% gc time)
  3.887522 seconds (1.80 M allocations: 3.913 GiB, 10.90% gc time)
  3.876421 seconds (1.80 M allocations: 3.913 GiB, 9.79% gc time)
  3.781522 seconds (1.80 M allocations: 3.913 GiB, 9.09% gc time)
  3.831761 seconds (1.80 M allocations: 3.913 GiB, 8.49% gc time)
  3.959064 seconds (1.80 M allocations: 3.913 GiB, 12.16% gc time)
  3.694886 seconds (1.80 M allocations: 3.913 GiB, 6.88% gc time)
  4.554031 seconds (1.80 M allocations: 3.913 GiB, 16.07% gc time)
  4.442375 seconds (1.80 M allocations: 3.913 GiB, 9.45% gc time)
  4.486033 seconds (1.80 M allocations: 3.913 GiB, 9.17% gc time)
  4.318760 seconds (1.80 M allocations: 3.913 GiB, 6.02% gc time)
  5.050747 seconds (1.80 M allocations: 3.913 GiB, 15.19% gc time)

This was run with current latest releases:

  [88a5189c] DataAugmentation v0.2.11
  [587475ba] Flux v0.13.13
  [916415d5] Images v0.25.2
  [dbeba491] Metalhead v0.7.4
  [9a3f8284] Random
  [10745b16] Statistics

MLUtils = 0.4.1

nikopj commented 1 year ago

I am also observing this.