JuliaIO / HDF5.jl

Save and load data in the HDF5 file format from Julia
https://juliaio.github.io/HDF5.jl
MIT License
386 stars 140 forks source link

Add initial support for H5Dchunk_iter #1031

Closed mkitti closed 1 year ago

mkitti commented 1 year ago

This adds support for H5Dchunk_iter which allows for efficient access to chunk information. This is particularly useful when trying to obtain all the chunk information for a dataset. It scales significantly better than trying to query the chunk information by index.

Also this adds basic HDF5 1.14 support. The documentation is also improved along the way.

mkitti commented 1 year ago

This looks promising.

julia> struct ChunkInfo{N}
           offset::NTuple{N, Int}
           filter_mask::UInt32
           addr::UInt64
           size::UInt64
       end

julia> Base.show(io::IO, ::MIME"text/plain", ci::ChunkInfo) = print(io, @sprintf("%10s", ci.offset), "\t", ci.filter_mask, "\t", ci.addr, "\t", ci.size)

julia> h5open("simplechunked2d.h5","w", libver_bounds=v"1.8", meta_block_size=4096) do h5f
           d = create_dataset(h5f, "test", UInt8, (256,256), alloc_time=:early, chunk=(16,16))
           for (i, ci) in enumerate(CartesianIndices((16, 16)))
               # Number the chunks
               d[(ci[1]-1)*16 .+ (1:16), (ci[2]-1)*16 .+ (1:16)] = ones(UInt8, 16, 16) * (i-1)
           end
       end;

julia> out = h5open("simplechunked2d.h5") do h5f
           h5d = h5f["test"]
           info = ChunkInfo{2}[]
           HDF5.API.h5d_chunk_iter(h5d,0) do offset, filter_mask, addr, size
               push!(info, ChunkInfo{2}(unsafe_load(Ptr{Tuple{Int, Int}}(offset)), filter_mask, addr, size))
               return nothing
           end
           return info
       end
256-element Vector{ChunkInfo{2}}:
     (0, 0) 0   4096    256
    (0, 16) 0   4352    256
    (0, 32) 0   4608    256
    (0, 48) 0   4864    256
    (0, 64) 0   5120    256
    (0, 80) 0   5376    256
    (0, 96) 0   5632    256
   (0, 112) 0   5888    256
   (0, 128) 0   6144    256
   (0, 144) 0   6400    256
   (0, 160) 0   6656    256
   (0, 176) 0   6912    256
   (0, 192) 0   7168    256
   (0, 208) 0   7424    256
   (0, 224) 0   7680    256
   (0, 240) 0   7936    256
    (16, 0) 0   8192    256
   (16, 16) 0   8448    256
   (16, 32) 0   8704    256
   (16, 48) 0   8960    256
   (16, 64) 0   9216    256
   (16, 80) 0   9472    256
   (16, 96) 0   9728    256
  (16, 112) 0   9984    256
 ⋮
 (224, 128) 0   76912   256
 (224, 144) 0   77168   256
 (224, 160) 0   77424   256
 (224, 176) 0   77680   256
 (224, 192) 0   82032   256
 (224, 208) 0   82288   256
 (224, 224) 0   82544   256
 (224, 240) 0   82800   256
   (240, 0) 0   83056   256
  (240, 16) 0   83312   256
  (240, 32) 0   83568   256
  (240, 48) 0   83824   256
  (240, 64) 0   84080   256
  (240, 80) 0   84336   256
  (240, 96) 0   84592   256
 (240, 112) 0   84848   256
 (240, 128) 0   85104   256
 (240, 144) 0   85360   256
 (240, 160) 0   85616   256
 (240, 176) 0   85872   256
 (240, 192) 0   86128   256
 (240, 208) 0   86384   256
 (240, 224) 0   86640   256
 (240, 240) 0   86896   256

julia> open("simplechunked2d.h5") do f
           seek(f, out[60].addr)
           data = unique(read(f, 256))
           println(only(data))
       end
59

julia> open("simplechunked2d.h5") do f
           for info in out
               seek(f, info.addr)
               data = unique(read(f, 256))
               print(only(data), " ")
           end
       end
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 
mkitti commented 1 year ago

I had to work on the tests a bit to get the tests to pass.

Test Summary: | Pass  Broken  Total     Time
HDF5.jl       | 1107       4   1111  1m47.7s
     Testing HDF5 tests passed 

Upstream packaging efforts seem to be proceeding for HDF5 1.14.0: https://github.com/conda-forge/hdf5-feedstock/pull/188 https://github.com/msys2/MINGW-packages/pull/15033

mkitti commented 1 year ago

@simonbyrne @musm could you take a look this? These changes allow tests to pass with HDF5 1.14.0, and I've implemented some basic functionality for h5d_chunk_iter.

mkitti commented 1 year ago

I did some performance testing and the scaling is better with the H5Dchunk_iter:

julia> using HDF5

julia> h5open("simplechunked2d.h5","w", libver_bounds=v"1.8", meta_block_size=4096) do h5f
           for sz in [(32,32), (64,64), (128,128), (256,256), (512,512), (1024,1024), (2048, 2048)]
               d = create_dataset(h5f, "test$sz", UInt8, sz, alloc_time=:early, chunk=(16,16))
               d[:,:] == 2
               iter_time = @elapsed infos_by_iter = HDF5._get_chunk_info_all_by_iter(d)
               indx_time = @elapsed infos_by_index = HDF5._get_chunk_info_all_by_index(d)
               @assert infos_by_iter == infos_by_index
               @show sz iter_time indx_time indx_time/iter_time
           end
       end
sz = (32, 32)
iter_time = 4.0029e-5
indx_time = 2.7406e-5
indx_time / iter_time = 0.6846536261210623
sz = (64, 64)
iter_time = 5.3692e-5
indx_time = 7.7566e-5
indx_time / iter_time = 1.4446472472621619
sz = (128, 128)
iter_time = 0.000194085
indx_time = 0.000456561
indx_time / iter_time = 2.3523765360537907
sz = (256, 256)
iter_time = 0.000717275
indx_time = 0.004541661
indx_time / iter_time = 6.331826705238576
sz = (512, 512)
iter_time = 0.003004693
indx_time = 0.048780859
indx_time / iter_time = 16.234889554440336
sz = (1024, 1024)
iter_time = 0.011674653
indx_time = 0.662931619
indx_time / iter_time = 56.78383922845501
sz = (2048, 2048)
iter_time = 0.064214971
indx_time = 13.353451558
indx_time / iter_time = 207.94919549212287
simonbyrne commented 1 year ago

Yeah, I've noticed the iter funcs are much more efficient in other cases. It's a shame, since they're kind of clunky.

mkitti commented 1 year ago

Is there a way to turn these into a Julia iterator?

simonbyrne commented 1 year ago

Not that I can think of.

mkitti commented 1 year ago

What if we launch a task to call the HDF5 iterate function and then wait to take from a Channel? When we get back to Julia in the callback, we push the iterable information into the channel and iterate.

We may need a second channel for the callback to wait on and possibly receive a signal to stop iteration if the iterable object gets garbage collected.

mkitti commented 1 year ago

@musm please review and merge when ready.

mkitti commented 1 year ago

I rebased this on top of #1054