mkitti closed this pull request 1 year ago.
This looks promising.
julia> struct ChunkInfo{N}
# Per-chunk metadata collected from the H5Dchunk_iter callback (see below).
offset::NTuple{N, Int}  # logical offset of the chunk within the dataset, in elements
filter_mask::UInt32     # HDF5 filter mask for this chunk (0 here = no filters skipped)
addr::UInt64            # byte offset of the raw chunk data within the file
size::UInt64            # size of the stored chunk in bytes
end
# Compact one-line REPL display: right-padded offset, then filter_mask, addr,
# and size, tab-separated (matches the table printed below).
# NOTE(review): @sprintf requires `using Printf`, which is not shown in this transcript.
julia> Base.show(io::IO, ::MIME"text/plain", ci::ChunkInfo) = print(io, @sprintf("%10s", ci.offset), "\t", ci.filter_mask, "\t", ci.addr, "\t", ci.size)
# Create a 256x256 UInt8 dataset chunked into 16x16 tiles. Each tile is filled
# with its zero-based linear chunk number, so chunks are easy to identify when
# their raw bytes are read back directly from the file later on.
julia> h5open("simplechunked2d.h5","w", libver_bounds=v"1.8", meta_block_size=4096) do h5f
d = create_dataset(h5f, "test", UInt8, (256,256), alloc_time=:early, chunk=(16,16))
for (i, ci) in enumerate(CartesianIndices((16, 16)))
# Number the chunks: chunk at cartesian index ci gets the constant value i-1
d[(ci[1]-1)*16 .+ (1:16), (ci[2]-1)*16 .+ (1:16)] = ones(UInt8, 16, 16) * (i-1)
end
end;
# Collect metadata for every chunk of "test" via the H5Dchunk_iter callback.
julia> out = h5open("simplechunked2d.h5") do h5f
h5d = h5f["test"]
info = ChunkInfo{2}[]
HDF5.API.h5d_chunk_iter(h5d,0) do offset, filter_mask, addr, size
# `offset` arrives as a raw C pointer to the chunk's coordinates; it is
# reinterpreted as a 2-tuple of Int. NOTE(review): assumes hsize_t matches
# Int width on this platform — TODO confirm.
push!(info, ChunkInfo{2}(unsafe_load(Ptr{Tuple{Int, Int}}(offset)), filter_mask, addr, size))
# returning nothing continues iteration over the remaining chunks
return nothing
end
return info
end
256-element Vector{ChunkInfo{2}}:
(0, 0) 0 4096 256
(0, 16) 0 4352 256
(0, 32) 0 4608 256
(0, 48) 0 4864 256
(0, 64) 0 5120 256
(0, 80) 0 5376 256
(0, 96) 0 5632 256
(0, 112) 0 5888 256
(0, 128) 0 6144 256
(0, 144) 0 6400 256
(0, 160) 0 6656 256
(0, 176) 0 6912 256
(0, 192) 0 7168 256
(0, 208) 0 7424 256
(0, 224) 0 7680 256
(0, 240) 0 7936 256
(16, 0) 0 8192 256
(16, 16) 0 8448 256
(16, 32) 0 8704 256
(16, 48) 0 8960 256
(16, 64) 0 9216 256
(16, 80) 0 9472 256
(16, 96) 0 9728 256
(16, 112) 0 9984 256
⋮
(224, 128) 0 76912 256
(224, 144) 0 77168 256
(224, 160) 0 77424 256
(224, 176) 0 77680 256
(224, 192) 0 82032 256
(224, 208) 0 82288 256
(224, 224) 0 82544 256
(224, 240) 0 82800 256
(240, 0) 0 83056 256
(240, 16) 0 83312 256
(240, 32) 0 83568 256
(240, 48) 0 83824 256
(240, 64) 0 84080 256
(240, 80) 0 84336 256
(240, 96) 0 84592 256
(240, 112) 0 84848 256
(240, 128) 0 85104 256
(240, 144) 0 85360 256
(240, 160) 0 85616 256
(240, 176) 0 85872 256
(240, 192) 0 86128 256
(240, 208) 0 86384 256
(240, 224) 0 86640 256
(240, 240) 0 86896 256
# Spot-check: read the raw 256 bytes of the 60th reported chunk straight from
# the file and confirm they are all the expected chunk number (59, zero-based).
julia> open("simplechunked2d.h5") do f
seek(f, out[60].addr)
data = unique(read(f, 256))
println(only(data))
end
59
# Same check for every chunk: each chunk's raw bytes must be the single
# constant value it was numbered with, printed in iteration order below.
julia> open("simplechunked2d.h5") do f
for info in out
seek(f, info.addr)
data = unique(read(f, 256))
# `only` throws if the chunk is not uniformly filled with one value
print(only(data), " ")
end
end
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
I had to work on the tests a bit to get them to pass.
Test Summary: | Pass Broken Total Time
HDF5.jl | 1107 4 1111 1m47.7s
Testing HDF5 tests passed
Upstream packaging efforts seem to be proceeding for HDF5 1.14.0: https://github.com/conda-forge/hdf5-feedstock/pull/188 https://github.com/msys2/MINGW-packages/pull/15033
@simonbyrne @musm could you take a look at this? These changes allow tests to pass with HDF5 1.14.0, and I've implemented some basic functionality for `h5d_chunk_iter`.
I did some performance testing and the scaling is better with `H5Dchunk_iter`:
julia> using HDF5
# Benchmark chunk-info retrieval by H5Dchunk_iter vs. by per-chunk index
# queries, for growing dataset sizes; both paths must return the same result.
julia> h5open("simplechunked2d.h5","w", libver_bounds=v"1.8", meta_block_size=4096) do h5f
for sz in [(32,32), (64,64), (128,128), (256,256), (512,512), (1024,1024), (2048, 2048)]
d = create_dataset(h5f, "test$sz", UInt8, sz, alloc_time=:early, chunk=(16,16))
# FIX(review): the original had `d[:,:] == 2` — a no-op comparison, not an
# assignment — so the dataset was never written. Actually fill it so the
# benchmark runs over written chunks.
d[:,:] = fill(0x02, sz)
iter_time = @elapsed infos_by_iter = HDF5._get_chunk_info_all_by_iter(d)
indx_time = @elapsed infos_by_index = HDF5._get_chunk_info_all_by_index(d)
@assert infos_by_iter == infos_by_index
@show sz iter_time indx_time indx_time/iter_time
end
end
sz = (32, 32)
iter_time = 4.0029e-5
indx_time = 2.7406e-5
indx_time / iter_time = 0.6846536261210623
sz = (64, 64)
iter_time = 5.3692e-5
indx_time = 7.7566e-5
indx_time / iter_time = 1.4446472472621619
sz = (128, 128)
iter_time = 0.000194085
indx_time = 0.000456561
indx_time / iter_time = 2.3523765360537907
sz = (256, 256)
iter_time = 0.000717275
indx_time = 0.004541661
indx_time / iter_time = 6.331826705238576
sz = (512, 512)
iter_time = 0.003004693
indx_time = 0.048780859
indx_time / iter_time = 16.234889554440336
sz = (1024, 1024)
iter_time = 0.011674653
indx_time = 0.662931619
indx_time / iter_time = 56.78383922845501
sz = (2048, 2048)
iter_time = 0.064214971
indx_time = 13.353451558
indx_time / iter_time = 207.94919549212287
Yeah, I've noticed the iter funcs are much more efficient in other cases. It's a shame, since they're kind of clunky.
Is there a way to turn these into a Julia iterator?
Not that I can think of.
What if we launch a task to call the HDF5 iterate function and then wait to take from a Channel? When we get back to Julia in the callback, we push the iterable information into the channel and iterate.
We may need a second channel for the callback to wait on and possibly receive a signal to stop iteration if the iterable object gets garbage collected.
@musm please review and merge when ready.
I rebased this on top of #1054
This adds support for
H5Dchunk_iter
which allows for efficient access to chunk information. This is particularly useful when trying to obtain all the chunk information for a dataset. It scales significantly better than trying to query the chunk information by index. Also, this adds basic HDF5 1.14 support. The documentation is also improved along the way.