JuliaIO / HDF5.jl

Save and load data in the HDF5 file format from Julia
https://juliaio.github.io/HDF5.jl
MIT License
386 stars 140 forks source link

Create a channel to iterate attributes #1045

Closed mkitti closed 1 year ago

mkitti commented 1 year ago

This is a proof of concept to show how we can turn the HDF5 iterate functions into Channel iterators. Below we produce a Channel that is passed to map. The Channel is closed at the end of the do block. If no function or do block is provided, the Channel is returned, and the user is responsible for closing the channel.

julia> h5open("test.h5","w", libver_bounds=v"1.8", meta_block_size=4096) do h5f
           attrs(h5f)["first"] = "Mark"
           attrs(h5f)["last"] = "Kittisopikul"
       end;

julia> h5open("test.h5") do h5f
           HDF5.API.h5a_iterate(Channel, h5f, HDF5.API.H5_INDEX_NAME, HDF5.API.H5_ITER_INC) do ch
               map(ch) do (loc, name, info)
                   name = unsafe_string(name)
                   name => attrs(h5f)[name]
               end
           end
       end
2-element Vector{Pair{String, String}}:
 "first" => "Mark"
  "last" => "Kittisopikul"
simonbyrne commented 1 year ago

What's the benefit of using a Channel?

mkitti commented 1 year ago

> What's the benefit of using a Channel?

The primary benefit is that the Channel is a normal Julia iterator. I can use it in a for loop, map, or an array comprehension. I can also pass it into an arbitrary function or compose it with the utilities in Iterators.

While I am pretty sure that the Channel approach involves more overhead, I think it also makes this a lot easier to use in Julia.

simonbyrne commented 1 year ago

Ah, I see now, that's clever.

I'm not sure how safe this is: while you iterate, won't the library basically be blocked inside the call to the iterate function until that call completes or is cleaned up?

simonbyrne commented 1 year ago

Out of curiosity, how does the performance compare with h5a_open_by_idx?

mkitti commented 1 year ago

> Out of curiosity, how does the performance compare with h5a_open_by_idx?

julia> h5open("test.h5","w", libver_bounds=v"1.8", meta_block_size=4096) do h5f
            A = attrs(h5f)
            for i in 1:2^10
               A[string(uuid4())] = i
            end
       end;

julia> byidx = @time h5open("test.h5") do h5f
           map(1:2^10) do i
               ha = HDF5.Attribute(HDF5.API.h5a_open_by_idx(h5f, ".", HDF5.API.H5_INDEX_NAME, HDF5.API.H5_ITER_INC, i-1, HDF5.API.H5P_DEFAULT, HDF5.API.H5P_DEFAULT), h5f)
               HDF5.API.h5a_get_name(ha) => read(ha)
           end
       end;
  2.037233 seconds (56.90 k allocations: 3.597 MiB, 6.31% compilation time)

julia> bych = @time h5open("test.h5") do h5f
                  HDF5.API.h5a_iterate(Channel, h5f, HDF5.API.H5_INDEX_NAME, HDF5.API.H5_ITER_INC) do ch
                      map(ch) do (loc, name, info)
                          name = unsafe_string(name)
                          name => attrs(h5f)[name]
                      end
                  end
              end;
  0.147486 seconds (66.65 k allocations: 3.731 MiB, 79.10% compilation time)

julia> byidx == bych
true

julia> byidx
1024-element Vector{Pair{String, Int64}}:
 "00429876-14db-4ecf-8e45-76c8537d9316" => 608
 "006a13fb-4762-47e4-bf99-12f661011cc0" => 719
 "0079770f-b30c-4707-a28e-d04ba344b5b4" => 552
 "0116262a-b0b1-4eea-9f7b-b291c5960746" => 524
 "01c01635-c951-45d8-b032-e36d01fafaa6" => 891
 "01e99868-9082-434b-b3d9-b1c36a4f0026" => 680
 "0293de16-26eb-40aa-be46-7d80fc3a9290" => 344
 "02a84c32-0bc3-4715-8b4c-d7d6bec038bb" => 21
 "02cdf81d-5fa2-459c-8452-16c611b5133c" => 751
 "02ee0412-a02a-414a-ace6-7749471876d7" => 240
 "034059c5-aeb2-4029-b744-7e32e164696e" => 600
 "03bea372-90b1-43f3-a766-080a81e6f21a" => 1017
 "03c6cdb8-9a22-4b09-b64d-390d6bd02590" => 645
 "03d57036-4e47-45cb-9935-731e41f4a83b" => 698
 "03f40683-7e9e-42f7-a893-bae2f3934cce" => 803
 "049adf84-0e71-4b92-9bb5-56b80e2d3894" => 607
 "04e50be8-650d-4861-aa2b-1f6b35bd0bad" => 81
 "0509abc5-ee92-48a9-a63a-c8d5bf676ec2" => 513
 "05164573-37e6-4cc3-8732-b45cdf16820b" => 109
 "053ae32c-722b-40af-83bd-0b693ddb20b1" => 954
 "0555a61e-2db3-4b43-a04d-b9679988f78e" => 987
 "056d0f81-6c50-4a8e-ae3c-76f06b2fe8c8" => 810
 "056d4326-5cd7-4d40-9b6e-3499b0479977" => 628
 "056f8534-d54e-44ec-af20-d61375e36641" => 688
                                        ⋮
 "fc4224fd-569a-4943-8a8c-17961f65af40" => 633
 "fc50dc03-5e24-4216-a76c-00d5822a10c9" => 278
 "fc73bdcd-c1d2-43f4-ba56-e06908e9f3b4" => 478
 "fcacd4ae-4d05-4476-9011-63478500fed8" => 561
 "fcb475c1-d217-409d-9b43-2bd68da54998" => 456
 "fcbb87ef-a4c7-4c3a-9ce3-b5e3a14a8db6" => 811
 "fcbf5622-4672-412d-adf2-eebe607ff840" => 713
 "fd94c487-f428-44f0-a3aa-bfa4a2aeae4e" => 982
 "fd9b4077-5d05-4d00-b9db-621252e0187a" => 63
 "fdba4a9f-cb9d-467a-a625-89a2453c0c0a" => 690
 "fdc274aa-1cd0-4e7e-b080-d5453db66230" => 625
 "fdccc67a-484a-48f5-b263-0a8837953783" => 112
 "fdd83ff7-3b38-41fb-864d-01f38b726250" => 105
 "fde49af5-01e4-4c78-bbe0-0f5625d30cec" => 1018
 "fdf9e7e8-07b9-43ad-8a98-887171a4dca1" => 657
 "fe15d296-42c4-4e32-b62e-fa8e1c897183" => 132
 "fe9eba00-bbd4-41f9-b1a3-dfbe101ab16f" => 366
 "fea03186-9a5e-47b4-970f-5714d0605a4e" => 878
 "fea98de3-a1ee-47c8-a7de-6ab7680d4258" => 683
 "fec0f74a-a93f-4181-93d5-02f559605dcd" => 244
 "ff03ff72-495d-4874-aecc-2dc58c0fee02" => 283
 "ff5163ea-cb5c-4191-8e5d-a510c16f57f4" => 270
 "ff55db26-f61c-493a-89e8-d9661907c21a" => 909
 "ff9129be-18e6-4864-983f-88b484534929" => 704
mkitti commented 1 year ago
julia> h5open("test.h5","w", libver_bounds=v"1.8", meta_block_size=4096) do h5f
            A = attrs(h5f)
            for i in 1:2^12
               A[string(uuid4())] = i
            end
       end;

julia> byidx = @time h5open("test.h5") do h5f
           map(1:2^10) do i
               ha = HDF5.Attribute(HDF5.API.h5a_open_by_idx(h5f, ".", HDF5.API.H5_INDEX_NAME, HDF5.API.H5_ITER_INC, i-1, HDF5.API.H5P_DEFAULT, HDF5.API.H5P_DEFAULT), h5f)
               HDF5.API.h5a_get_name(ha) => read(ha)
           end
       end;
 10.007194 seconds (57.26 k allocations: 3.597 MiB, 1.26% compilation time)

julia> bych = @time h5open("test.h5") do h5f
                  HDF5.API.h5a_iterate(Channel, h5f, HDF5.API.H5_INDEX_NAME, HDF5.API.H5_ITER_INC) do ch
                      map(ch) do (loc, name, info)
                          name = unsafe_string(name)
                          name => attrs(h5f)[name]
                      end
                  end
              end;
  0.233691 seconds (131.16 k allocations: 5.636 MiB, 48.91% compilation time)

julia> h5open("test.h5","w", libver_bounds=v"1.8", meta_block_size=4096) do h5f
            A = attrs(h5f)
            for i in 1:2^14
               A[string(uuid4())] = i
            end
       end;

julia> byidx = @time h5open("test.h5") do h5f
           map(1:2^10) do i
               ha = HDF5.Attribute(HDF5.API.h5a_open_by_idx(h5f, ".", HDF5.API.H5_INDEX_NAME, HDF5.API.H5_ITER_INC, i-1, HDF5.API.H5P_DEFAULT, HDF5.API.H5P_DEFAULT), h5f)
               HDF5.API.h5a_get_name(ha) => read(ha)
           end
       end;
 48.086717 seconds (57.38 k allocations: 3.600 MiB, 0.26% compilation time)

julia> bych = @time h5open("test.h5") do h5f
                  HDF5.API.h5a_iterate(Channel, h5f, HDF5.API.H5_INDEX_NAME, HDF5.API.H5_ITER_INC) do ch
                      map(ch) do (loc, name, info)
                          name = unsafe_string(name)
                          name => attrs(h5f)[name]
                      end
                  end
              end;
  0.764359 seconds (443.73 k allocations: 13.711 MiB, 5.32% gc time, 16.66% compilation time)

julia> h5open("test.h5","w", libver_bounds=v"1.8", meta_block_size=4096) do h5f
            A = attrs(h5f)
            for i in 1:2^16
               A[string(uuid4())] = i
            end
       end;

julia> byidx = @time h5open("test.h5") do h5f
           map(1:2^10) do i
               ha = HDF5.Attribute(HDF5.API.h5a_open_by_idx(h5f, ".", HDF5.API.H5_INDEX_NAME, HDF5.API.H5_ITER_INC, i-1, HDF5.API.H5P_DEFAULT, HDF5.API.H5P_DEFAULT), h5f)
               HDF5.API.h5a_get_name(ha) => read(ha)
           end
       end;
205.644635 seconds (57.40 k allocations: 3.604 MiB, 0.06% compilation time)

julia> bych = @time h5open("test.h5") do h5f
                  HDF5.API.h5a_iterate(Channel, h5f, HDF5.API.H5_INDEX_NAME, HDF5.API.H5_ITER_INC) do ch
                      map(ch) do (loc, name, info)
                          name = unsafe_string(name)
                          name => attrs(h5f)[name]
                      end
                  end
              end;
  3.557664 seconds (1.49 M allocations: 44.380 MiB, 1.77% gc time, 3.12% compilation time)
mkitti commented 1 year ago

I'm going to work on this in a pirate package: https://github.com/mkitti/HDF5Channels.jl