xiph / rav1e

The fastest and safest AV1 encoder.
BSD 2-Clause "Simplified" License
3.73k stars 253 forks source link

arm64: sse: 16 bpc NEON implementation #3302

Closed barrbrain closed 11 months ago

codecov[bot] commented 11 months ago

Codecov Report

All modified and coverable lines are covered by tests :white_check_mark:

Comparison is base (1bbe178) 88.24% compared to head (4f1ff94) 88.24%.

Additional details and impacted files

```diff
@@           Coverage Diff           @@
##           master    #3302   +/-   ##
=======================================
  Coverage   88.24%   88.24%
=======================================
  Files          88       88
  Lines       28210    28210
=======================================
  Hits        24893    24893
  Misses       3317     3317
```

:umbrella: View full report in Codecov by Sentry.
:loudspeaker: Have feedback on the report? Share it here.

barrbrain commented 11 months ago

See https://github.com/xiph/rav1e/pull/3295#issuecomment-1837299284 for a perf trace from before this PR. Here is the same workload after this PR:

# Samples: 190K of event 'cycles'
# Event count (approx.): 106225872817
#
#       Overhead  Command / Shared Object / Symbol
# ..............  ...............................................................................................................................................................................................................
#
   100.00%        rav1e  
       92.23%        rav1e            
          11.33%        [.] rav1e_satd8x8_hbd_neon
            |          
            |--6.71%--rav1e::api::internal::ContextInner<T>::receive_packet
            |          rav1e::api::internal::ContextInner<T>::compute_block_importances (inlined)
            |          <core::slice::iter::Iter<T> as core::iter::traits::iterator::Iterator>::for_each (inlined)
            |          rav1e::api::internal::ContextInner<T>::compute_block_importances::{{closure}} (inlined)
            |          rav1e::api::internal::ContextInner<T>::update_block_importances (inlined)
            |          core::iter::traits::iterator::Iterator::for_each (inlined)
            |          <core::iter::adapters::flatten::FlatMap<I,U,F> as core::iter::traits::iterator::Iterator>::fold (inlined)
            |          <core::iter::adapters::flatten::FlattenCompat<I,U> as core::iter::traits::iterator::Iterator>::fold (inlined)
            |          core::iter::adapters::flatten::FlattenCompat<I,U>::iter_fold (inlined)
            |          <core::iter::adapters::fuse::Fuse<I> as core::iter::traits::iterator::Iterator>::fold (inlined)
            |          <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold
            |          <core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::fold (inlined)
            |          core::iter::traits::iterator::Iterator::fold (inlined)
            |          <core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::fold::enumerate::{{closure}} (inlined)
            |          core::iter::adapters::map::map_fold::{{closure}} (inlined)
            |          core::iter::adapters::flatten::FlattenCompat<I,U>::iter_fold::flatten::{{closure}} (inlined)
            |          <core::iter::adapters::flatten::FlattenCompat<I,U> as core::iter::traits::iterator::Iterator>::fold::flatten::{{closure}} (inlined)
            |          <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold
            |          <core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::fold (inlined)
            |          core::iter::traits::iterator::Iterator::fold (inlined)
            |          <core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::fold::enumerate::{{closure}} (inlined)
            |          core::iter::adapters::map::map_fold::{{closure}} (inlined)
            |          rav1e::api::internal::ContextInner<T>::update_block_importances::{{closure}}::{{closure}} (inlined)
            |          rav1e::asm::aarch64::dist::get_satd (inlined)
            |          satd8x8_hbd_neon (inlined)
            |          
            |--3.63%--rav1e::me::estimate_motion
            |          |          
            |           --3.25%--rav1e::me::sub_pixel_me (inlined)
            |                     rav1e::me::subpel_diamond_search (inlined)
            |                     rav1e::me::get_subpel_mv_rd (inlined)
            |                     rav1e::me::compute_mv_rd (inlined)
            |                     satd8x8_hbd_neon (inlined)
            |          
             --0.59%--core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &mut F>::call_once
                       core::ops::function::impls::<impl core::ops::function::FnMut<A> for &F>::call_mut (inlined)
                       rav1e::encoder::encode_tile_group::{{closure}} (inlined)
                       rav1e::encoder::encode_tile (inlined)

           7.83%        [.] put_8tap_neon
            |          
            |--6.49%--rav1e::me::estimate_motion
            |          rav1e::me::sub_pixel_me (inlined)
            |          rav1e::me::subpel_diamond_search (inlined)
            |          rav1e::me::get_subpel_mv_rd (inlined)
            |          put_8tap_neon
            |          
             --0.66%--put_8tap_neon
                       put_8tap_neon

           4.80%        [.] <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold
            |
            ---rav1e::api::internal::ContextInner<T>::receive_packet
               rav1e::api::internal::ContextInner<T>::compute_block_importances (inlined)
               <core::slice::iter::Iter<T> as core::iter::traits::iterator::Iterator>::for_each (inlined)
               rav1e::api::internal::ContextInner<T>::compute_block_importances::{{closure}} (inlined)
               rav1e::api::internal::ContextInner<T>::update_block_importances (inlined)
               core::iter::traits::iterator::Iterator::for_each (inlined)
               <core::iter::adapters::flatten::FlatMap<I,U,F> as core::iter::traits::iterator::Iterator>::fold (inlined)
               <core::iter::adapters::flatten::FlattenCompat<I,U> as core::iter::traits::iterator::Iterator>::fold (inlined)
               core::iter::adapters::flatten::FlattenCompat<I,U>::iter_fold (inlined)
               <core::iter::adapters::fuse::Fuse<I> as core::iter::traits::iterator::Iterator>::fold (inlined)
               <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold
               <core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::fold (inlined)
               core::iter::traits::iterator::Iterator::fold (inlined)
               <core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::fold::enumerate::{{closure}} (inlined)
               core::iter::adapters::map::map_fold::{{closure}} (inlined)
               core::iter::adapters::flatten::FlattenCompat<I,U>::iter_fold::flatten::{{closure}} (inlined)
               <core::iter::adapters::flatten::FlattenCompat<I,U> as core::iter::traits::iterator::Iterator>::fold::flatten::{{closure}} (inlined)
               <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold
               |          
                --4.78%--<core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::fold (inlined)
                          core::iter::traits::iterator::Iterator::fold (inlined)
                          |          
                           --4.73%--<core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::fold::enumerate::{{closure}} (inlined)
                                     |          
                                      --4.73%--core::iter::adapters::map::map_fold::{{closure}} (inlined)
                                                |          
                                                 --4.68%--rav1e::api::internal::ContextInner<T>::update_block_importances::{{closure}}::{{closure}} (inlined)
                                                           |          
                                                            --1.33%--<v_frame::plane::Plane<T> as rav1e::frame::plane::AsRegion<T>>::region (inlined)
                                                                      |          
                                                                       --0.83%--rav1e::tiling::plane_region::PlaneRegion<T>::new (inlined)
                                                                                 rav1e::tiling::plane_region::PlaneRegion<T>::from_slice (inlined)

           4.25%        [.] rav1e::asm::aarch64::transform::forward::daala_fdct32
            |
            ---rav1e::asm::aarch64::transform::forward::forward_transform_neon
               rav1e::asm::aarch64::transform::forward::daala_fdct32
               |          
                --3.47%--rav1e::asm::aarch64::transform::forward::daala_fdct_ii_32 (inlined)
                          |          
                           --2.35%--rav1e::asm::aarch64::transform::forward::daala_fdst_iv_16_asym (inlined)
                                     |          
                                      --0.56%--rav1e::asm::aarch64::transform::forward::RotateKernel::half_kernel (inlined)

           3.85%        [.] rav1e::rdo::compute_distortion
            |          
             --3.82%--rav1e::rdo::rdo_mode_decision
                       |          
                        --3.72%--rav1e::rdo::inter_frame_rdo_mode_decision (inlined)
                                  <core::iter::adapters::take::Take<I> as core::iter::traits::iterator::Iterator>::for_each (inlined)
                                  core::iter::traits::iterator::Iterator::try_fold (inlined)
                                  <core::iter::adapters::take::Take<I> as core::iter::traits::iterator::Iterator>::for_each::check::{{closure}} (inlined)
                                  rav1e::rdo::inter_frame_rdo_mode_decision::{{closure}} (inlined)
                                  rav1e::rdo::luma_chroma_mode_rdo
                                  rav1e::rdo::luma_chroma_mode_rdo::{{closure}}
                                  rav1e::rdo::compute_distortion
                                  |          
                                  |--2.17%--rav1e::rdo::cdef_dist_wxh (inlined)
                                  |          |          
                                  |          |--1.12%--rav1e::asm::aarch64::dist::cdef_dist::cdef_dist_kernel (inlined)
                                  |          |          |          
                                  |          |           --0.85%--rav1e::activity::apply_ssim_boost (inlined)
                                  |          |                     |          
                                  |          |                      --0.74%--rav1e::activity::ssim_boost_rsqrt (inlined)
                                  |          |          
                                  |           --0.66%--rav1e::rdo::compute_distortion::{{closure}} (inlined)
                                  |                     |          
                                  |                      --0.50%--rav1e::rdo::distortion_scale (inlined)
                                  |          
                                   --1.12%--rav1e::rdo::sse_wxh (inlined)
                                             |          
                                              --0.87%--rav1e::rdo::compute_distortion::{{closure}} (inlined)
                                                        |          
                                                         --0.61%--rav1e::rdo::distortion_scale (inlined)

           3.80%        [.] rav1e::asm::aarch64::transform::forward::daala_fdct64
            |
            ---rav1e::asm::aarch64::transform::forward::forward_transform_neon
               rav1e::asm::aarch64::transform::forward::daala_fdct64
               |          
               |--2.04%--rav1e::asm::aarch64::transform::forward::daala_fdst_iv_32_asym (inlined)
               |          
                --0.67%--rav1e::asm::aarch64::transform::forward::daala_fdct64::butterfly_pair (inlined)

           3.37%        [.] rav1e::asm::aarch64::transform::forward::forward_transform_neon
            |          
             --3.33%--rav1e::asm::aarch64::transform::forward::forward_transform_neon
                       |          
                       |--0.71%--<core::iter::adapters::zip::Zip<A,B> as core::iter::traits::iterator::Iterator>::next (inlined)
                       |          <core::iter::adapters::zip::Zip<A,B> as core::iter::adapters::zip::ZipImpl<A,B>>::next (inlined)
                       |          
                        --0.54%--rav1e::asm::aarch64::transform::forward::transpose_8x8_neon (inlined)

           3.23%        [.] rav1e::encoder::encode_block_post_cdef
            |          
             --2.59%--rav1e::encoder::encode_partition_topdown
                       rav1e::rdo::rdo_partition_decision
                       |          
                       |--1.44%--rav1e::rdo::rdo_partition_simple (inlined)
                       |          rav1e::rdo::rdo_mode_decision
                       |          |          
                       |           --1.39%--rav1e::rdo::inter_frame_rdo_mode_decision (inlined)
                       |                     <core::iter::adapters::take::Take<I> as core::iter::traits::iterator::Iterator>::for_each (inlined)
                       |                     core::iter::traits::iterator::Iterator::try_fold (inlined)
                       |                     <core::iter::adapters::take::Take<I> as core::iter::traits::iterator::Iterator>::for_each::check::{{closure}} (inlined)
                       |                     rav1e::rdo::inter_frame_rdo_mode_decision::{{closure}} (inlined)
                       |                     rav1e::rdo::luma_chroma_mode_rdo
                       |                     rav1e::rdo::luma_chroma_mode_rdo::{{closure}}
                       |                     rav1e::encoder::encode_block_post_cdef
                       |          
                        --1.15%--rav1e::rdo::rdo_partition_none (inlined)
                                  rav1e::rdo::rdo_mode_decision
                                  |          
                                   --1.12%--rav1e::rdo::inter_frame_rdo_mode_decision (inlined)
                                             <core::iter::adapters::take::Take<I> as core::iter::traits::iterator::Iterator>::for_each (inlined)
                                             core::iter::traits::iterator::Iterator::try_fold (inlined)
                                             <core::iter::adapters::take::Take<I> as core::iter::traits::iterator::Iterator>::for_each::check::{{closure}} (inlined)
                                             rav1e::rdo::inter_frame_rdo_mode_decision::{{closure}} (inlined)
                                             rav1e::rdo::luma_chroma_mode_rdo
                                             rav1e::rdo::luma_chroma_mode_rdo::{{closure}}
                                             rav1e::encoder::encode_block_post_cdef

           2.64%        [.] rav1e_sad32x32_hbd_neon
            |          
            |--1.56%--rav1e::api::internal::ContextInner<T>::send_frame
            |          rav1e::api::internal::ContextInner<T>::compute_frame_invariants (inlined)
            |          rav1e::api::internal::ContextInner<T>::compute_lookahead_motion_vectors (inlined)
            |          rav1e::api::lookahead::compute_motion_vectors
            |          rayon::iter::ParallelIterator::for_each (inlined)
            |          rayon::iter::for_each::for_each (inlined)
            |          <rayon::vec::IntoIter<T> as rayon::iter::ParallelIterator>::drive_unindexed (inlined)
            |          rayon::iter::plumbing::bridge (inlined)
            |          <rayon::vec::IntoIter<T> as rayon::iter::IndexedParallelIterator>::with_producer
            |          <rayon::vec::Drain<T> as rayon::iter::IndexedParallelIterator>::with_producer (inlined)
            |          <rayon::iter::plumbing::bridge::Callback<C> as rayon::iter::plumbing::ProducerCallback<I>>::callback (inlined)
            |          rayon::iter::plumbing::bridge_producer_consumer (inlined)
            |          rayon::iter::plumbing::bridge_producer_consumer::helper
            |          rayon::iter::plumbing::Producer::fold_with (inlined)
            |          <rayon::iter::for_each::ForEachConsumer<F> as rayon::iter::plumbing::Folder<T>>::consume_iter (inlined)
            |          core::iter::traits::iterator::Iterator::for_each (inlined)
            |          core::iter::traits::iterator::Iterator::fold (inlined)
            |          core::iter::traits::iterator::Iterator::for_each::call::{{closure}} (inlined)
            |          core::ops::function::impls::<impl core::ops::function::FnMut<A> for &F>::call_mut
            |          rav1e::api::lookahead::compute_motion_vectors::{{closure}} (inlined)
            |          rav1e::me::estimate_tile_motion
            |          rav1e::me::refine_subsampled_sb_motion (inlined)
            |          rav1e::me::refine_subsampled_motion_estimate (inlined)
            |          rav1e::me::full_search
            |          rav1e::me::compute_mv_rd (inlined)
            |          rav1e::asm::aarch64::dist::get_sad (inlined)
            |          sad32x32_hbd_neon (inlined)
            |          
             --1.08%--rav1e::me::estimate_motion
                       rav1e::me::full_pixel_me (inlined)
                       rav1e::me::full_pixel_me::{{closure}}
                       |          
                       |--0.55%--rav1e::me::fullpel_diamond_search (inlined)
                       |          rav1e::me::get_fullpel_mv_rd
                       |          rav1e::me::compute_mv_rd (inlined)
                       |          sad32x32_hbd_neon (inlined)
                       |          
                        --0.53%--rav1e::me::get_best_predictor (inlined)
                                  rav1e::me::get_fullpel_mv_rd
                                  rav1e::me::compute_mv_rd (inlined)
                                  sad32x32_hbd_neon (inlined)

           2.10%        [.] rav1e::asm::aarch64::transform::forward::daala_fdct_ii_16
            |
            ---rav1e::asm::aarch64::transform::forward::forward_transform_neon
               |          
               |--1.42%--rav1e::asm::aarch64::transform::forward::daala_fdct16
               |          rav1e::asm::aarch64::transform::forward::daala_fdct_ii_16
               |          |          
               |          |--0.57%--rav1e::asm::aarch64::transform::forward::daala_fdst_iv_8_asym (inlined)
               |          |          
               |           --0.56%--rav1e::asm::aarch64::transform::forward::daala_fdct_ii_8_asym (inlined)
               |          
                --0.68%--rav1e::asm::aarch64::transform::forward::daala_fdct64
                          rav1e::asm::aarch64::transform::forward::daala_fdct_ii_32_asym (inlined)
                          rav1e::asm::aarch64::transform::forward::daala_fdct_ii_16

           2.10%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_coeffs_lv_map
            |
            ---rav1e::encoder::encode_tx_block
               rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_coeffs_lv_map
               |          
               |--0.87%--rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::encode_coeffs (inlined)
               |          
                --0.63%--rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::encode_coeff_signs (inlined)

           2.08%        [.] core::ops::function::impls::<impl core::ops::function::FnMut<A> for &mut F>::call_mut
            |
            ---rav1e::api::internal::ContextInner<T>::receive_packet
               rav1e::api::internal::ContextInner<T>::compute_block_importances (inlined)
               <core::slice::iter::Iter<T> as core::iter::traits::iterator::Iterator>::for_each (inlined)
               rav1e::api::internal::ContextInner<T>::compute_block_importances::{{closure}} (inlined)
               rav1e::api::internal::ContextInner<T>::update_block_importances (inlined)
               core::iter::traits::iterator::Iterator::for_each (inlined)
               <core::iter::adapters::flatten::FlatMap<I,U,F> as core::iter::traits::iterator::Iterator>::fold (inlined)
               <core::iter::adapters::flatten::FlattenCompat<I,U> as core::iter::traits::iterator::Iterator>::fold (inlined)
               core::iter::adapters::flatten::FlattenCompat<I,U>::iter_fold (inlined)
               <core::iter::adapters::fuse::Fuse<I> as core::iter::traits::iterator::Iterator>::fold (inlined)
               <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold
               <core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::fold (inlined)
               core::iter::traits::iterator::Iterator::fold (inlined)
               <core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::fold::enumerate::{{closure}} (inlined)
               core::iter::adapters::map::map_fold::{{closure}} (inlined)
               core::iter::adapters::flatten::FlattenCompat<I,U>::iter_fold::flatten::{{closure}} (inlined)
               <core::iter::adapters::flatten::FlattenCompat<I,U> as core::iter::traits::iterator::Iterator>::fold::flatten::{{closure}} (inlined)
               <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold
               <core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::fold (inlined)
               core::iter::traits::iterator::Iterator::fold (inlined)
               <core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::fold::enumerate::{{closure}} (inlined)
               core::iter::adapters::map::map_fold::{{closure}} (inlined)
               core::ops::function::impls::<impl core::ops::function::FnMut<A> for &mut F>::call_mut
               |          
                --2.03%--core::iter::traits::iterator::Iterator::for_each::call::{{closure}} (inlined)
                          rav1e::api::internal::ContextInner<T>::update_block_importances::{{closure}} (inlined)
                          |          
                           --0.77%--rav1e::api::internal::ContextInner<T>::update_block_importances::{{closure}}::{{closure}} (inlined)

           2.06%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
            |          
             --2.01%--rav1e::encoder::encode_tx_block
                       rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_coeffs_lv_map
                       rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::encode_coeffs (inlined)
                       <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
                       |          
                       |--0.80%--rav1e::context::cdf_context::CDFContextLog::push (inlined)
                       |          rav1e::context::cdf_context::CDFContextLogPartition<_>::push (inlined)
                       |          
                        --0.55%--<rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol (inlined)

           1.98%        [.] rav1e::quantize::QuantizationContext::quantize
            |          
             --1.95%--rav1e::encoder::encode_tx_block
                       rav1e::quantize::QuantizationContext::quantize
                       |          
                        --0.79%--core::iter::traits::iterator::Iterator::max (inlined)
                                  core::iter::traits::iterator::Iterator::max_by (inlined)
                                  core::iter::traits::iterator::Iterator::reduce (inlined)
                                  |          
                                   --0.78%--<core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold (inlined)
                                             core::iter::traits::iterator::Iterator::fold (inlined)

           1.95%        [.] rav1e_cdef_dist_kernel_8x8_hbd_neon
            |          
             --1.92%--rav1e::rdo::rdo_mode_decision
                       |          
                        --1.88%--rav1e::rdo::inter_frame_rdo_mode_decision (inlined)
                                  <core::iter::adapters::take::Take<I> as core::iter::traits::iterator::Iterator>::for_each (inlined)
                                  core::iter::traits::iterator::Iterator::try_fold (inlined)
                                  <core::iter::adapters::take::Take<I> as core::iter::traits::iterator::Iterator>::for_each::check::{{closure}} (inlined)
                                  rav1e::rdo::inter_frame_rdo_mode_decision::{{closure}} (inlined)
                                  rav1e::rdo::luma_chroma_mode_rdo
                                  rav1e::rdo::luma_chroma_mode_rdo::{{closure}}
                                  rav1e::rdo::compute_distortion
                                  rav1e::rdo::cdef_dist_wxh (inlined)
                                  rav1e::asm::aarch64::dist::cdef_dist::cdef_dist_kernel (inlined)
                                  cdef_dist_kernel_8x8_hbd_neon (inlined)

           1.59%        [.] rav1e::encoder::encode_tx_block
            |          
             --1.55%--rav1e::encoder::encode_tx_block
                       |          
                        --1.06%--rav1e::encoder::diff (inlined)
                                  |          
                                   --0.61%--<core::iter::adapters::zip::Zip<A,B> as core::iter::traits::iterator::Iterator>::next (inlined)
                                             <core::iter::adapters::zip::Zip<A,B> as core::iter::adapters::zip::ZipImpl<A,B>>::next (inlined)

           1.23%        [.] rav1e::quantize::rust::dequantize
            |          
             --1.22%--rav1e::encoder::encode_tx_block
                       rav1e::quantize::rust::dequantize
                       |          
                        --0.95%--<core::iter::adapters::enumerate::Enumerate<I> as core::iter::traits::iterator::Iterator>::next (inlined)
                                  <core::iter::adapters::zip::Zip<A,B> as core::iter::traits::iterator::Iterator>::next (inlined)
                                  <core::iter::adapters::zip::Zip<A,B> as core::iter::adapters::zip::ZipImpl<A,B>>::next (inlined)

           1.13%        [.] rav1e_sad16x16_hbd_neon
            |
            ---rav1e::me::estimate_motion
               rav1e::me::full_pixel_me (inlined)
               |          
                --1.07%--rav1e::me::full_pixel_me::{{closure}}
                          |          
                           --0.57%--rav1e::me::get_best_predictor (inlined)
                                     rav1e::me::get_fullpel_mv_rd
                                     rav1e::me::compute_mv_rd (inlined)
                                     sad16x16_hbd_neon (inlined)

           1.12%        [.] rav1e::cdef::cdef_filter_superblock
            |          
             --1.05%--rav1e::encoder::encode_frame
                       rav1e::encoder::encode_tile_group (inlined)
                       rav1e::encoder::FrameState<T>::apply_tile_state_mut (inlined)
                       rav1e::encoder::encode_tile_group::{{closure}} (inlined)
                       rav1e::cdef::cdef_filter_tile
                       rav1e::cdef::cdef_filter_superblock

           1.05%        [.] prep_neon
            |          
             --0.76%--rav1e::rdo::rdo_partition_decision

           1.02%        [.] rav1e::asm::aarch64::transform::forward::daala_fdct_ii_8
            |
            ---rav1e::asm::aarch64::transform::forward::forward_transform_neon
               |          
                --0.86%--rav1e::asm::aarch64::transform::forward::daala_fdct32
                          rav1e::asm::aarch64::transform::forward::daala_fdct_ii_32 (inlined)
                          rav1e::asm::aarch64::transform::forward::daala_fdct_ii_16_asym (inlined)
                          rav1e::asm::aarch64::transform::forward::daala_fdct_ii_8

           1.02%        [.] rav1e::lrf::rust::sgrproj_box_ab_r1
            |          
            |--0.51%--rav1e::lrf::sgrproj_stripe_filter
            |          rav1e::lrf::rust::sgrproj_box_ab_r1
            |          |          
            |           --0.51%--rav1e::lrf::rust::sgrproj_box_ab_internal (inlined)
            |          
             --0.50%--rav1e::lrf::sgrproj_solve
                       rav1e::lrf::rust::sgrproj_box_ab_r1
                       |          
                        --0.50%--rav1e::lrf::rust::sgrproj_box_ab_internal (inlined)

           0.99%        [.] rav1e::rdo::luma_chroma_mode_rdo::{{closure}}
            |          
             --0.77%--rav1e::encoder::encode_partition_topdown
                       rav1e::rdo::rdo_partition_decision
                       |          
                        --0.56%--rav1e::rdo::rdo_partition_simple (inlined)
                                  rav1e::rdo::rdo_mode_decision
                                  |          
                                   --0.51%--rav1e::rdo::inter_frame_rdo_mode_decision (inlined)
                                             <core::iter::adapters::take::Take<I> as core::iter::traits::iterator::Iterator>::for_each (inlined)
                                             core::iter::traits::iterator::Iterator::try_fold (inlined)
                                             <core::iter::adapters::take::Take<I> as core::iter::traits::iterator::Iterator>::for_each::check::{{closure}} (inlined)
                                             rav1e::rdo::inter_frame_rdo_mode_decision::{{closure}} (inlined)
                                             rav1e::rdo::luma_chroma_mode_rdo
                                             rav1e::rdo::luma_chroma_mode_rdo::{{closure}}

           0.98%        [.] rav1e::me::get_fullpel_mv_rd
            |
            ---rav1e::me::estimate_motion
               |          
                --0.96%--rav1e::me::full_pixel_me (inlined)
                          |          
                           --0.93%--rav1e::me::full_pixel_me::{{closure}}
                                     |          
                                      --0.50%--rav1e::me::get_best_predictor (inlined)
                                                rav1e::me::get_fullpel_mv_rd

           0.87%        [.] rav1e::asm::aarch64::transform::forward::daala_fdst_iv_16
            |
            ---rav1e::asm::aarch64::transform::forward::forward_transform_neon
               |          
                --0.84%--rav1e::asm::aarch64::transform::forward::daala_fdct64
                          rav1e::asm::aarch64::transform::forward::daala_fdct_ii_32_asym (inlined)
                          rav1e::asm::aarch64::transform::forward::daala_fdst_iv_16

           0.86%        [.] rav1e_sad64x64_hbd_neon
            |
            ---rav1e::me::estimate_motion
               rav1e::me::full_pixel_me (inlined)
               rav1e::me::full_pixel_me::{{closure}}

           0.81%        [.] rav1e::context::transform_unit::<impl rav1e::context::cdf_context::ContextWriter>::get_nz_map_contexts
            |          
             --0.80%--rav1e::encoder::encode_tx_block
                       rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_coeffs_lv_map
                       rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::encode_coeffs (inlined)
                       rav1e::context::transform_unit::<impl rav1e::context::cdf_context::ContextWriter>::get_nz_map_contexts
                       |          
                        --0.76%--rav1e::context::transform_unit::<impl rav1e::context::cdf_context::ContextWriter>::get_nz_map_ctx (inlined)
                                  |          
                                   --0.59%--rav1e::context::transform_unit::<impl rav1e::context::cdf_context::ContextWriter>::get_nz_map_ctx_from_stats (inlined)

           0.77%        [.] rav1e::deblock::sse_size14
           0.76%        [.] rav1e::asm::aarch64::transform::forward::daala_fdst_iv_8
            |
            ---rav1e::asm::aarch64::transform::forward::forward_transform_neon
               |          
                --0.76%--rav1e::asm::aarch64::transform::forward::daala_fdct32
                          rav1e::asm::aarch64::transform::forward::daala_fdct_ii_32 (inlined)
                          rav1e::asm::aarch64::transform::forward::daala_fdct_ii_16_asym (inlined)
                          rav1e::asm::aarch64::transform::forward::daala_fdst_iv_8

           0.70%        [.] rav1e_avg_16bpc_neon
            |          
             --0.51%--rav1e::encoder::encode_partition_topdown
                       |          
                        --0.50%--rav1e::rdo::rdo_partition_decision

           0.67%        [.] rav1e::context::transform_unit::<impl rav1e::context::cdf_context::ContextWriter>::get_nz_mag
            |          
             --0.66%--rav1e::encoder::encode_tx_block
                       rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_coeffs_lv_map
                       rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::encode_coeffs (inlined)
                       rav1e::context::transform_unit::<impl rav1e::context::cdf_context::ContextWriter>::get_nz_map_contexts
                       rav1e::context::transform_unit::<impl rav1e::context::cdf_context::ContextWriter>::get_nz_map_ctx (inlined)
                       rav1e::context::transform_unit::<impl rav1e::context::cdf_context::ContextWriter>::get_nz_mag

           0.66%        [.] rav1e::me::full_pixel_me::{{closure}}
            |
            ---rav1e::me::estimate_motion
               rav1e::me::full_pixel_me (inlined)
               rav1e::me::full_pixel_me::{{closure}}

           0.64%        [.] put_neon
           0.56%        [.] rav1e::lrf::rust::sgrproj_box_f_r1
           0.53%        [.] rav1e::asm::aarch64::transform::forward::daala_fdct16
            |
            ---rav1e::asm::aarch64::transform::forward::forward_transform_neon
               rav1e::asm::aarch64::transform::forward::daala_fdct16

           0.52%        [.] rav1e::predict::PredictionMode::predict_inter
           0.51%        [.] rav1e::predict::PredictionMode::predict_inter_single
           0.49%        [.] rav1e::me::get_subset_predictors
           0.46%        [.] prep_8tap_neon
           0.45%        [.] <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold
           0.42%        [.] rav1e::deblock::sse_size6
           0.42%        [.] rav1e::deblock::filter_v_edge
           0.42%        [.] rav1e::me::estimate_motion
           0.42%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.42%        [.] rav1e::partition::BlockSize::from_width_and_height_opt
           0.38%        [.] rav1e::deblock::filter_h_edge
           0.36%        [.] rav1e::encoder::encode_block_pre_cdef
           0.35%        [.] rav1e::lrf::rust::sgrproj_box_ab_r2
           0.32%        [.] rav1e_weighted_sse_32x32_hbd_neon
           0.31%        [.] rav1e_weighted_sse_16x16_hbd_neon
           0.29%        [.] rav1e::deblock::sse_v_edge
           0.28%        [.] rav1e::encoder::write_tx_tree
           0.27%        [.] rav1e::rdo::rdo_mode_decision
           0.26%        [.] rav1e::context::block_unit::BlockContext::get_txb_ctx
           0.26%        [.] rav1e::encoder::motion_compensate
           0.25%        [.] rav1e::encoder::encode_block_post_cdef
           0.24%        [.] rav1e::deblock::filter_wide14_12
           0.23%        [.] rav1e::deblock::sse_h_edge
           0.22%        [.] rav1e::partition::BlockSize::largest_chroma_tx_size
           0.21%        [.] rav1e::api::lookahead::estimate_importance_block_difference
           0.19%        [.] rav1e::me::estimate_tile_motion
           0.18%        [.] rav1e::deblock::deblock_size14_inner
           0.17%        [.] rav1e::lrf::rust::sgrproj_box_f_r2
           0.17%        [.] inv_dct_4s_x16_neon
           0.17%        [.] rav1e::context::partition_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_segmentation
           0.16%        [.] rav1e::me::full_search
           0.16%        [.] rav1e::lrf::sgrproj_solve
           0.16%        [.] rav1e::partition::get_intra_edges
           0.16%        [.] rav1e::context::frame_header::<impl rav1e::context::cdf_context::ContextWriter>::write_ref_frames
           0.16%        [.] rav1e::context::block_unit::FrameBlocks::new
           0.15%        [.] rav1e::rdo::rdo_tx_size_type
           0.15%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::fill_neighbours_ref_counts
           0.14%        [.] inv_dct32_odd_4s_x16_neon
           0.13%        [.] rav1e::partition::BlockSize::from_width_and_height_opt
           0.13%        [.] rav1e::quantize::QuantizationContext::update
           0.12%        [.] rav1e::context::transform_unit::<impl rav1e::context::cdf_context::ContextWriter>::get_br_ctx
           0.12%        [.] rav1e::lrf::sgrproj_stripe_filter
           0.12%        [.] rav1e::deblock::deblock_size6_inner
           0.12%        [.] rav1e_inv_dct32_odd_8h_x16_neon
           0.12%        [.] rav1e_inv_dct_8h_x16_neon
           0.12%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::find_mvrefs
           0.11%        [.] rav1e::activity::variance_8x8
           0.11%        [.] rav1e::context::partition_unit::<impl rav1e::context::cdf_context::ContextWriter>::get_segment_pred
           0.10%        [.] inv_txfm_horz_dct_32x4_neon
           0.10%        [.] rav1e::transform::forward_shared::Txfm2DFlipCfg::fwd
           0.10%        [.] rav1e::deblock::deblock_size
           0.10%        [.] rav1e::rdo::rdo_loop_plane_error
           0.08%        [.] rav1e::context::block_unit::BlockContext::intra_inter_context
           0.08%        [.] rav1e::rdo::spatiotemporal_scale
           0.08%        [.] inv_txfm_add_vert_dct_8x32_neon
           0.08%        [.] rav1e::tiling::plane_region::PlaneRegionMut<T>::scratch_copy
           0.08%        [.] <v_frame::frame::Frame<T> as rav1e::frame::FrameAlloc>::new
           0.08%        [.] rav1e::context::partition_unit::<impl rav1e::context::block_unit::BlockContext>::reset_skip_context
           0.08%        [.] v_frame::plane::Plane<T>::downsampled
           0.08%        [.] memset@plt
           0.07%        [.] rav1e::rdo::rdo_loop_decision
           0.07%        [.] rav1e_sad64x32_hbd_neon
           0.07%        [.] rav1e::context::block_unit::BlockContext::set_coeff_context
           0.07%        [.] rav1e::lrf::setup_integral_image
           0.06%        [.] rav1e::partition::BlockSize::from_width_and_height_opt
           0.06%        [.] memcpy@plt
           0.06%        [.] <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold
           0.06%        [.] rav1e::me::MotionEstimationSubsets::all_mvs
           0.06%        [.] rav1e::context::partition_unit::<impl rav1e::context::block_unit::BlockContext>::skip_context
           0.06%        [.] rav1e::asm::aarch64::transform::forward::daala_fdct8
           0.06%        [.] rav1e::api::lookahead::estimate_inter_costs
           0.05%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::encode_eob
           0.05%        [.] inv_dct64_step1_neon
           0.05%        [.] rav1e::deblock::deblock_filter_optimize
           0.05%        [.] rav1e::rdo::clip_visible_bsize
           0.05%        [.] rav1e_ipred_z2_fill1_16bpc_neon
           0.05%        [.] core::ops::function::impls::<impl core::ops::function::FnMut<A> for &F>::call_mut
           0.05%        [.] core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &mut F>::call_once
           0.05%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::scan_row_mbmi
           0.05%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::add_ref_mv_candidate
           0.05%        [.] rav1e::asm::aarch64::mc::mc_avg
           0.05%        [.] inv_txfm_add_vert_dct_8x64_neon
           0.05%        [.] rav1e::predict::PredictionMode::predict_intra
           0.04%        [.] rav1e::rdo::luma_chroma_mode_rdo
           0.04%        [.] rav1e::api::internal::ContextInner<T>::receive_packet
           0.04%        [.] rav1e::api::lookahead::estimate_intra_costs
           0.04%        [.] inv_txfm_horz_dct_64x4_neon
           0.04%        [.] core::cmp::PartialOrd::lt
           0.04%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.04%        [.] rav1e_weighted_sse_4x4_hbd_neon
           0.04%        [.] rav1e::encoder::save_block_motion
           0.04%        [.] <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold
           0.04%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::scan_col_mbmi
           0.04%        [.] rav1e::ec::rust::update_cdf
           0.04%        [.] __aarch64_ldadd4_rel
           0.04%        [.] inv_dct64_step2_neon
           0.03%        [.] inv_dct64_step2_neon
           0.03%        [.] rav1e::context::transform_unit::get_tx_set
           0.03%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.03%        [.] rav1e::context::partition_unit::<impl rav1e::context::block_unit::BlockContext>::partition_plane_context
           0.03%        [.] rav1e::context::partition_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_partition
           0.03%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_coeffs_lv_map
           0.03%        [.] inv_dct64_step1_neon
           0.03%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::get_comp_ref_type_ctx
           0.03%        [.] rav1e::encoder::encode_block_with_modes
           0.03%        [.] rav1e::asm::aarch64::transform::forward::daala_fdct4
           0.03%        [.] rav1e::encoder::encode_block_pre_cdef
           0.03%        [.] rav1e::partition::BlockSize::from_width_and_height
           0.03%        [.] v_frame::plane::Plane<T>::pad
           0.03%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.03%        [.] inv_txfm_dct_clear_4s_x64_neon
           0.03%        [.] rav1e::context::partition_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_segmentation
           0.03%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.03%        [.] rav1e::ec::rust::update_cdf
           0.03%        [.] rav1e::encoder::encode_partition_topdown
           0.03%        [.] rav1e::encoder::CodedFrameData<T>::compute_spatiotemporal_scores
           0.03%        [.] rav1e::cdef::cdef_analyze_superblock
           0.03%        [.] rav1e::asm::aarch64::transform::inverse::inverse_transform_add
           0.02%        [.] inv_txfm_add_vert_8x16_neon
           0.02%        [.] rav1e::deblock::sse_size8
           0.02%        [.] rav1e::encoder::encode_partition_bottomup
           0.02%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.02%        [.] rav1e::encoder::write_tx_blocks
           0.02%        [.] inv_txfm_horz_16x4_neon
           0.02%        [.] <arrayvec::arrayvec::ArrayVec<T,_> as core::iter::traits::collect::FromIterator<T>>::from_iter
           0.02%        [.] rav1e_weighted_sse_8x8_hbd_neon
           0.02%        [.] rav1e_sad8x8_hbd_neon
           0.02%        [.] rav1e::rdo::rdo_cfl_alpha::{{closure}}::{{closure}}
           0.02%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::get_comp_mode_ctx
           0.02%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_inter_mode
           0.02%        [.] __aarch64_cas4_acq
           0.02%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_compound_mode
           0.02%        [.] rav1e::partition::BlockSize::largest_chroma_tx_size
           0.02%        [.] rav1e_inv_txfm_dct_8h_x64_neon
           0.02%        [.] rav1e_ipred_paeth_16bpc_neon
           0.02%        [.] rav1e::rdo::rdo_partition_decision
           0.02%        [.] <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold
           0.02%        [.] core::cmp::PartialOrd::ge
           0.02%        [.] rav1e::encoder::encode_tx_block
           0.02%        [.] rav1e::partition::has_tr
           0.02%        [.] rav1e_cdef_find_dir_16bpc_neon
           0.02%        [.] rav1e::context::frame_header::<impl rav1e::context::cdf_context::ContextWriter>::write_ref_frames
           0.02%        [.] rav1e::asm::aarch64::dist::cdef_dist::cdef_dist_kernel
           0.02%        [.] rav1e_inv_txfm_add_dct_dct_32x32_16bpc_neon
           0.02%        [.] rav1e_ipred_z3_fill1_16bpc_neon
           0.02%        [.] rav1e::activity::ActivityMask::fill_scales
           0.02%        [.] rav1e_ipred_cfl_128_16bpc_neon
           0.02%        [.] rav1e::deblock::sse_size4
           0.02%        [.] cdef_filter8_pri_16bpc_neon
           0.01%        [.] rav1e_ipred_smooth_16bpc_neon
           0.01%        [.] rav1e::asm::aarch64::cdef::cdef_filter_block
           0.01%        [.] rav1e_ipred_z1_fill1_16bpc_neon
           0.01%        [.] core::slice::sort::merge_sort
           0.01%        [.] rav1e::context::<impl rav1e::context::cdf_context::ContextWriter>::encode_mv_component
           0.01%        [.] inv_txfm_add_8x8_neon
           0.01%        [.] cdef_filter4_sec_16bpc_neon
           0.01%        [.] core::slice::sort::insertion_sort_shift_left
           0.01%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_mv
           0.01%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.01%        [.] inv_dct_4s_x8_neon
           0.01%        [.] <T as alloc::vec::spec_from_elem::SpecFromElem>::from_elem
           0.01%        [.] rav1e::lrf::rust::sgrproj_box_f_r0
           0.01%        [.] rav1e::context::transform_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_tx_type
           0.01%        [.] rav1e_ipred_dc_128_16bpc_neon
           0.01%        [.] rav1e::deblock::deblock_adjusted_level
           0.01%        [.] rav1e_cdef_padding4_16bpc_neon
           0.01%        [.] rav1e::ec::rust::update_cdf
           0.01%        [.] rav1e_inv_dct_8h_x8_neon
           0.01%        [.] rav1e::encoder::check_lf_queue
           0.01%        [.] <rav1e::ec::WriterBase<rav1e::ec::WriterEncoder> as rav1e::ec::StorageBackend>::store
           0.01%        [.] cdef_filter8_sec_16bpc_neon
           0.01%        [.] rav1e::deblock::deblock_size8_inner
           0.01%        [.] crossbeam_deque::deque::Stealer<T>::steal
           0.01%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.01%        [.] core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &mut F>::call_once
           0.01%        [.] rav1e::context::block_unit::BlockContext::checkpoint
           0.01%        [.] rav1e_put_8tap_regular_16bpc_neon
           0.01%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::encode_eob
           0.01%        [.] rav1e_ipred_smooth_h_16bpc_neon
           0.01%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.01%        [.] rav1e::activity::ActivityMask::from_plane
           0.01%        [.] rav1e::deblock::deblock_size4_inner
           0.01%        [.] arrayvec::arrayvec::ArrayVec<T,_>::push
           0.01%        [.] rav1e::asm::aarch64::predict::ipred_z1
           0.01%        [.] rav1e::context::partition_unit::<impl rav1e::context::block_unit::BlockContext>::update_partition_context
           0.01%        [.] idct_dc_w32_neon
           0.01%        [.] rav1e::asm::aarch64::transform::forward::daala_fdst16
           0.01%        [.] inv_adst_4s_x16_neon
           0.01%        [.] alloc::raw_vec::RawVec<T,A>::reserve_for_push
           0.01%        [.] rav1e::ec::rust::update_cdf
           0.01%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.01%        [.] core::slice::sort::insertion_sort_shift_left
           0.01%        [.] <arrayvec::arrayvec::ArrayVec<T,_> as core::clone::Clone>::clone
           0.01%        [.] rav1e::api::internal::ContextInner<T>::send_frame
           0.00%        [.] rav1e::dist::rust::get_weighted_sse
           0.00%        [.] rav1e::ec::rust::update_cdf
           0.00%        [.] rav1e::partition::supersample_chroma_bsize
           0.00%        [.] rav1e::context::frame_header::<impl rav1e::context::cdf_context::ContextWriter>::write_lrf
           0.00%        [.] core::cmp::PartialOrd::ge
           0.00%        [.] rayon_core::registry::WorkerThread::wait_until_cold
           0.00%        [.] rav1e_ipred_z1_filter_edge_16bpc_neon
           0.00%        [.] inv_txfm_add_16x16_neon
           0.00%        [.] rav1e::context::partition_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_partition
           0.00%        [.] rav1e_inv_txfm_add_dct_dct_64x64_16bpc_neon
           0.00%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.00%        [.] crossbeam_epoch::default::with_handle
           0.00%        [.] rav1e::partition::BlockSize::subsize
           0.00%        [.] alloc::collections::btree::node::Handle<alloc::collections::btree::node::NodeRef<alloc::collections::btree::node::marker::Mut,K,V,alloc::collections::btree::node::marker::Leaf>,alloc::collections::btree::node::marker::Edge>::insert_recursing
           0.00%        [.] rav1e_cdef_padding8_16bpc_neon
           0.00%        [.] rav1e::asm::aarch64::predict::ipred_z2
           0.00%        [.] core::slice::sort::insertion_sort_shift_left
           0.00%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.00%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_inter_mode
           0.00%        [.] realloc@plt
           0.00%        [.] crossbeam_deque::deque::Worker<T>::pop
           0.00%        [.] inv_txfm_add_4x4_neon
           0.00%        [.] rav1e_ipred_cfl_16bpc_neon
           0.00%        [.] rav1e_ipred_z2_fill2_16bpc_neon
           0.00%        [.] rav1e_inv_adst_8h_x16_neon
           0.00%        [.] rav1e_ipred_h_16bpc_neon
           0.00%        [.] rav1e_ipred_dc_16bpc_neon
           0.00%        [.] core::cmp::PartialOrd::gt
           0.00%        [.] rav1e::predict::rust::dr_intra_derivative
           0.00%        [.] rav1e::predict::luma_ac
           0.00%        [.] rav1e::tiling::tile_state::TileStateMut<T>::new
           0.00%        [.] idct_dc_w64_neon
           0.00%        [.] cdef_filter4_pri_16bpc_neon
           0.00%        [.] core::cmp::PartialOrd::le
           0.00%        [.] inv_txfm_dct_clear_scale_4s_x64_neon
           0.00%        [.] idct_dc_w8_neon
           0.00%        [.] rav1e_ipred_v_16bpc_neon
           0.00%        [.] rav1e_satd32x32_hbd_neon
           0.00%        [.] rav1e_ipred_smooth_v_16bpc_neon
           0.00%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.00%        [.] rav1e::asm::aarch64::transform::forward::daala_fdst8
           0.00%        [.] rav1e::util::logexp::blog32_q11
           0.00%        [.] alloc::collections::btree::remove::<impl alloc::collections::btree::node::Handle<alloc::collections::btree::node::NodeRef<alloc::collections::btree::node::marker::Mut,K,V,alloc::collections::btree::node::marker::LeafOrInternal>,alloc::collections::btree::node::marker::KV>>::remove_kv_tracking
           0.00%        [.] idct_dc_w16_neon
           0.00%        [.] <core::iter::adapters::chain::Chain<A,B> as core::iter::traits::iterator::Iterator>::try_fold
           0.00%        [.] __aarch64_ldadd8_rel
           0.00%        [.] rayon_core::sleep::Sleep::sleep
           0.00%        [.] rav1e::asm::aarch64::transform::forward::daala_fdst_vii_4
           0.00%        [.] alloc::raw_vec::finish_grow
           0.00%        [.] rav1e_inv_txfm_add_dct_dct_64x32_16bpc_neon
           0.00%        [.] rav1e::recon_intra::has_bottom_left
           0.00%        [.] alloc::collections::btree::map::entry::VacantEntry<K,V,A>::insert
           0.00%        [.] core::slice::sort::insertion_sort_shift_left
           0.00%        [.] pow@plt
           0.00%        [.] std::sys::unix::locks::futex_mutex::Mutex::lock_contended
           0.00%        [.] core::slice::sort::merge_sort
           0.00%        [.] rav1e::ec::rust::update_cdf
           0.00%        [.] crossbeam_deque::deque::Injector<T>::steal
           0.00%        [.] rayon_core::registry::global_registry
           0.00%        [.] <alloc::boxed::Box<[I]> as core::iter::traits::collect::FromIterator<I>>::from_iter
           0.00%        [.] rav1e::encoder::encode_frame
           0.00%        [.] __aarch64_cas8_acq_rel
           0.00%        [.] rav1e::do_encode
           0.00%        [.] <alloc::collections::btree::map::Iter<K,V> as core::iter::traits::iterator::Iterator>::next
           0.00%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.00%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_intra_mode_kf
           0.00%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.00%        [.] rav1e::asm::aarch64::predict::pred_cfl_ac
           0.00%        [.] rav1e::partition::BlockSize::largest_chroma_tx_size
           0.00%        [.] rav1e::recon_intra::has_top_right
           0.00%        [.] rav1e::context::<impl rav1e::context::cdf_context::ContextWriter>::encode_mv_component
           0.00%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::write_golomb
           0.00%        [.] alloc::collections::btree::map::entry::OccupiedEntry<K,V,A>::remove_kv
           0.00%        [.] <alloc::vec::Vec<T> as alloc::vec::spec_from_iter::SpecFromIter<T,I>>::from_iter
           0.00%        [.] memmove@plt
           0.00%        [.] inv_txfm_horz_scale_16x4_neon
           0.00%        [.] rav1e::cdef::cdef_filter_tile
           0.00%        [.] rav1e::util::logexp::bexp64
           0.00%        [.] core::ptr::drop_in_place<rav1e::encoder::ReferenceFramesSet<u16>>
           0.00%        [.] rav1e::stats::build_frame_summary
           0.00%        [.] rav1e::lrf::RestorationState::lrf_filter_frame
           0.00%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_bits
           0.00%        [.] <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once
           0.00%        [.] __aarch64_ldadd8_relax
           0.00%        [.] malloc@plt
           0.00%        [.] crossbeam_epoch::internal::Global::try_advance
           0.00%        [.] __aarch64_ldadd8_acq_rel
           0.00%        [.] rav1e::context::transform_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_tx_type
           0.00%        [.] crossbeam_deque::deque::Injector<T>::push
           0.00%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_mv
           0.00%        [.] rav1e::rate::QuantizerParameters::new_from_log_q
           0.00%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.00%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.00%        [.] rav1e_satd64x32_hbd_neon
           0.00%        [.] rav1e::ec::rust::update_cdf
           0.00%        [.] rav1e_satd64x64_hbd_neon
           0.00%        [.] inv_txfm_horz_scale_dct_32x4_neon
           0.00%        [.] rav1e_inv_dct_4h_x4_neon
           0.00%        [.] rav1e_sad16x4_hbd_neon
           0.00%        [.] y4m::Decoder<R>::read_frame
           0.00%        [.] core::option::Option<&T>::cloned
           0.00%        [.] rav1e_ipred_z3_fill2_16bpc_neon
           0.00%        [.] rav1e::context::block_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_compound_mode
           0.00%        [.] rav1e::util::kmeans::scan
           0.00%        [.] rav1e_inv_txfm_add_dct_dct_8x8_16bpc_neon
           0.00%        [.] core::fmt::write
           0.00%        [.] core::fmt::num::imp::<impl core::fmt::Display for usize>::fmt
           0.00%        [.] <rav1e::stats::FrameSummary as core::fmt::Display>::fmt
           0.00%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.00%        [.] <rayon::iter::plumbing::bridge::Callback<C> as rayon::iter::plumbing::ProducerCallback<I>>::callback
           0.00%        [.] rav1e::encoder::Sequence::get_skip_mode_allowed
           0.00%        [.] rav1e::encoder::FrameState<T>::new_with_frame
           0.00%        [.] <bitstream_io::write::BitWriter<W,E> as bitstream_io::write::BitWrite>::write
           0.00%        [.] rav1e::encoder::FrameInvariants<T>::new_inter_frame
           0.00%        [.] rav1e::rate::RCState::calc_flat_quantizer
           0.00%        [.] rav1e::api::internal::ContextInner<T>::compute_keyframe_placement
           0.00%        [.] rav1e::encoder::FrameState<T>::new_with_frame_and_me_stats_and_rec
           0.00%        [.] <rayon_core::job::StackJob<L,F,R> as rayon_core::job::Job>::execute
           0.00%        [.] <rayon_core::latch::LatchRef<L> as rayon_core::latch::Latch>::set
           0.00%        [.] crossbeam_epoch::sync::queue::Queue<T>::try_pop_if
           0.00%        [.] crossbeam_epoch::internal::Global::collect
           0.00%        [.] core::ptr::drop_in_place<rav1e::encoder::CodedFrameData<u16>>
           0.00%        [.] <rayon_core::job::StackJob<L,F,R> as rayon_core::job::Job>::execute
           0.00%        [.] v_frame::plane::Plane<T>::copy_from_raw_u8
           0.00%        [.] <bitstream_io::write::BitWriter<W,bitstream_io::BigEndian> as rav1e::header::UncompressedHeader>::write_delta_q
           0.00%        [.] core::slice::sort::merge_sort
           0.00%        [.] rav1e_satd16x16_hbd_neon
           0.00%        [.] rav1e_sad16x8_hbd_neon
           0.00%        [.] rav1e_ipred_z1_fill2_16bpc_neon
           0.00%        [.] rav1e::ec::WriterBase<rav1e::ec::WriterEncoder>::done
           0.00%        [.] rav1e_prep_8tap_regular_16bpc_neon
           0.00%        [.] rav1e::context::partition_unit::<impl rav1e::context::cdf_context::ContextWriter>::write_cfl_alphas
           0.00%        [.] rav1e_inv_txfm_add_dct_dct_4x4_16bpc_neon
           0.00%        [.] free@plt
           0.00%        [.] <fern::log_impl::Dispatch as log::Log>::log
           0.00%        [.] rav1e::asm::aarch64::predict::ipred_z3
           0.00%        [.] rav1e::cdef::cdef_analyze_superblock_range
           0.00%        [.] rav1e_ipred_z1_upsample_edge_16bpc_neon
           0.00%        [.] inv_adst_4s_x4_neon
           0.00%        [.] core::ops::function::impls::<impl core::ops::function::FnMut<A> for &F>::call_mut
           0.00%        [.] exp@plt
           0.00%        [.] rayon::math::simplify_range
           0.00%        [.] core::ptr::drop_in_place<rayon::vec::Drain<rav1e::tiling::tiler::TileContextMut<u16>>>
           0.00%        [.] __aarch64_cas8_acq
           0.00%        [.] rav1e_inv_txfm_add_dct_dct_32x16_16bpc_neon
           0.00%        [.] <arrayvec::arrayvec::ArrayVec<T,_> as core::iter::traits::collect::FromIterator<T>>::from_iter
           0.00%        [.] rav1e_sad32x64_hbd_neon
           0.00%        [.] <rayon::vec::IntoIter<T> as rayon::iter::IndexedParallelIterator>::with_producer
           0.00%        [.] inv_adst_4s_x8_neon
           0.00%        [.] rav1e::encoder::FrameInvariants<T>::set_quantizers
           0.00%        [.] core::cmp::PartialOrd::ge
           0.00%        [.] alloc::collections::btree::node::Handle<alloc::collections::btree::node::NodeRef<alloc::collections::btree::node::marker::Mut,K,V,alloc::collections::btree::node::marker::Leaf>,alloc::collections::btree::node::marker::Edge>::insert_recursing
           0.00%        [.] rav1e::quantize::select_dc_qi
           0.00%        [.] <v_frame::frame::Frame<T> as core::clone::Clone>::clone
           0.00%        [.] rav1e::rate::RCState::select_qi
           0.00%        [.] inv_dct_4s_x4_neon
           0.00%        [.] rav1e_inv_adst_8h_x8_neon
           0.00%        [.] std::io::Write::write_fmt
           0.00%        [.] rav1e_ipred_z2_fill3_16bpc_neon
           0.00%        [.] rav1e::api::lookahead::compute_motion_vectors
           0.00%        [.] alloc::raw_vec::RawVec<T,A>::reserve::do_reserve_and_handle
           0.00%        [.] rayon::iter::plumbing::bridge_producer_consumer::helper
           0.00%        [.] rav1e_satd32x16_hbd_neon
           0.00%        [.] alloc::raw_vec::RawVec<T,A>::reserve_for_push
           0.00%        [.] rav1e_ipred_reverse_16bpc_neon
           0.00%        [.] <rav1e::tiling::tiler::TileContextIterMut<T> as core::iter::traits::iterator::Iterator>::next
           0.00%        [.] core::fmt::Formatter::pad_integral::write_prefix
           0.00%        [.] rav1e::segmentation::segmentation_optimize
           0.00%        [.] alloc::collections::vec_deque::VecDeque<T,A>::grow
           0.00%        [.] alloc::collections::btree::remove::<impl alloc::collections::btree::node::Handle<alloc::collections::btree::node::NodeRef<alloc::collections::btree::node::marker::Mut,K,V,alloc::collections::btree::node::marker::Leaf>,alloc::collections::btree::node::marker::KV>>::remove_leaf_kv
           0.00%        [.] <alloc::borrow::Cow<B> as core::fmt::Display>::fmt
           0.00%        [.] <bitstream_io::write::BitWriter<W,bitstream_io::BigEndian> as rav1e::header::UncompressedHeader>::write_deblock_filter_a
           0.00%        [.] alloc::collections::btree::remove::<impl alloc::collections::btree::node::Handle<alloc::collections::btree::node::NodeRef<alloc::collections::btree::node::marker::Mut,K,V,alloc::collections::btree::node::marker::LeafOrInternal>,alloc::collections::btree::node::marker::KV>>::remove_kv_tracking
           0.00%        [.] core::ptr::drop_in_place<rav1e::encoder::ReferenceFrame<u16>>
           0.00%        [.] <bitstream_io::write::BitWriter<W,bitstream_io::BigEndian> as rav1e::header::UncompressedHeader>::write_segment_data
           0.00%        [.] <alloc::vec::Vec<T> as alloc::vec::spec_from_iter::SpecFromIter<T,I>>::from_iter
           0.00%        [.] rav1e::decoder::y4m::<impl rav1e::decoder::Decoder for y4m::Decoder<alloc::boxed::Box<dyn std::io::Read+core::marker::Send>>>::read_frame
           0.00%        [.] <bitstream_io::write::BitWriter<W,E> as bitstream_io::write::BitWrite>::write
           0.00%        [.] rav1e::lrf::RestorationPlane::restoration_unit_by_stripe
           0.00%        [.] fern::log_impl::LevelConfiguration::find_exact
           0.00%        [.] core::fmt::float::<impl core::fmt::Display for f64>::fmt
           0.00%        [.] <rav1e::stats::ProgressInfo as core::fmt::Display>::fmt
           0.00%        [.] <bitstream_io::write::BitWriter<W,bitstream_io::BigEndian> as rav1e::header::ULEB128Writer>::write_uleb128
           0.00%        [.] rayon::iter::collect::collect_with_consumer
           0.00%        [.] alloc::sync::Arc<T,A>::drop_slow
           0.00%        [.] alloc::collections::btree::map::BTreeMap<K,V,A>::get
           0.00%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.00%        [.] rav1e::context::cdf_context::CDFContext::reset_counts
           0.00%        [.] <bitstream_io::write::BitWriter<W,E> as bitstream_io::write::BitWrite>::write
           0.00%        [.] <v_frame::frame::Frame<T> as rav1e::frame::FramePad>::pad
           0.00%        [.] <alloc::vec::Vec<T> as alloc::vec::spec_from_iter::SpecFromIter<T,I>>::from_iter
           0.00%        [.] rav1e::stats::ProgressInfo::print_chroma_prediction_mode_summary_by_frame_type
           0.00%        [.] <bitstream_io::write::BitWriter<W,bitstream_io::BigEndian> as rav1e::header::UncompressedHeader>::write_frame_header_obu
           0.00%        [.] rayon_core::join::join_context::{{closure}}
           0.00%        [.] core::num::flt2dec::strategy::dragon::format_exact
           0.00%        [.] <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold
           0.00%        [.] alloc::collections::btree::map::entry::OccupiedEntry<K,V,A>::remove_kv
           0.00%        [.] sched_yield@plt
           0.00%        [.] <rav1e::ec::WriterBase<S> as rav1e::ec::Writer>::symbol_with_update
           0.00%        [.] rav1e::context::cdf_context::CDFContext::new
           0.00%        [.] <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once
           0.00%        [.] rav1e_inv_adst_4h_x4_neon
           0.00%        [.] <alloc::collections::btree::map::Keys<K,V> as core::iter::traits::iterator::Iterator>::last
           0.00%        [.] <T as alloc::vec::spec_from_elem::SpecFromElem>::from_elem
           0.00%        [.] core::ptr::drop_in_place<rav1e::encoder::FrameState<u16>>
           0.00%        [.] <std::io::buffered::bufreader::BufReader<R> as std::io::Read>::read
           0.00%        [.] <rayon::iter::unzip::UnzipFolder<OP,FA,FB> as rayon::iter::plumbing::Folder<T>>::consume
           0.00%        [.] <rav1e::cpu_features::aarch64::CpuFeatureLevel as core::default::Default>::default
           0.00%        [.] alloc::sync::Arc<T,A>::make_mut
           0.00%        [.] <std::io::stdio::StdinLock as std::io::Read>::read_exact
           0.00%        [.] core::ptr::drop_in_place<rayon::vec::DrainProducer<rav1e::tiling::tiler::TileContextMut<u16>>>
           0.00%        [.] rayon_core::join::join_context::{{closure}}
           0.00%        [.] rav1e::encoder::FrameInvariants<T>::new_key_frame
           0.00%        [.] rav1e::util::logexp::blog64
           0.00%        [.] v_frame::plane::Plane<T>::probe_padding
           0.00%        [.] rav1e::encoder::FrameInvariants<T>::set_ref_frame_sign_bias
           0.00%        [.] std::env::_var_os
           0.00%        [.] rayon_core::registry::Registry::in_worker_cold
           0.00%        [.] clap_builder::parser::arg_matcher::ArgMatcher::fill_in_global_values
           0.00%        [.] clap_builder::builder::command::Command::arg
        6.87%        libc.so.6        
           2.13%        [.] 0x0000000000099e50
            |          
             --1.98%--rav1e::asm::aarch64::transform::forward::forward_transform_neon
                       rav1e::asm::aarch64::transform::forward::daala_fdct64
                       0xffff82369e50

           2.08%        [.] 0x0000000000099e48
            |          
             --1.95%--rav1e::asm::aarch64::transform::forward::forward_transform_neon
                       rav1e::asm::aarch64::transform::forward::daala_fdct64
                       0xffff82369e48