8-bit AVX2 MC HV* Optimisations

frankplow commented 5 months ago

The functions ff_h2656_put_(uni_)?{4,8}tap_hv32_8_avx2 are defined in libavcodec/x86/h26x/h2656_inter.asm. They are used directly in the HEVC decoder, and also indirectly to define optimisations for larger sizes ff_h2656_put_(uni_)?{4,8}tap_hv{64,128}_8_avx2, using the helper mc_rep_func.

None of these functions are currently used in the VVC decoder. As HEVC does not have size-128 CUs, the size-128 functions are not used anywhere. The patch below updates the VVC decoder to use these optimisations.

 libavcodec/x86/vvc/vvcdsp_init.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 4b4a2aa937..b341e2e85a 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -162,7 +162,10 @@ FW_PUT_SSE4(12)
     FW_PUT(n ## tap_h128,  bitd, avx2)  \
     FW_PUT(n ## tap_v32,   bitd, avx2)  \
     FW_PUT(n ## tap_v64,   bitd, avx2)  \
-    FW_PUT(n ## tap_v128,  bitd, avx2)
+    FW_PUT(n ## tap_v128,  bitd, avx2)  \
+    FW_PUT(n ## tap_hv32,  bitd, avx2)  \
+    FW_PUT(n ## tap_hv64,  bitd, avx2)  \
+    FW_PUT(n ## tap_hv128, bitd, avx2)

 #define FW_PUT_AVX2(bitd) \
     FW_PUT(pixels32,  bitd, avx2) \
@@ -178,10 +181,7 @@ FW_PUT_AVX2(12)
 #define FW_PUT_TAP_16BPC_AVX2(n, bitd) \
     FW_PUT(n ## tap_h16,   bitd, avx2) \
     FW_PUT(n ## tap_v16,   bitd, avx2) \
-    FW_PUT(n ## tap_hv16,  bitd, avx2) \
-    FW_PUT(n ## tap_hv32,  bitd, avx2) \
-    FW_PUT(n ## tap_hv64,  bitd, avx2) \
-    FW_PUT(n ## tap_hv128, bitd, avx2)
+    FW_PUT(n ## tap_hv16,  bitd, avx2)

 #define FW_PUT_16BPC_AVX2(bitd)     \
     FW_PUT(pixels16, bitd, avx2)    \
@@ -281,6 +281,9 @@ ALF_FUNCS(16, 12, avx2)
         PEL_LINK(c->inter.put, C, 4, 1, 0, tap##tap_v32,  bd, avx2)  \
         PEL_LINK(c->inter.put, C, 5, 1, 0, tap##tap_v64,  bd, avx2)  \
         PEL_LINK(c->inter.put, C, 6, 1, 0, tap##tap_v128, bd, avx2)  \
+        PEL_LINK(c->inter.put, C, 4, 1, 1, tap##tap_hv32,  bd, avx2) \
+        PEL_LINK(c->inter.put, C, 5, 1, 1, tap##tap_hv64,  bd, avx2) \
+        PEL_LINK(c->inter.put, C, 6, 1, 1, tap##tap_hv128, bd, avx2) \
     } while (0)

 #define MC_LINKS_AVX2(bd)                                            \
@@ -292,9 +295,6 @@ ALF_FUNCS(16, 12, avx2)
         PEL_LINK(c->inter.put, C, 3, 0, 1, tap##tap_h16, bd, avx2)   \
         PEL_LINK(c->inter.put, C, 3, 1, 0, tap##tap_v16, bd, avx2)   \
         PEL_LINK(c->inter.put, C, 3, 1, 1, tap##tap_hv16, bd, avx2)  \
-        PEL_LINK(c->inter.put, C, 4, 1, 1, tap##tap_hv32, bd, avx2)  \
-        PEL_LINK(c->inter.put, C, 5, 1, 1, tap##tap_hv64, bd, avx2)  \
-        PEL_LINK(c->inter.put, C, 6, 1, 1, tap##tap_hv128, bd, avx2) \
     } while (0)

 #define MC_LINKS_16BPC_AVX2(bd)                                      \

Unfortunately, applying this patch results in checkasm and the conformance bitstreams failing. It appears that there is some difference between HEVC and VVC that is not being accounted for. The problem is specific to 8-bit HV; 8-bit H and V functions and HV functions for >= 10-bit are already in use and work correctly.

frankplow commented 5 months ago

Also the HEVC decoder uses AVX2 for the 4-tap HV32 but not the 8-tap, perhaps we'd want to add the patch below?

diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 2c0fca303e..ca9ee4d60c 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -184,6 +184,7 @@ FW_EPEL_HV(16, 10, avx2)
 FW_QPEL(32,  8, avx2)
 FW_QPEL(16, 10, avx2)

+FW_QPEL_HV(32, 8, avx2)
 FW_QPEL_HV(16, 10, avx2)

 #endif

nuomi2021 commented 5 months ago

Hi @frankplow , thank you for the issue. @QSXW could you help this.

thank you

QSXW commented 5 months ago

Hi. What is the error shown when applying this patch?

frankplow commented 5 months ago

@QSXW

Hi. What is the error shown when applying this patch?

There are no error messages, the patch causes mismatches in the checkasm and conformance suite. In other words, these optimisations do not perform the same operation as their C equivalents.

QSXW commented 5 months ago

Sorry my bad. I mean what is the error shown by checkasm? Can you paste them here?

frankplow commented 5 months ago

@QSXW

Sorry my bad. I mean what is the error shown by checkasm? Can you paste them here?

It only shows that there is a mismatch for these functions:

frank@desk ~/dev/ffmpeg (master *) 
> ./tests/checkasm/checkasm --test=vvc_mc
checkasm: using random seed 837724631
SSE4.1:
 - vvc_mc.put_luma       [OK]
 - vvc_mc.put_uni_luma   [OK]
 - vvc_mc.put_chroma     [OK]
 - vvc_mc.put_uni_chroma [OK]
AVX2:
 - vvc_mc.sad            [OK]
   put_luma_hv_8_32x4_avx2 (vvc_mc.c:103)
   put_luma_hv_8_64x4_avx2 (vvc_mc.c:103)
   put_luma_hv_8_128x4_avx2 (vvc_mc.c:103)
   put_luma_hv_8_32x8_avx2 (vvc_mc.c:103)
   put_luma_hv_8_64x8_avx2 (vvc_mc.c:103)
   put_luma_hv_8_128x8_avx2 (vvc_mc.c:103)
   put_luma_hv_8_32x16_avx2 (vvc_mc.c:103)
   put_luma_hv_8_64x16_avx2 (vvc_mc.c:103)
   put_luma_hv_8_128x16_avx2 (vvc_mc.c:103)
   put_luma_hv_8_32x32_avx2 (vvc_mc.c:103)
   put_luma_hv_8_64x32_avx2 (vvc_mc.c:103)
   put_luma_hv_8_128x32_avx2 (vvc_mc.c:103)
   put_luma_hv_8_32x64_avx2 (vvc_mc.c:103)
   put_luma_hv_8_64x64_avx2 (vvc_mc.c:103)
   put_luma_hv_8_128x64_avx2 (vvc_mc.c:103)
   put_luma_hv_8_32x128_avx2 (vvc_mc.c:103)
   put_luma_hv_8_64x128_avx2 (vvc_mc.c:103)
   put_luma_hv_8_128x128_avx2 (vvc_mc.c:103)
 - vvc_mc.put_luma       [FAILED]
   put_uni_hv_luma_8_32x4_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_64x4_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_128x4_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_32x8_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_64x8_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_128x8_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_32x16_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_64x16_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_128x16_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_32x32_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_64x32_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_128x32_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_32x64_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_64x64_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_128x64_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_32x128_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_64x128_avx2 (vvc_mc.c:154)
   put_uni_hv_luma_8_128x128_avx2 (vvc_mc.c:154)
 - vvc_mc.put_uni_luma   [FAILED]
 - vvc_mc.put_chroma     [OK]
 - vvc_mc.put_uni_chroma [OK]
 - vvc_mc.avg            [OK]
checkasm: 36 of 3281 tests have failed

nuomi2021 commented 4 months ago

@QSXW could you help check? thank you

QSXW commented 4 months ago

@QSXW could you help check? thank you

Sorry, recently I got stuck with some personal errands. I'm back now and I will check it by two days.

QSXW commented 4 months ago

FW_QPEL_HV(32, 8, avx2)

Hi @nuomi2021 @frankplow

   put_hevc_qpel_hv32_8_avx2 (tests/checkasm/hevc_pel.c:116)
 - hevc_pel.qpel                         [FAILED]

The hevc actually doesn't use the hv32, where there may have some issue to make them disable the usage of hv32, which means it cannot be used by vvc as well directly.

The hv32 avx2 algorithm doesn't exist. It is only used to make that #define MC_REP_FUNCS_AVX2(fname) will not break the compilation.

frankplow commented 4 months ago

@QSXW

FW_QPEL_HV(32, 8, avx2)
Hi @nuomi2021 @frankplow
   put_hevc_qpel_hv32_8_avx2 (tests/checkasm/hevc_pel.c:116)
 - hevc_pel.qpel                         [FAILED]
The hevc actually doesn't use the hv32, where there may have some issue to make them disable the usage of hv32, which means it cannot be used by vvc as well directly.

The hv32 avx2 algorithm doesn't exist. It is only used to make that #define MC_REP_FUNCS_AVX2(fname) will not break the compilation.

I'm not sure I understand entirely how the HV32 algorithm doesn't exist? What is the code run by, for example, put_uni_hv_luma_8_32x4_avx2 then?

In any case, if it is not trivial to fix/write assembly optimisations for these functions, would you agree the best way forward is to modify MC_REP_FUNCS_AVX2 and any other macros which declare/define these functions? So as not to pollute the namespace with unused symbols and reduce binary size.

QSXW commented 4 months ago

@QSXW
FW_QPEL_HV(32, 8, avx2)
Hi @nuomi2021 @frankplow
   put_hevc_qpel_hv32_8_avx2 (tests/checkasm/hevc_pel.c:116)
 - hevc_pel.qpel                         [FAILED]
The hevc actually doesn't use the hv32, where there may have some issue to make them disable the usage of hv32, which means it cannot be used by vvc as well directly. The hv32 avx2 algorithm doesn't exist. It is only used to make that #define MC_REP_FUNCS_AVX2(fname) will not break the compilation.
I'm not sure I understand entirely how the HV32 algorithm doesn't exist? What is the code run by, for example, put_uni_hv_luma_8_32x4_avx2 then?

The author of hevc asm just doesn't write an available hv32 assembly code. The symbol of hv32_8_avx2 there are just used to make the compilation successful actually.

In any case, if it is not trivial to fix/write assembly optimisations for these functions, would you agree the best way forward is to modify MC_REP_FUNCS_AVX2 and any other macros which declare/define these functions? So as not to pollute the namespace with unused symbols and reduce binary size.

I think it will be difficult to modify the MC_REP_FUNCS_AVX2, so the former author adds a faker entry of hv32 there. And, I think the compiler will strip the unused symbols.

ffvvc / FFmpeg

8-bit AVX2 MC HV* Optimisations #234