Open hipudding opened 5 months ago
[toc]
推理后端 | 内存/显存 |
---|---|
Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz 8核 16线程 | 64G |
Ascend | 64G |
mkdir build
cd build
cmake .. -DLLAMA_CANN=ON
cmake --build . --config Release
Backend 1/1 (CPU)
Backend name: CPU
ADD(type=f32,ne=[1,1,8,1],nr=[1,1,1,1]): 8192 runs - 1.62 us/run - 0 kB/run - 0.06 GB/s
ADD(type=f32,ne=[1,1,1,1],nr=[32,1,1,1]): 8192 runs - 1.60 us/run - 0 kB/run - 0.22 GB/s
ADD(type=f32,ne=[1,1,320,320],nr=[1,1,1,1]): 6991 runs - 449.66 us/run - 1200 kB/run - 2.55 GB/s
ADD(type=f32,ne=[16,10,1,1],nr=[1,1,1,1]): 8192 runs - 1.73 us/run - 1 kB/run - 1.04 GB/s
ADD(type=f32,ne=[16,10,10,1],nr=[1,1,1,1]): 8192 runs - 2.08 us/run - 18 kB/run - 8.58 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[1,1,1,1]): 8192 runs - 6.03 us/run - 187 kB/run - 29.67 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[2,1,1,1]): 8192 runs - 6.99 us/run - 375 kB/run - 51.16 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[1,2,1,1]): 8192 runs - 10.77 us/run - 375 kB/run - 33.19 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[1,1,2,1]): 8192 runs - 10.64 us/run - 375 kB/run - 33.63 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[1,1,1,2]): 8192 runs - 13.36 us/run - 375 kB/run - 26.77 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[1,1,2,2]): 8192 runs - 19.62 us/run - 750 kB/run - 36.46 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[1,2,2,2]): 5593 runs - 37.53 us/run - 1500 kB/run - 38.12 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[2,2,2,2]): 2797 runs - 44.91 us/run - 3000 kB/run - 63.70 GB/s
ADD(type=f32,ne=[1280,1,1,1],nr=[1,1,1,1]): 8192 runs - 1.58 us/run - 15 kB/run - 9.04 GB/s
ADD(type=f32,ne=[1280,1,1,1],nr=[1,16,16,1]): 2185 runs - 22.45 us/run - 3840 kB/run - 163.10 GB/s
ADD(type=f32,ne=[1280,16,16,1],nr=[1,1,1,1]): 2185 runs - 15.38 us/run - 3840 kB/run - 238.15 GB/s
ADD(type=f32,ne=[1280,1,1,1],nr=[1,256,1,1]): 2185 runs - 19.36 us/run - 3840 kB/run - 189.19 GB/s
ADD(type=f32,ne=[1,1,1280,1],nr=[16,16,1,1]): 2185 runs - 279.71 us/run - 3840 kB/run - 13.09 GB/s
ADD(type=f32,ne=[16,16,1280,1],nr=[1,1,1,1]): 2185 runs - 95.61 us/run - 3840 kB/run - 38.30 GB/s
ADD(type=f32,ne=[1,1,1920,1],nr=[16,16,1,1]): 1457 runs - 418.49 us/run - 5760 kB/run - 13.13 GB/s
ADD(type=f32,ne=[1,1,2560,1],nr=[16,16,1,1]): 1093 runs - 572.52 us/run - 7680 kB/run - 12.79 GB/s
ADD(type=f32,ne=[1,1,1280,1],nr=[32,32,1,1]): 547 runs - 918.11 us/run - 15360 kB/run - 15.95 GB/s
ADD(type=f32,ne=[1,1,1920,1],nr=[32,32,1,1]): 365 runs - 1404.75 us/run - 23040 kB/run - 15.64 GB/s
ADD(type=f32,ne=[1,1,640,1],nr=[32,32,1,1]): 1093 runs - 459.20 us/run - 7680 kB/run - 15.95 GB/s
ADD(type=f32,ne=[5120,1,1,1],nr=[1,256,1,1]): 547 runs - 116.24 us/run - 15360 kB/run - 126.02 GB/s
ADD(type=f32,ne=[640,1,1,1],nr=[1,1,1,1]): 8192 runs - 1.55 us/run - 7 kB/run - 4.61 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
ADD(type=f32,ne=[1,1,8,1],nr=[1,1,1,1]): 8192 runs - 7.77 us/run - 0 kB/run - 0.01 GB/s
ADD(type=f32,ne=[1,1,1,1],nr=[32,1,1,1]): 8192 runs - 8.75 us/run - 0 kB/run - 0.04 GB/s
ADD(type=f32,ne=[1,1,320,320],nr=[1,1,1,1]): 8192 runs - 8.14 us/run - 1200 kB/run - 140.57 GB/s
ADD(type=f32,ne=[16,10,1,1],nr=[1,1,1,1]): 8192 runs - 8.16 us/run - 1 kB/run - 0.22 GB/s
ADD(type=f32,ne=[16,10,10,1],nr=[1,1,1,1]): 8192 runs - 8.59 us/run - 18 kB/run - 2.08 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[1,1,1,1]): 8192 runs - 8.13 us/run - 187 kB/run - 21.98 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[2,1,1,1]): 8192 runs - 8.15 us/run - 375 kB/run - 43.87 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[1,2,1,1]): 8192 runs - 8.92 us/run - 375 kB/run - 40.08 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[1,1,2,1]): 8192 runs - 8.05 us/run - 375 kB/run - 44.45 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[1,1,1,2]): 8192 runs - 8.03 us/run - 375 kB/run - 44.51 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[1,1,2,2]): 8192 runs - 8.08 us/run - 750 kB/run - 88.49 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[1,2,2,2]): 8192 runs - 8.54 us/run - 1500 kB/run - 167.48 GB/s
ADD(type=f32,ne=[16,10,10,10],nr=[2,2,2,2]): 8192 runs - 9.44 us/run - 3000 kB/run - 303.13 GB/s
ADD(type=f32,ne=[1280,1,1,1],nr=[1,1,1,1]): 8192 runs - 7.93 us/run - 15 kB/run - 1.80 GB/s
ADD(type=f32,ne=[1280,1,1,1],nr=[1,16,16,1]): 8192 runs - 7.90 us/run - 3840 kB/run - 463.72 GB/s
ADD(type=f32,ne=[1280,16,16,1],nr=[1,1,1,1]): 8192 runs - 7.91 us/run - 3840 kB/run - 463.18 GB/s
ADD(type=f32,ne=[1280,1,1,1],nr=[1,256,1,1]): 8192 runs - 8.49 us/run - 3840 kB/run - 431.20 GB/s
ADD(type=f32,ne=[1,1,1280,1],nr=[16,16,1,1]): 8192 runs - 7.85 us/run - 3840 kB/run - 466.67 GB/s
ADD(type=f32,ne=[16,16,1280,1],nr=[1,1,1,1]): 8192 runs - 7.90 us/run - 3840 kB/run - 463.79 GB/s
ADD(type=f32,ne=[1,1,1920,1],nr=[16,16,1,1]): 5826 runs - 7.89 us/run - 5760 kB/run - 696.16 GB/s
ADD(type=f32,ne=[1,1,2560,1],nr=[16,16,1,1]): 4370 runs - 7.88 us/run - 7680 kB/run - 929.89 GB/s
ADD(type=f32,ne=[1,1,1280,1],nr=[32,32,1,1]): 2185 runs - 8.32 us/run - 15360 kB/run - 1759.78 GB/s
ADD(type=f32,ne=[1,1,1920,1],nr=[32,32,1,1]): 1457 runs - 9.81 us/run - 23040 kB/run - 2240.63 GB/s
ADD(type=f32,ne=[1,1,640,1],nr=[32,32,1,1]): 4370 runs - 7.85 us/run - 7680 kB/run - 932.55 GB/s
ADD(type=f32,ne=[5120,1,1,1],nr=[1,256,1,1]): 2185 runs - 9.26 us/run - 15360 kB/run - 1581.05 GB/s
ADD(type=f32,ne=[640,1,1,1],nr=[1,1,1,1]): 8192 runs - 7.88 us/run - 7 kB/run - 0.91 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
ACC(type=f32,ne_a=[1024,577,1,1],ne_b=[1024,576,1,1]): 1213 runs - 244.46 us/run - 6920 kB/run - 27.00 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
ACC(type=f32,ne_a=[1024,577,1,1],ne_b=[1024,576,1,1]): 4849 runs - 12.37 us/run - 6920 kB/run - 533.53 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
MUL(type=f32,ne=[1,1,8,1],nr=[1,1,1,1]): 8192 runs - 2.26 us/run - 0 kB/run - 0.04 GB/s
MUL(type=f32,ne=[1,1,1,1],nr=[32,1,1,1]): 8192 runs - 1.83 us/run - 0 kB/run - 0.20 GB/s
MUL(type=f32,ne=[1,1,320,320],nr=[1,1,1,1]): 6991 runs - 517.90 us/run - 1200 kB/run - 2.21 GB/s
MUL(type=f32,ne=[16,10,1,1],nr=[1,1,1,1]): 8192 runs - 1.98 us/run - 1 kB/run - 0.90 GB/s
MUL(type=f32,ne=[16,10,10,1],nr=[1,1,1,1]): 8192 runs - 2.28 us/run - 18 kB/run - 7.84 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[1,1,1,1]): 8192 runs - 18.86 us/run - 187 kB/run - 9.48 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[2,1,1,1]): 8192 runs - 8.04 us/run - 375 kB/run - 44.49 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[1,2,1,1]): 8192 runs - 12.15 us/run - 375 kB/run - 29.44 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[1,1,2,1]): 8192 runs - 14.02 us/run - 375 kB/run - 25.51 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[1,1,1,2]): 8192 runs - 12.11 us/run - 375 kB/run - 29.54 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[1,1,2,2]): 8192 runs - 25.10 us/run - 750 kB/run - 28.50 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[1,2,2,2]): 5593 runs - 98.92 us/run - 1500 kB/run - 14.46 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[2,2,2,2]): 2797 runs - 76.57 us/run - 3000 kB/run - 37.37 GB/s
MUL(type=f32,ne=[1280,1,1,1],nr=[1,1,1,1]): 8192 runs - 1.79 us/run - 15 kB/run - 7.98 GB/s
MUL(type=f32,ne=[1280,1,1,1],nr=[1,16,16,1]): 2185 runs - 16.44 us/run - 3840 kB/run - 222.70 GB/s
MUL(type=f32,ne=[1280,16,16,1],nr=[1,1,1,1]): 2185 runs - 37.19 us/run - 3840 kB/run - 98.48 GB/s
MUL(type=f32,ne=[1280,1,1,1],nr=[1,256,1,1]): 2185 runs - 25.18 us/run - 3840 kB/run - 145.43 GB/s
MUL(type=f32,ne=[1,1,1280,1],nr=[16,16,1,1]): 2185 runs - 346.94 us/run - 3840 kB/run - 10.56 GB/s
MUL(type=f32,ne=[16,16,1280,1],nr=[1,1,1,1]): 2185 runs - 141.48 us/run - 3840 kB/run - 25.88 GB/s
MUL(type=f32,ne=[1,1,1920,1],nr=[16,16,1,1]): 1457 runs - 547.23 us/run - 5760 kB/run - 10.04 GB/s
MUL(type=f32,ne=[1,1,2560,1],nr=[16,16,1,1]): 1093 runs - 678.00 us/run - 7680 kB/run - 10.80 GB/s
MUL(type=f32,ne=[1,1,1280,1],nr=[32,32,1,1]): 547 runs - 1182.22 us/run - 15360 kB/run - 12.39 GB/s
MUL(type=f32,ne=[1,1,1920,1],nr=[32,32,1,1]): 365 runs - 1821.39 us/run - 23040 kB/run - 12.06 GB/s
MUL(type=f32,ne=[1,1,640,1],nr=[32,32,1,1]): 1093 runs - 604.43 us/run - 7680 kB/run - 12.12 GB/s
MUL(type=f32,ne=[5120,1,1,1],nr=[1,256,1,1]): 547 runs - 96.47 us/run - 15360 kB/run - 151.84 GB/s
MUL(type=f32,ne=[640,1,1,1],nr=[1,1,1,1]): 8192 runs - 1.87 us/run - 7 kB/run - 3.83 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
MUL(type=f32,ne=[1,1,8,1],nr=[1,1,1,1]): 8192 runs - 8.21 us/run - 0 kB/run - 0.01 GB/s
MUL(type=f32,ne=[1,1,1,1],nr=[32,1,1,1]): 8192 runs - 7.80 us/run - 0 kB/run - 0.05 GB/s
MUL(type=f32,ne=[1,1,320,320],nr=[1,1,1,1]): 8192 runs - 7.79 us/run - 1200 kB/run - 146.95 GB/s
MUL(type=f32,ne=[16,10,1,1],nr=[1,1,1,1]): 8192 runs - 7.98 us/run - 1 kB/run - 0.22 GB/s
MUL(type=f32,ne=[16,10,10,1],nr=[1,1,1,1]): 8192 runs - 8.41 us/run - 18 kB/run - 2.13 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[1,1,1,1]): 8192 runs - 7.73 us/run - 187 kB/run - 23.14 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[2,1,1,1]): 8192 runs - 7.81 us/run - 375 kB/run - 45.77 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[1,2,1,1]): 8192 runs - 7.82 us/run - 375 kB/run - 45.73 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[1,1,2,1]): 8192 runs - 7.81 us/run - 375 kB/run - 45.81 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[1,1,1,2]): 8192 runs - 7.84 us/run - 375 kB/run - 45.63 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[1,1,2,2]): 8192 runs - 7.88 us/run - 750 kB/run - 90.79 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[1,2,2,2]): 8192 runs - 8.53 us/run - 1500 kB/run - 167.70 GB/s
MUL(type=f32,ne=[16,10,10,10],nr=[2,2,2,2]): 8192 runs - 9.47 us/run - 3000 kB/run - 302.22 GB/s
MUL(type=f32,ne=[1280,1,1,1],nr=[1,1,1,1]): 8192 runs - 7.76 us/run - 15 kB/run - 1.84 GB/s
MUL(type=f32,ne=[1280,1,1,1],nr=[1,16,16,1]): 8192 runs - 7.77 us/run - 3840 kB/run - 471.41 GB/s
MUL(type=f32,ne=[1280,16,16,1],nr=[1,1,1,1]): 8192 runs - 7.75 us/run - 3840 kB/run - 472.52 GB/s
MUL(type=f32,ne=[1280,1,1,1],nr=[1,256,1,1]): 8192 runs - 7.78 us/run - 3840 kB/run - 470.90 GB/s
MUL(type=f32,ne=[1,1,1280,1],nr=[16,16,1,1]): 8192 runs - 7.70 us/run - 3840 kB/run - 475.52 GB/s
MUL(type=f32,ne=[16,16,1280,1],nr=[1,1,1,1]): 8192 runs - 7.75 us/run - 3840 kB/run - 472.57 GB/s
MUL(type=f32,ne=[1,1,1920,1],nr=[16,16,1,1]): 5826 runs - 7.73 us/run - 5760 kB/run - 710.99 GB/s
MUL(type=f32,ne=[1,1,2560,1],nr=[16,16,1,1]): 4370 runs - 7.76 us/run - 7680 kB/run - 943.46 GB/s
MUL(type=f32,ne=[1,1,1280,1],nr=[32,32,1,1]): 2185 runs - 8.27 us/run - 15360 kB/run - 1771.86 GB/s
MUL(type=f32,ne=[1,1,1920,1],nr=[32,32,1,1]): 1457 runs - 9.79 us/run - 23040 kB/run - 2244.40 GB/s
MUL(type=f32,ne=[1,1,640,1],nr=[32,32,1,1]): 4370 runs - 7.67 us/run - 7680 kB/run - 954.52 GB/s
MUL(type=f32,ne=[5120,1,1,1],nr=[1,256,1,1]): 2185 runs - 9.29 us/run - 15360 kB/run - 1577.62 GB/s
MUL(type=f32,ne=[640,1,1,1],nr=[1,1,1,1]): 8192 runs - 7.79 us/run - 7 kB/run - 0.92 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
DIV(type=f32,ne=[1,1,8,1],nr=[1,1,1,1]): 8192 runs - 1.58 us/run - 0 kB/run - 0.06 GB/s
DIV(type=f32,ne=[1,1,1,1],nr=[32,1,1,1]): 8192 runs - 1.56 us/run - 0 kB/run - 0.23 GB/s
DIV(type=f32,ne=[1,1,320,320],nr=[1,1,1,1]): 6991 runs - 524.49 us/run - 1200 kB/run - 2.18 GB/s
DIV(type=f32,ne=[16,10,1,1],nr=[1,1,1,1]): 8192 runs - 1.95 us/run - 1 kB/run - 0.92 GB/s
DIV(type=f32,ne=[16,10,10,1],nr=[1,1,1,1]): 8192 runs - 1.99 us/run - 18 kB/run - 8.97 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[1,1,1,1]): 8192 runs - 15.88 us/run - 187 kB/run - 11.26 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[2,1,1,1]): 8192 runs - 8.08 us/run - 375 kB/run - 44.26 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[1,2,1,1]): 8192 runs - 12.05 us/run - 375 kB/run - 29.68 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[1,1,2,1]): 8192 runs - 11.83 us/run - 375 kB/run - 30.23 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[1,1,1,2]): 8192 runs - 11.96 us/run - 375 kB/run - 29.91 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[1,1,2,2]): 8192 runs - 23.32 us/run - 750 kB/run - 30.67 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[1,2,2,2]): 5593 runs - 110.49 us/run - 1500 kB/run - 12.95 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[2,2,2,2]): 2797 runs - 74.99 us/run - 3000 kB/run - 38.15 GB/s
DIV(type=f32,ne=[1280,1,1,1],nr=[1,1,1,1]): 8192 runs - 1.58 us/run - 15 kB/run - 9.05 GB/s
DIV(type=f32,ne=[1280,1,1,1],nr=[1,16,16,1]): 2185 runs - 17.70 us/run - 3840 kB/run - 206.89 GB/s
DIV(type=f32,ne=[1280,16,16,1],nr=[1,1,1,1]): 2185 runs - 22.97 us/run - 3840 kB/run - 159.46 GB/s
DIV(type=f32,ne=[1280,1,1,1],nr=[1,256,1,1]): 2185 runs - 17.21 us/run - 3840 kB/run - 212.78 GB/s
DIV(type=f32,ne=[1,1,1280,1],nr=[16,16,1,1]): 2185 runs - 358.86 us/run - 3840 kB/run - 10.20 GB/s
DIV(type=f32,ne=[16,16,1280,1],nr=[1,1,1,1]): 2185 runs - 140.07 us/run - 3840 kB/run - 26.14 GB/s
DIV(type=f32,ne=[1,1,1920,1],nr=[16,16,1,1]): 1457 runs - 556.51 us/run - 5760 kB/run - 9.87 GB/s
DIV(type=f32,ne=[1,1,2560,1],nr=[16,16,1,1]): 1093 runs - 769.92 us/run - 7680 kB/run - 9.51 GB/s
DIV(type=f32,ne=[1,1,1280,1],nr=[32,32,1,1]): 547 runs - 1277.11 us/run - 15360 kB/run - 11.47 GB/s
DIV(type=f32,ne=[1,1,1920,1],nr=[32,32,1,1]): 365 runs - 1849.03 us/run - 23040 kB/run - 11.88 GB/s
DIV(type=f32,ne=[1,1,640,1],nr=[32,32,1,1]): 1093 runs - 599.96 us/run - 7680 kB/run - 12.21 GB/s
DIV(type=f32,ne=[5120,1,1,1],nr=[1,256,1,1]): 547 runs - 95.95 us/run - 15360 kB/run - 152.67 GB/s
DIV(type=f32,ne=[640,1,1,1],nr=[1,1,1,1]): 8192 runs - 1.50 us/run - 7 kB/run - 4.77 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
DIV(type=f32,ne=[1,1,8,1],nr=[1,1,1,1]): 8192 runs - 8.19 us/run - 0 kB/run - 0.01 GB/s
DIV(type=f32,ne=[1,1,1,1],nr=[32,1,1,1]): 8192 runs - 7.77 us/run - 0 kB/run - 0.05 GB/s
DIV(type=f32,ne=[1,1,320,320],nr=[1,1,1,1]): 8192 runs - 7.83 us/run - 1200 kB/run - 146.23 GB/s
DIV(type=f32,ne=[16,10,1,1],nr=[1,1,1,1]): 8192 runs - 7.70 us/run - 1 kB/run - 0.23 GB/s
DIV(type=f32,ne=[16,10,10,1],nr=[1,1,1,1]): 8192 runs - 8.56 us/run - 18 kB/run - 2.09 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[1,1,1,1]): 8192 runs - 7.75 us/run - 187 kB/run - 23.06 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[2,1,1,1]): 8192 runs - 7.74 us/run - 375 kB/run - 46.19 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[1,2,1,1]): 8192 runs - 7.76 us/run - 375 kB/run - 46.08 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[1,1,2,1]): 8192 runs - 7.71 us/run - 375 kB/run - 46.41 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[1,1,1,2]): 8192 runs - 7.70 us/run - 375 kB/run - 46.44 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[1,1,2,2]): 8192 runs - 7.89 us/run - 750 kB/run - 90.65 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[1,2,2,2]): 8192 runs - 8.12 us/run - 1500 kB/run - 176.08 GB/s
DIV(type=f32,ne=[16,10,10,10],nr=[2,2,2,2]): 8192 runs - 9.09 us/run - 3000 kB/run - 314.91 GB/s
DIV(type=f32,ne=[1280,1,1,1],nr=[1,1,1,1]): 8192 runs - 7.76 us/run - 15 kB/run - 1.84 GB/s
DIV(type=f32,ne=[1280,1,1,1],nr=[1,16,16,1]): 8192 runs - 7.81 us/run - 3840 kB/run - 468.76 GB/s
DIV(type=f32,ne=[1280,16,16,1],nr=[1,1,1,1]): 8192 runs - 7.77 us/run - 3840 kB/run - 471.38 GB/s
DIV(type=f32,ne=[1280,1,1,1],nr=[1,256,1,1]): 8192 runs - 7.65 us/run - 3840 kB/run - 478.88 GB/s
DIV(type=f32,ne=[1,1,1280,1],nr=[16,16,1,1]): 8192 runs - 7.64 us/run - 3840 kB/run - 479.47 GB/s
DIV(type=f32,ne=[16,16,1280,1],nr=[1,1,1,1]): 8192 runs - 7.79 us/run - 3840 kB/run - 470.21 GB/s
DIV(type=f32,ne=[1,1,1920,1],nr=[16,16,1,1]): 5826 runs - 7.70 us/run - 5760 kB/run - 713.00 GB/s
DIV(type=f32,ne=[1,1,2560,1],nr=[16,16,1,1]): 4370 runs - 7.84 us/run - 7680 kB/run - 934.53 GB/s
DIV(type=f32,ne=[1,1,1280,1],nr=[32,32,1,1]): 2185 runs - 8.70 us/run - 15360 kB/run - 1683.77 GB/s
DIV(type=f32,ne=[1,1,1920,1],nr=[32,32,1,1]): 1457 runs - 10.50 us/run - 23040 kB/run - 2092.84 GB/s
DIV(type=f32,ne=[1,1,640,1],nr=[32,32,1,1]): 4370 runs - 7.77 us/run - 7680 kB/run - 943.01 GB/s
DIV(type=f32,ne=[5120,1,1,1],nr=[1,256,1,1]): 2185 runs - 9.16 us/run - 15360 kB/run - 1598.50 GB/s
DIV(type=f32,ne=[640,1,1,1],nr=[1,1,1,1]): 8192 runs - 7.82 us/run - 7 kB/run - 0.91 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
GELU(type=f32,ne=[128,10,10,10]): 8192 runs - 166.72 us/run - 1000 kB/run - 5.72 GB/s
GELU(type=f32,ne=[7,13,19,23]): 8192 runs - 48.31 us/run - 310 kB/run - 6.13 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
GELU(type=f32,ne=[128,10,10,10]): 8192 runs - 11.09 us/run - 1000 kB/run - 86.01 GB/s
GELU(type=f32,ne=[7,13,19,23]): 8192 runs - 11.08 us/run - 310 kB/run - 26.74 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
SILU(type=f32,ne=[128,10,10,10]): 8192 runs - 23.57 us/run - 1000 kB/run - 40.46 GB/s
SILU(type=f32,ne=[7,13,19,23]): 8192 runs - 13.51 us/run - 310 kB/run - 21.93 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
SILU(type=f32,ne=[128,10,10,10]): 8192 runs - 11.09 us/run - 1000 kB/run - 85.98 GB/s
SILU(type=f32,ne=[7,13,19,23]): 8192 runs - 11.08 us/run - 310 kB/run - 26.73 GB/s
Backend CANN0: OK
实际上调用的是GELU算子,未实现GELU_QUICK。
Backend 1/1 (CPU)
Backend name: CPU
GELU_QUICK(type=f32,ne=[128,10,10,10]): 8192 runs - 31.43 us/run - 1000 kB/run - 30.35 GB/s
GELU_QUICK(type=f32,ne=[7,13,19,23]): 8192 runs - 13.40 us/run - 310 kB/run - 22.10 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
GELU_QUICK(type=f32,ne=[128,10,10,10]): 8192 runs - 11.26 us/run - 1000 kB/run - 84.71 GB/s
GELU_QUICK(type=f32,ne=[7,13,19,23]): 8192 runs - 11.07 us/run - 310 kB/run - 26.77 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
TANH(type=f32,ne=[128,10,10,10]): 8192 runs - 1118.90 us/run - 1000 kB/run - 0.85 GB/s
TANH(type=f32,ne=[7,13,19,23]): 8192 runs - 391.71 us/run - 310 kB/run - 0.76 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
TANH(type=f32,ne=[128,10,10,10]): 8192 runs - 7.10 us/run - 1000 kB/run - 134.32 GB/s
TANH(type=f32,ne=[7,13,19,23]): 8192 runs - 7.16 us/run - 310 kB/run - 41.39 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
RELU(type=f32,ne=[128,10,10,10]): 8192 runs - 14.06 us/run - 1000 kB/run - 67.82 GB/s
RELU(type=f32,ne=[7,13,19,23]): 8192 runs - 185.79 us/run - 310 kB/run - 1.59 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
RELU(type=f32,ne=[128,10,10,10]): 8192 runs - 7.01 us/run - 1000 kB/run - 136.13 GB/s
RELU(type=f32,ne=[7,13,19,23]): 8192 runs - 7.57 us/run - 310 kB/run - 39.15 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
HARDSIGMOID(type=f32,ne=[128,10,10,10]): 8192 runs - 461.39 us/run - 1000 kB/run - 2.07 GB/s
HARDSIGMOID(type=f32,ne=[7,13,19,23]): 8192 runs - 145.02 us/run - 310 kB/run - 2.04 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
HARDSIGMOID(type=f32,ne=[128,10,10,10]): 8192 runs - 7.44 us/run - 1000 kB/run - 128.11 GB/s
HARDSIGMOID(type=f32,ne=[7,13,19,23]): 8192 runs - 7.38 us/run - 310 kB/run - 40.16 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
HARDSWISH(type=f32,ne=[128,10,10,10]): 8192 runs - 463.49 us/run - 1000 kB/run - 2.06 GB/s
HARDSWISH(type=f32,ne=[7,13,19,23]): 8192 runs - 146.70 us/run - 310 kB/run - 2.02 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
HARDSWISH(type=f32,ne=[128,10,10,10]): 8192 runs - 7.09 us/run - 1000 kB/run - 134.48 GB/s
HARDSWISH(type=f32,ne=[7,13,19,23]): 8192 runs - 7.12 us/run - 310 kB/run - 41.63 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
NORM(type=f32,ne=[64,10,10,10],eps=0.000001): 8192 runs - 53.92 us/run - 500 kB/run - 8.84 GB/s
NORM(type=f32,ne=[64,10,10,10],eps=0.000010): 8192 runs - 53.88 us/run - 500 kB/run - 8.85 GB/s
NORM(type=f32,ne=[64,10,10,10],eps=0.001000): 8192 runs - 53.85 us/run - 500 kB/run - 8.86 GB/s
NORM(type=f32,ne=[64,10,10,10],eps=0.100000): 8192 runs - 54.25 us/run - 500 kB/run - 8.79 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
NORM(type=f32,ne=[64,10,10,10],eps=0.000001): 8192 runs - 20.17 us/run - 500 kB/run - 23.64 GB/s
NORM(type=f32,ne=[64,10,10,10],eps=0.000010): 8192 runs - 19.72 us/run - 500 kB/run - 24.17 GB/s
NORM(type=f32,ne=[64,10,10,10],eps=0.001000): 8192 runs - 19.74 us/run - 500 kB/run - 24.16 GB/s
NORM(type=f32,ne=[64,10,10,10],eps=0.100000): 8192 runs - 19.68 us/run - 500 kB/run - 24.23 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
GROUP_NORM(type=f32,ne=[64,64,320,1],num_groups=32): 820 runs - 621.33 us/run - 10240 kB/run - 15.72 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
GROUP_NORM(type=f32,ne=[64,64,320,1],num_groups=32): 3277 runs - 24.46 us/run - 10240 kB/run - 399.22 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
UPSCALE(type=f32,ne=[512,512,3,1],scale_factor=2): 547 runs - 3120.34 us/run - 15360 kB/run - 4.69 GB/s
Backend CPU: OK
perf test 在NPU上执行异常
The error from device(chipId:0, dieId:0), serial number is 472, there is an aivec error exception, core id is 20, error code = 0x800000, dump info: pc start: 0x1240c005e5d8, current: 0x1240c005f43c, vec error info: 0xea1159162f, mte error info: 0x7d060000f8, ifu error info: 0x2d93518c01f80, ccu error info: 0x8ce05b7e7b4f1703, cube error info: 0, biu error info: 0, aic error mask: 0x6500020bd000288, para base: 0x1241000ed000.[FUNC:ProcessStarsCoreErrorInfo][FILE:device_error_proc.cc][LINE:1169]
Backend 1/1 (CPU)
Backend name: CPU
PAD(type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1): 4089 runs - 101.08 us/run - 2052 kB/run - 19.36 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
PAD(type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1): 8192 runs - 23.06 us/run - 2052 kB/run - 84.86 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
LEAKY_RELU(type=f32,ne_a=[10,10,10,10],negative_slope=0.100000): 8192 runs - 8.35 us/run - 78 kB/run - 8.92 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
LEAKY_RELU(type=f32,ne_a=[10,10,10,10],negative_slope=0.100000): 8192 runs - 7.60 us/run - 78 kB/run - 9.80 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
SCALE(type=f32,ne=[10,10,10,10],scale=2.000000): 8192 runs - 9.93 us/run - 78 kB/run - 7.50 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
SCALE(type=f32,ne=[10,10,10,10],scale=2.000000): 8192 runs - 7.32 us/run - 78 kB/run - 10.18 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
CLAMP(type=f32,ne=[10,10,10,10],min=-0.500000,max=0.500000): 8192 runs - 10.59 us/run - 78 kB/run - 7.03 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
CLAMP(type=f32,ne=[10,10,10,10],min=-0.500000,max=0.500000): 8192 runs - 7.38 us/run - 78 kB/run - 10.09 GB/s
Backend CANN0: OK
Backend 1/1 (CPU)
Backend name: CPU
ARGSORT(type=f32,ne=[8,1,1,1],order=0): 8192 runs - 1.73 us/run - 0 kB/run - 0.03 GB/s
ARGSORT(type=f32,ne=[16,10,10,10],order=0): 8192 runs - 330.08 us/run - 125 kB/run - 0.36 GB/s
ARGSORT(type=f32,ne=[60,10,10,10],order=0): 8192 runs - 1717.51 us/run - 468 kB/run - 0.26 GB/s
ARGSORT(type=f32,ne=[8,1,1,1],order=1): 8192 runs - 1.21 us/run - 0 kB/run - 0.05 GB/s
ARGSORT(type=f32,ne=[16,10,10,10],order=1): 8192 runs - 334.36 us/run - 125 kB/run - 0.36 GB/s
ARGSORT(type=f32,ne=[60,10,10,10],order=1): 8192 runs - 1671.17 us/run - 468 kB/run - 0.27 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
ARGSORT(type=f32,ne=[8,1,1,1],order=0): 8192 runs - 31.20 us/run - 0 kB/run - 0.00 GB/s
ARGSORT(type=f32,ne=[16,10,10,10],order=0): 8192 runs - 37.65 us/run - 125 kB/run - 3.17 GB/s
ARGSORT(type=f32,ne=[60,10,10,10],order=0): 8192 runs - 32.66 us/run - 468 kB/run - 13.69 GB/s
ARGSORT(type=f32,ne=[8,1,1,1],order=1): 8192 runs - 30.82 us/run - 0 kB/run - 0.00 GB/s
ARGSORT(type=f32,ne=[16,10,10,10],order=1): 8192 runs - 37.14 us/run - 125 kB/run - 3.21 GB/s
ARGSORT(type=f32,ne=[60,10,10,10],order=1): 8192 runs - 33.49 us/run - 468 kB/run - 13.35 GB/s
Backend CANN0: OK
执行慢的原因:算子本身执行慢。
Backend 1/1 (CPU)
Backend name: CPU
CONCAT(type=f32,ne=[10,10,10,10],b_ne2=10): 8192 runs - 4.32 us/run - 156 kB/run - 34.48 GB/s
CONCAT(type=i32,ne=[10,10,10,10],b_ne2=10): 8192 runs - 4.39 us/run - 156 kB/run - 33.91 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
CONCAT(type=f32,ne=[10,10,10,10],b_ne2=10): 8192 runs - 8.53 us/run - 156 kB/run - 17.47 GB/s
CONCAT(type=i32,ne=[10,10,10,10],b_ne2=10): 8192 runs - 8.22 us/run - 156 kB/run - 18.12 GB/s
Backend CANN0: OK
执行慢的原因:算子本身执行慢。
Backend 1/1 (CPU)
Backend name: CPU
REPEAT(type=f32,ne=[10,10,10,10],nr=[1,1,1,1]): 8192 runs - 5.38 us/run - 78 kB/run - 13.85 GB/s
REPEAT(type=f32,ne=[10,10,10,10],nr=[2,1,1,1]): 8192 runs - 8.08 us/run - 156 kB/run - 18.45 GB/s
REPEAT(type=f32,ne=[10,10,10,10],nr=[1,2,1,1]): 8192 runs - 10.64 us/run - 156 kB/run - 14.00 GB/s
REPEAT(type=f32,ne=[10,10,10,10],nr=[1,1,2,1]): 8192 runs - 10.85 us/run - 156 kB/run - 13.73 GB/s
REPEAT(type=f32,ne=[10,10,10,10],nr=[1,1,1,2]): 8192 runs - 10.83 us/run - 156 kB/run - 13.76 GB/s
REPEAT(type=i32,ne=[10,10,10,10],nr=[2,1,1,1]): 8192 runs - 8.12 us/run - 156 kB/run - 18.35 GB/s
REPEAT(type=i16,ne=[10,10,10,10],nr=[1,1,1,2]): 8192 runs - 21.40 us/run - 78 kB/run - 3.48 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
REPEAT(type=f32,ne=[10,10,10,10],nr=[1,1,1,1]): 8192 runs - 62.14 us/run - 78 kB/run - 1.20 GB/s
REPEAT(type=f32,ne=[10,10,10,10],nr=[2,1,1,1]): 8192 runs - 102.18 us/run - 156 kB/run - 1.46 GB/s
REPEAT(type=f32,ne=[10,10,10,10],nr=[1,2,1,1]): 8192 runs - 98.47 us/run - 156 kB/run - 1.51 GB/s
REPEAT(type=f32,ne=[10,10,10,10],nr=[1,1,2,1]): 8192 runs - 101.23 us/run - 156 kB/run - 1.47 GB/s
REPEAT(type=f32,ne=[10,10,10,10],nr=[1,1,1,2]): 8192 runs - 101.48 us/run - 156 kB/run - 1.47 GB/s
REPEAT(type=i32,ne=[10,10,10,10],nr=[2,1,1,1]): 8192 runs - 94.11 us/run - 156 kB/run - 1.58 GB/s
REPEAT(type=i16,ne=[10,10,10,10],nr=[1,1,1,2]): 8192 runs - 86.27 us/run - 78 kB/run - 0.86 GB/s
Backend CANN0: OK
执行慢的原因:算子本身执行慢。
Backend 1/1 (CPU)
Backend name: CPU
ARANGE(type=f32,start=0.000000,stop=10.000000,step=1.000000): 8192 runs - 1.65 us/run - 0 kB/run - 0.02 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
ARANGE(type=f32,start=0.000000,stop=10.000000,step=1.000000): 8192 runs - 7.05 us/run - 0 kB/run - 0.01 GB/s
Backend CANN0: OK
执行慢的原因:调用了多个算子组合,需要融合算子
Backend 1/1 (CPU)
Backend name: CPU
TIMESTEP_EMBEDDING(type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000): 8192 runs - 9.87 us/run - 2 kB/run - 0.24 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
TIMESTEP_EMBEDDING(type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000): 8192 runs - 81.09 us/run - 2 kB/run - 0.03 GB/s
Backend CANN0: OK
执行慢的原因:gamma是必传参数,需要构造全1矩阵,rstd输出是必传参数,需要申请设备上内存。
Backend 1/1 (CPU)
Backend name: CPU
RMS_NORM(type=f32,ne=[64,10,10,10],eps=0.000001): 8192 runs - 31.75 us/run - 500 kB/run - 15.02 GB/s
RMS_NORM(type=f32,ne=[64,10,10,10],eps=0.000010): 8192 runs - 31.93 us/run - 500 kB/run - 14.93 GB/s
RMS_NORM(type=f32,ne=[64,10,10,10],eps=0.001000): 8192 runs - 32.23 us/run - 500 kB/run - 14.80 GB/s
RMS_NORM(type=f32,ne=[64,10,10,10],eps=0.100000): 8192 runs - 31.99 us/run - 500 kB/run - 14.90 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
RMS_NORM(type=f32,ne=[64,10,10,10],eps=0.000001): 8192 runs - 78.59 us/run - 500 kB/run - 6.07 GB/s
RMS_NORM(type=f32,ne=[64,10,10,10],eps=0.000010): 8192 runs - 76.18 us/run - 500 kB/run - 6.26 GB/s
RMS_NORM(type=f32,ne=[64,10,10,10],eps=0.001000): 8192 runs - 76.55 us/run - 500 kB/run - 6.23 GB/s
RMS_NORM(type=f32,ne=[64,10,10,10],eps=0.100000): 8192 runs - 75.31 us/run - 500 kB/run - 6.33 GB/s
Backend CANN0: OK
执行慢的原因:算子本身执行慢。
Backend 1/1 (CPU)
Backend name: CPU
SQR(type=f32,ne=[10,10,10,10]): 8192 runs - 2.94 us/run - 78 kB/run - 25.37 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
SQR(type=f32,ne=[10,10,10,10]): 8192 runs - 7.91 us/run - 117 kB/run - 14.13 GB/s
Backend CANN0: OK
执行慢的原因:算子本身执行慢。
Backend 1/1 (CPU)
Backend name: CPU
CONT(type=f32,ne=[10,10,10,1]): 8191 runs - 2.44 us/run - 7 kB/run - 3.05 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
CONT(type=f32,ne=[10,10,10,1]): 8191 runs - 12.55 us/run - 7 kB/run - 0.59 GB/s
Backend CANN0: OK
执行慢的原因:调用了多个算子组合,需要融合算子
Backend 1/1 (CPU)
Backend name: CPU
DIAG_MASK_INF(type=f32,ne=[10,10,1,1],n_past=5): 8192 runs - 1.59 us/run - 0 kB/run - 0.47 GB/s
DIAG_MASK_INF(type=f32,ne=[10,10,10,1],n_past=5): 8192 runs - 2.07 us/run - 7 kB/run - 3.60 GB/s
DIAG_MASK_INF(type=f32,ne=[10,10,10,10],n_past=5): 8192 runs - 5.20 us/run - 78 kB/run - 14.32 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
DIAG_MASK_INF(type=f32,ne=[10,10,1,1],n_past=5): 8192 runs - 59.52 us/run - 0 kB/run - 0.01 GB/s
DIAG_MASK_INF(type=f32,ne=[10,10,10,1],n_past=5): 8192 runs - 61.12 us/run - 7 kB/run - 0.12 GB/s
DIAG_MASK_INF(type=f32,ne=[10,10,10,10],n_past=5): 8192 runs - 91.92 us/run - 78 kB/run - 0.81 GB/s
Backend CANN0: OK
执行慢的原因:调用了多个算子组合,需要融合算子
Backend 1/1 (CPU)
Backend name: CPU
IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,3,1],ne_kernel=[3,3,3,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1): 8192 runs - 5.10 us/run - 11 kB/run - 2.20 GB/s
IM2COL(type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,3,1],ne_kernel=[3,3,3,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1): 8192 runs - 4.54 us/run - 6 kB/run - 1.36 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,3,1],ne_kernel=[3,3,3,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1): 8192 runs - 23.89 us/run - 11 kB/run - 0.47 GB/s
IM2COL(type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,3,1],ne_kernel=[3,3,3,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1): 8192 runs - 35.48 us/run - 6 kB/run - 0.17 GB/s
Backend CANN0: OK
执行慢的原因:max pool 调用了多个算子组合,需要融合算子,算子本身也慢
Backend 1/1 (CPU) Backend name: CPU
POOL_2D(pool_type=avg,type_input=f32,ne_input=[10,10,3,1],k0=1,k1=1,s0=1,s1=1,p0=0,p1=0): 8192 runs - 1.12 us/run - 2 kB/run - 1.99 GB/s
POOL_2D(pool_type=avg,type_input=f32,ne_input=[10,10,3,1],k0=1,k1=1,s0=1,s1=1,p0=0,p1=1): 8192 runs - 1.31 us/run - 2 kB/run - 1.88 GB/s
POOL_2D(pool_type=avg,type_input=f32,ne_input=[10,10,3,1],k0=1,k1=1,s0=1,s1=1,p0=1,p1=0): 8192 runs - 1.30 us/run - 2 kB/run - 1.88 GB/s
POOL_2D(pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=1,k1=1,s0=1,s1=1,p0=0,p1=0): 8192 runs - 1.33 us/run - 2 kB/run - 1.69 GB/s
POOL_2D(pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=1,k1=1,s0=1,s1=1,p0=0,p1=1): 8192 runs - 1.49 us/run - 2 kB/run - 1.65 GB/s
POOL_2D(pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=1,k1=1,s0=1,s1=1,p0=1,p1=0): 8192 runs - 1.49 us/run - 2 kB/run - 1.65 GB/s
... ...
Backend 2/2 (CANN0) Backend name: CANN0
POOL_2D(pool_type=avg,type_input=f32,ne_input=[10,10,3,1],k0=1,k1=1,s0=1,s1=1,p0=0,p1=0): 8192 runs - 77.98 us/run - 2 kB/run - 0.03 GB/s
POOL_2D(pool_type=avg,type_input=f32,ne_input=[10,10,3,1],k0=1,k1=1,s0=1,s1=1,p0=0,p1=1): 8192 runs - 76.85 us/run - 2 kB/run - 0.03 GB/s
POOL_2D(pool_type=avg,type_input=f32,ne_input=[10,10,3,1],k0=1,k1=1,s0=1,s1=1,p0=1,p1=0): 8192 runs - 78.23 us/run - 2 kB/run - 0.03 GB/s
POOL_2D(pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=1,k1=1,s0=1,s1=1,p0=0,p1=0): 8192 runs - 37.53 us/run - 2 kB/run - 0.06 GB/s
POOL_2D(pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=1,k1=1,s0=1,s1=1,p0=0,p1=1): 8192 runs - 41.14 us/run - 2 kB/run - 0.06 GB/s
POOL_2D(pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=1,k1=1,s0=1,s1=1,p0=1,p1=0): 8192 runs - 43.39 us/run - 2 kB/run - 0.06 GB/s
... ...
执行慢的原因:算子本身执行慢。
Backend 1/1 (CPU)
Backend name: CPU
SUM_ROWS(type=f32,ne=[10,10,10,10]): 8192 runs - 4.85 us/run - 42 kB/run - 8.44 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
SUM_ROWS(type=f32,ne=[10,10,10,10]): 8192 runs - 11.48 us/run - 42 kB/run - 3.57 GB/s
Backend CANN0: OK
执行慢的原因:算子本身执行慢。
Backend 1/1 (CPU)
Backend name: CPU
DUP(type=f32,ne=[10,10,10,1]): 8192 runs - 1.71 us/run - 7 kB/run - 4.35 GB/s
DUP(type=f16,ne=[10,10,10,1]): 8192 runs - 1.63 us/run - 3 kB/run - 2.29 GB/s
DUP(type=i32,ne=[10,10,10,1]): 8192 runs - 1.67 us/run - 7 kB/run - 4.46 GB/s
DUP(type=i16,ne=[10,10,10,1]): 8192 runs - 1.63 us/run - 3 kB/run - 2.29 GB/s
DUP(type=i16,ne=[10,8,3,1],permute=[0,2,1,3]): 8191 runs - 1.89 us/run - 0 kB/run - 0.47 GB/s
DUP(type=i16,ne=[10,8,3,1],permute=[1,2,0,3]): 8191 runs - 2.09 us/run - 0 kB/run - 0.43 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
DUP(type=f32,ne=[10,10,10,1]): 8192 runs - 10.04 us/run - 7 kB/run - 0.74 GB/s
DUP(type=f16,ne=[10,10,10,1]): 8192 runs - 10.02 us/run - 3 kB/run - 0.37 GB/s
DUP(type=i32,ne=[10,10,10,1]): 8192 runs - 10.07 us/run - 7 kB/run - 0.74 GB/s
DUP(type=i16,ne=[10,10,10,1]): 8192 runs - 10.04 us/run - 3 kB/run - 0.37 GB/s
DUP(type=i16,ne=[10,8,3,1],permute=[0,2,1,3]): 8191 runs - 26.69 us/run - 0 kB/run - 0.03 GB/s
DUP(type=i16,ne=[10,8,3,1],permute=[1,2,0,3]): 8191 runs - 26.70 us/run - 0 kB/run - 0.03 GB/s
Backend CANN0: OK
执行慢的原因:max pool 调用了多个算子组合,需要融合算子
Backend 1/1 (CPU)
Backend name: CPU
ALIBI(type=f32,ne_a=[16,2,10,1],n_past=0,n_head=10,bias_max=-0.500000): 8192 runs - 0.38 us/run - 2 kB/run - 6.29 GB/s
ALIBI(type=f32,ne_a=[16,2,32,1],n_past=0,n_head=32,bias_max=-0.500000): 8192 runs - 1.08 us/run - 8 kB/run - 7.08 GB/s
ALIBI(type=f32,ne_a=[128,4,10,1],n_past=0,n_head=10,bias_max=-0.500000): 8192 runs - 4.29 us/run - 40 kB/run - 8.90 GB/s
ALIBI(type=f32,ne_a=[128,4,32,1],n_past=0,n_head=32,bias_max=-0.500000): 8192 runs - 13.77 us/run - 128 kB/run - 8.87 GB/s
ALIBI(type=f32,ne_a=[16,2,10,1],n_past=0,n_head=10,bias_max=0.500000): 8192 runs - 0.38 us/run - 2 kB/run - 6.30 GB/s
ALIBI(type=f32,ne_a=[16,2,32,1],n_past=0,n_head=32,bias_max=0.500000): 8192 runs - 1.07 us/run - 8 kB/run - 7.10 GB/s
ALIBI(type=f32,ne_a=[128,4,10,1],n_past=0,n_head=10,bias_max=0.500000): 8192 runs - 4.29 us/run - 40 kB/run - 8.88 GB/s
ALIBI(type=f32,ne_a=[128,4,32,1],n_past=0,n_head=32,bias_max=0.500000): 8192 runs - 13.73 us/run - 128 kB/run - 8.89 GB/s
Backend CPU: OK
Backend 2/2 (CANN0)
Backend name: CANN0
ALIBI(type=f32,ne_a=[16,2,10,1],n_past=0,n_head=10,bias_max=-0.500000): 8192 runs - 94.75 us/run - 2 kB/run - 0.03 GB/s
ALIBI(type=f32,ne_a=[16,2,32,1],n_past=0,n_head=32,bias_max=-0.500000): 8192 runs - 78.96 us/run - 8 kB/run - 0.10 GB/s
ALIBI(type=f32,ne_a=[128,4,10,1],n_past=0,n_head=10,bias_max=-0.500000): 8192 runs - 99.79 us/run - 40 kB/run - 0.38 GB/s
ALIBI(type=f32,ne_a=[128,4,32,1],n_past=0,n_head=32,bias_max=-0.500000): 8192 runs - 81.76 us/run - 128 kB/run - 1.49 GB/s
ALIBI(type=f32,ne_a=[16,2,10,1],n_past=0,n_head=10,bias_max=0.500000): 8192 runs - 98.16 us/run - 2 kB/run - 0.02 GB/s
ALIBI(type=f32,ne_a=[16,2,32,1],n_past=0,n_head=32,bias_max=0.500000): 8192 runs - 80.33 us/run - 8 kB/run - 0.09 GB/s
ALIBI(type=f32,ne_a=[128,4,10,1],n_past=0,n_head=10,bias_max=0.500000): 8192 runs - 100.40 us/run - 40 kB/run - 0.38 GB/s
ALIBI(type=f32,ne_a=[128,4,32,1],n_past=0,n_head=32,bias_max=0.500000): 8192 runs - 81.56 us/run - 128 kB/run - 1.50 GB/s
Backend CANN0: OK
[toc]
量化分组格式
// Q4_0 block layout: QK4_0 source values are quantized together as one group.
#define QK4_0 32 // 32 f32 values per group
typedef struct {
ggml_half d; // scale factor shared by the whole group
uint8_t qs[QK4_0 / 2]; // quantized values, 4 bits each (two per byte)
} block_q4_0;
量化算法描述
/*
 * Reference Q4_0 quantizer.
 *
 * Splits the k floats in x into groups of QK4_0 values and writes one
 * block_q4_0 per group into y. k must be a multiple of QK4_0.
 *
 * Per group:
 *   1. Find the element with the largest absolute value (sign is kept).
 *   2. The shared scale d is that element divided by -8; it is stored as fp16.
 *   3. Every element is multiplied by 1/d, shifted by +8.5, truncated and
 *      capped at 15, then packed two per byte: element j fills the low
 *      nibble of qs[j] and element j + QK4_0/2 fills the high nibble.
 */
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
    static const int qk = QK4_0;

    assert(k % qk == 0);

    const int nb   = k / qk;
    const int half = qk / 2;

    for (int ib = 0; ib < nb; ib++) {
        const float *src = x + ib*qk;

        /* Step 1: signed value of the element with the largest magnitude. */
        float best_abs = 0.0f;
        float best_val = 0.0f;
        for (int e = 0; e < qk; e++) {
            const float cur = src[e];
            const float mag = fabsf(cur);
            if (mag > best_abs) {
                best_abs = mag;
                best_val = cur;
            }
        }

        /* Step 2: shared scale and its reciprocal (0 for an all-zero group). */
        const float d = best_val / -8;
        float id;
        if (d) {
            id = 1.0f/d;
        } else {
            id = 0.0f;
        }

        y[ib].d = GGML_FP32_TO_FP16(d);

        /* Step 3: scale, offset, clamp and pack two values per byte. */
        for (int p = 0; p < half; ++p) {
            const float lo_scaled = src[p]*id;
            const float hi_scaled = src[half + p]*id;

            const uint8_t lo_q = MIN(15, (int8_t)(lo_scaled + 8.5f));
            const uint8_t hi_q = MIN(15, (int8_t)(hi_scaled + 8.5f));

            y[ib].qs[p] = (uint8_t)(lo_q | (hi_q << 4));
        }
    }
}
反量化算法描述
/*
 * Q4_0 dequantizer.
 *
 * Expands k / QK4_0 blocks from x into k floats in y. k must be a multiple
 * of QK4_0.
 *
 * Per block, each byte of qs holds two 4-bit values. Both have 8 subtracted
 * and are multiplied by the block's scale d. The low nibble of qs[j] becomes
 * output element j; the high nibble becomes output element j + QK4_0/2.
 */
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) {
    static const int qk = QK4_0;

    assert(k % qk == 0);

    const int nb   = k / qk;
    const int half = qk / 2;

    for (int ib = 0; ib < nb; ib++) {
        const float d = GGML_FP16_TO_FP32(x[ib].d);
        float *dst = y + ib*qk;

        for (int p = 0; p < half; ++p) {
            const uint8_t packed = x[ib].qs[p];

            /* Undo the +8 offset applied during quantization. */
            const int lo = (packed & 0x0F) - 8;
            const int hi = (packed >> 4) - 8;

            dst[p]        = lo*d;
            dst[p + half] = hi*d;
        }
    }
}
量化分组格式
// Q4_1 block layout: a shared scale plus a shared offset per group of QK4_1 values.
#define QK4_1 32
typedef struct {
    union {
        struct {
            ggml_half d; // shared scale factor
            ggml_half m; // shared offset
        } GGML_COMMON_AGGR;
        ggml_half2 dm; // d and m accessed as a single pair
    };
    uint8_t qs[QK4_1 / 2]; // quantized values (QK4_1 / 2 bytes for QK4_1 values)
} block_q4_1;
量化分组格式
// Q5_0 block layout: a shared scale, the packed quants, and a separate
// array holding the fifth bit of each quantized value.
#define QK5_0 32
typedef struct {
ggml_half d; // shared scale factor
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2];
} block_q5_0;
量化算法和反量化算法与Q4_0基本一致,额外记录了每个数据的第五个bit位的值,在反量化的时候恢复。
量化分组格式
// Q5_1 block layout: delta and min (also accessible as the pair dm),
// the fifth bit of each quant, and the packed quants.
#define QK5_1 32
typedef struct {
union {
struct {
ggml_half d; // delta
ggml_half m; // min
} GGML_COMMON_AGGR;
ggml_half2 dm;
};
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
// Q8_0 block layout: a shared delta plus QK8_0 quantized values.
#define QK8_0 32
typedef struct {
ggml_half d; // delta
int8_t qs[QK8_0]; // one signed 8-bit quant per value
} block_q8_0;
量化算法和反量化算法与Q4_0基本一致,区别是q8_0使用8位记录一个数据
输入2个矩阵(src0:量化后的tensor, src1:f32或f16),量化后的数据直接相乘,结果是float类型。
支持多个MoE专家一起计算,计算方法与GGML_OP_MUL_MAT类似。
该算子实际上做了Embedding操作,但是允许输入的tensor是量化的tensor。输出F32的tensor。
Tensor拷贝,允许从F16或者F32的tensor拷贝到一个量化的tensor中。
aclnn只能适配部分场景的算子:
- UPSCALE:需要支持非整数倍scale
- CONCAT:需要支持维度不同的tensor做concat
- DUP:需要支持量化,非连续,不同shape,不同数据类型之间做拷贝
- ROPE:不支持n_dims < ne0的场景,不支持fs,ef等参数为非默认值的场景,且算子复杂,应考虑ascendc方式实现
- MUL_MAT:量化相乘不支持batch
算法有差异的算子: ARGSORT:排序使用了不稳定算法(千问在用)
不支持的算子:
- flash_attention:能提速
- mul_mat_id:MoE需要使用
- sigmoid:新增算子,实现简单
- conv_transpose_1d:新增算子
TOP:
MUL_MAT ROPE SOFTMAX(alibi)
excellent work! hope to get access and try it myself soon!