intel / llvm

Intel staging area for llvm.org contribution. Home for Intel LLVM-based projects.

[SYCL][Performance] host_task's performance isn't good on a small workload #11593

Open wenju-he opened 1 year ago

wenju-he commented 1 year ago

**Describe the performance issue**
For small workloads, host_task performance is not as good as single_task on the native-cpu target because of the overhead of SYCL runtime API calls. Since host_task is managed and scheduled by the SYCL runtime, the issue affects all devices.

**To Reproduce**

  1. test.cpp
    
    #include <sycl/sycl.hpp>
    #include <chrono>
    #include <iostream>
    #include <vector>

    sycl::event foo_single_task(sycl::queue &q, sycl::buffer<float> &r, size_t n) {
      return q.submit([&](sycl::handler &cgh) {
        sycl::accessor acc{r, cgh, sycl::write_only, sycl::no_init};
        cgh.single_task([=]() {
          for (size_t i = 0; i < n; i++) {
            acc[i] = i;
          }
        });
      });
    }

    sycl::event foo_host_task(sycl::queue &q, sycl::buffer<float> &r, size_t n) {
      return q.submit([&](sycl::handler &cgh) {
        sycl::accessor acc{r, cgh, sycl::write_only_host_task, sycl::no_init};
        cgh.host_task([=]() {
          for (size_t i = 0; i < n; i++) {
            acc[i] = i;
          }
        });
      });
    }

    int main() {
      sycl::queue q;
      std::cout << "Running on "
                << q.get_device().get_info<sycl::info::device::name>() << std::endl;

      int n_run = 10000;
      size_t n = 1000000;
      sycl::event event;

      std::cout << "n = " << n << std::endl;
      {
        auto start = std::chrono::steady_clock::now();
        sycl::buffer<float> r(n);
        for (int i = 0; i < n_run; i++) {
          event = foo_single_task(q, r, n);
          event.wait();
        }
        auto end = std::chrono::steady_clock::now();
        std::cout << "single_task "
                  << (std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() * 1e-06)
                  << "ms" << std::endl;
      }
      {
        auto start = std::chrono::steady_clock::now();
        sycl::buffer<float> r(n);
        for (int i = 0; i < n_run; i++) {
          event = foo_host_task(q, r, n);
          event.wait();
        }
        auto end = std::chrono::steady_clock::now();
        std::cout << "host_task "
                  << (std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() * 1e-06)
                  << "ms" << std::endl;
      }

      return 0;
    }


2. Compile command

clang++ -fsycl -fsycl-targets=native_cpu -O2 test.cpp

3. Launch the program

ONEAPI_DEVICE_SELECTOR=native_cpu:cpu ./a.out

4. Output shows host_task is ~7 times slower than single_task

    Running on SYCL Native CPU
    n = 1000000
    single_task 6148.84ms
    host_task 42818.1ms
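(42818.1 ms / 6148.84 ms ≈ 6.96, hence the ~7x figure.)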


**Environment (please complete the following information):**

- OS: RHEL9.0
- Target device and vendor: native_cpu
- DPC++ version: clang version 18.0.0 (https://github.com/intel/llvm.git d17a3f16b84f75ca1ddbe512f2ad68b00db321f6)

**Additional context**
The assembly of the host_task function shows calls to SYCL runtime APIs inside the loop:

    0000000000405f70 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data>:
      405f70: 41 57                 push %r15
      405f72: 41 56                 push %r14
      405f74: 41 54                 push %r12
      405f76: 53                    push %rbx
      405f77: 50                    push %rax
      405f78: 4c 8b 37              mov (%rdi),%r14
      405f7b: 49 83 3e 00           cmpq $0x0,(%r14)
      405f7f: 74 72                 je 405ff3 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x83>
      405f81: 49 8d 5e 08           lea 0x8(%r14),%rbx
      405f85: 45 31 ff              xor %r15d,%r15d
      405f88: eb 47                 jmp 405fd1 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x61>
      405f8a: 66 0f 1f 44 00 00     nopw 0x0(%rax,%rax,1)
      405f90: 0f 57 c0              xorps %xmm0,%xmm0
      405f93: f3 49 0f 2a c7        cvtsi2ss %r15,%xmm0
      405f98: f3 0f 11 44 24 04     movss %xmm0,0x4(%rsp)
      405f9e: 48 89 df              mov %rbx,%rdi
      405fa1: e8 0a d4 ff ff        callq 4033b0 <_ZNK4sycl3_V16detail16AccessorBaseHost14getMemoryRangeEv@plt>
      405fa6: 48 89 df              mov %rbx,%rdi
      405fa9: e8 82 d3 ff ff        callq 403330 <_ZNK4sycl3_V16detail16AccessorBaseHost9getOffsetEv@plt>
      405fae: 4c 8b 20              mov (%rax),%r12
      405fb1: 48 89 df              mov %rbx,%rdi
      405fb4: e8 37 d1 ff ff        callq 4030f0 <_ZNK4sycl3_V16detail16AccessorBaseHost6getPtrEv@plt>
      405fb9: 4a 8d 04 a0           lea (%rax,%r12,4),%rax
      405fbd: f3 0f 10 44 24 04     movss 0x4(%rsp),%xmm0
      405fc3: f3 42 0f 11 04 b8     movss %xmm0,(%rax,%r15,4)
      405fc9: 49 ff c7              inc %r15
      405fcc: 4d 3b 3e              cmp (%r14),%r15
      405fcf: 73 22                 jae 405ff3 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x83>
      405fd1: 4d 85 ff              test %r15,%r15
      405fd4: 79 ba                 jns 405f90 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x20>
      405fd6: 4c 89 f8              mov %r15,%rax
      405fd9: 48 d1 e8              shr %rax
      405fdc: 44 89 f9              mov %r15d,%ecx
      405fdf: 83 e1 01              and $0x1,%ecx
      405fe2: 48 09 c1              or %rax,%rcx
      405fe5: 0f 57 c0              xorps %xmm0,%xmm0
      405fe8: f3 48 0f 2a c1        cvtsi2ss %rcx,%xmm0
      405fed: f3 0f 58 c0           addss %xmm0,%xmm0
      405ff1: eb a5                 jmp 405f98 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x28>
      405ff3: 48 83 c4 08           add $0x8,%rsp
      405ff7: 5b                    pop %rbx
      405ff8: 41 5c                 pop %r12
      405ffa: 41 5e                 pop %r14
      405ffc: 41 5f                 pop %r15
      405ffe: c3                    retq
      405fff: 90                    nop

On the other hand, the single_task kernel on the native-cpu device contains only a simple loop:

    0000000000408850 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU>:
      408850: 48 8b 02              mov (%rdx),%rax
      408853: 48 8d 04 86           lea (%rsi,%rax,4),%rax
      408857: 31 c9                 xor %ecx,%ecx
      408859: eb 18                 jmp 408873 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU+0x23>
      40885b: 0f 1f 44 00 00        nopl 0x0(%rax,%rax,1)
      408860: 0f 57 c0              xorps %xmm0,%xmm0
      408863: f3 48 0f 2a c1        cvtsi2ss %rcx,%xmm0
      408868: f3 0f 11 00           movss %xmm0,(%rax)
      40886c: 48 ff c1              inc %rcx
      40886f: 48 83 c0 04           add $0x4,%rax
      408873: 48 39 f9              cmp %rdi,%rcx
      408876: 73 21                 jae 408899 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU+0x49>
      408878: 48 85 c9              test %rcx,%rcx
      40887b: 79 e3                 jns 408860 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU+0x10>
      40887d: 48 89 ca              mov %rcx,%rdx
      408880: 48 d1 ea              shr %rdx
      408883: 89 ce                 mov %ecx,%esi
      408885: 83 e6 01              and $0x1,%esi
      408888: 48 09 d6              or %rdx,%rsi
      40888b: 0f 57 c0              xorps %xmm0,%xmm0
      40888e: f3 48 0f 2a c6        cvtsi2ss %rsi,%xmm0
      408893: f3 0f 58 c0           addss %xmm0,%xmm0
      408897: eb cf                 jmp 408868 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU+0x18>
      408899: c3                    retq
      40889a: 66 0f 1f 44 00 00     nopw 0x0(%rax,%rax,1)
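The three AccessorBaseHost getters in the host_task loop are loop-invariant, but they are opaque calls through the PLT into the SYCL runtime library, so the compiler cannot see their bodies and must re-issue them on every iteration; each element therefore pays for three library calls plus a spill and reload of the converted value (the movss to and from 0x4(%rsp)), whereas the single_task kernel issues a single store per element.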


VTune result:
<img width="647" alt="Screenshot 2023-10-19 092853" src="https://github.com/intel/llvm/assets/3746105/cad44d53-4719-45e5-9817-b66ceb1bf0bd">
<img width="1693" alt="Screenshot 2023-10-19 092742" src="https://github.com/intel/llvm/assets/3746105/00c9f3db-ccd2-4f54-bc29-733247ad10e3">
uwedolinsky commented 1 year ago

This example would likely benefit from defining more SYCL runtime functions in the SYCL headers as opposed to in the runtime binary.

If the compiler could inline the following three called functions (and their callees)

sycl::_V1::detail::AccessorBaseHost::getMemoryRange() const;
sycl::_V1::detail::AccessorBaseHost::getPtr() const;
sycl::_V1::detail::AccessorBaseHost::getOffset() const;

the generated code would look more like the single_task kernel.
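In the meantime, a possible user-side mitigation (an untested sketch, not something proposed in this thread; the function name foo_host_task_hoisted is made up for illustration) is to resolve the accessor's base pointer once before the loop inside the host_task, so the per-iteration AccessorBaseHost calls disappear even without header-level inlining:

    #include <sycl/sycl.hpp>

    // Hypothetical variant of foo_host_task from the reproducer above: the
    // accessor's base address is looked up once, outside the loop, instead of
    // being re-derived through AccessorBaseHost calls on every iteration.
    sycl::event foo_host_task_hoisted(sycl::queue &q, sycl::buffer<float> &r,
                                      size_t n) {
      return q.submit([&](sycl::handler &cgh) {
        sycl::accessor acc{r, cgh, sycl::write_only_host_task, sycl::no_init};
        cgh.host_task([=]() {
          float *p = &acc[0]; // one accessor lookup, hoisted out of the loop
          for (size_t i = 0; i < n; i++)
            p[i] = static_cast<float>(i);
        });
      });
    }

With the pointer hoisted, the loop body reduces to a single store per element, the same shape as the single_task kernel above.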