Open wenju-he opened 1 year ago
This example would likely benefit from defining more SYCL runtime functions in the SYCL headers as opposed to in the runtime binary.
If the compiler could inline the following three called functions (and their callees)
sycl::_V1::detail::AccessorBaseHost::getMemoryRange() const;
sycl::_V1::detail::AccessorBaseHost::getPtr() const;
sycl::_V1::detail::AccessorBaseHost::getOffset() const;
the generated code would look more like the single_task
kernel .
Describe the performance issue For small workloads, its performance isn't as good as single_task on native-cpu target due to overhead of sycl runtime api calls. Since SYCL host_task is managed and scheduled by sycl runtime, the performance issue impacts all devices.
To Reproduce
sycl::event foo_single_task(sycl::queue &q, sycl::buffer &r, size_t n) {
return q.submit([&](sycl::handler &cgh) {
sycl::accessor acc{r, cgh, sycl::write_only, sycl::no_init};
cgh.single_task([=]() {
for (size_t i = 0; i < n; i++) {
acc[i] = i;
}
});
});
}
sycl::event foo_host_task(sycl::queue &q, sycl::buffer &r, size_t n) {
return q.submit([&](sycl::handler &cgh) {
sycl::accessor acc{r, cgh, sycl::write_only_host_task, sycl::no_init};
cgh.host_task([=]() {
for (size_t i = 0; i < n; i++) {
acc[i] = i;
}
});
});
}
int main() { sycl::queue q; std::cout << "Running on " << q.get_device().get_info() << std::endl;
int n_run = 10000; size_t n = 1000000; sycl::event event;
std::cout << "n = " << n << std::endl; { auto start = std::chrono::steady_clock::now(); sycl::buffer r(n);
for (int i = 0; i < n_run; i++) {
event = foo_single_task(q, r, n);
event.wait();
}
auto end = std::chrono::steady_clock::now();
std::cout << "single_task " << (std::chrono::duration_cast(end - start).count() 1e-06) << "ms" << std::endl;
}
{
auto start = std::chrono::steady_clock::now();
sycl::buffer r(n);
for (int i = 0; i < n_run; i++) {
event = foo_host_task(q, r, n);
event.wait();
}
auto end = std::chrono::steady_clock::now();
std::cout << "host_task " << (std::chrono::duration_cast(end - start).count() 1e-06) << "ms" << std::endl;
}
return 0; }
clang++ -fsycl -fsycl-targets=native_cpu -O2 test.cpp
ONEAPI_DEVICE_SELECTOR=native_cpu:cpu ./a.out
Running on SYCL Native CPU n = 1000000 single_task 6148.84ms host_task 42818.1ms
0000000000405f70 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data>: 405f70: 41 57 push %r15 405f72: 41 56 push %r14 405f74: 41 54 push %r12 405f76: 53 push %rbx 405f77: 50 push %rax 405f78: 4c 8b 37 mov (%rdi),%r14 405f7b: 49 83 3e 00 cmpq $0x0,(%r14) 405f7f: 74 72 je 405ff3 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x83> 405f81: 49 8d 5e 08 lea 0x8(%r14),%rbx 405f85: 45 31 ff xor %r15d,%r15d 405f88: eb 47 jmp 405fd1 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x61> 405f8a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) 405f90: 0f 57 c0 xorps %xmm0,%xmm0 405f93: f3 49 0f 2a c7 cvtsi2ss %r15,%xmm0 405f98: f3 0f 11 44 24 04 movss %xmm0,0x4(%rsp) 405f9e: 48 89 df mov %rbx,%rdi 405fa1: e8 0a d4 ff ff callq 4033b0 _ZNK4sycl3_V16detail16AccessorBaseHost14getMemoryRangeEv@plt 405fa6: 48 89 df mov %rbx,%rdi 405fa9: e8 82 d3 ff ff callq 403330 _ZNK4sycl3_V16detail16AccessorBaseHost9getOffsetEv@plt 405fae: 4c 8b 20 mov (%rax),%r12 405fb1: 48 89 df mov %rbx,%rdi 405fb4: e8 37 d1 ff ff callq 4030f0 _ZNK4sycl3_V16detail16AccessorBaseHost6getPtrEv@plt 405fb9: 4a 8d 04 a0 lea (%rax,%r12,4),%rax 405fbd: f3 0f 10 44 24 04 movss 0x4(%rsp),%xmm0 405fc3: f3 42 0f 11 04 b8 movss %xmm0,(%rax,%r15,4) 405fc9: 49 ff c7 inc %r15 405fcc: 4d 3b 3e cmp (%r14),%r15 405fcf: 73 22 jae 405ff3 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x83> 405fd1: 4d 85 ff test %r15,%r15 405fd4: 79 ba jns 405f90 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x20> 405fd6: 4c 89 f8 mov %r15,%rax 405fd9: 48 d1 e8 shr %rax 405fdc: 44 89 f9 mov %r15d,%ecx 405fdf: 83 e1 01 and $0x1,%ecx 405fe2: 48 09 c1 or %rax,%rcx 405fe5: 0f 57 c0 xorps %xmm0,%xmm0 405fe8: f3 48 0f 2a c1 cvtsi2ss %rcx,%xmm0 405fed: f3 0f 58 c0 addss %xmm0,%xmm0 405ff1: eb a5 jmp 405f98 <_ZNSt17_Function_handlerIFvvEZZ13foo_host_taskRN4sycl3_V15queueERNS2_6bufferIfLi1ENS2_6detail17aligned_allocatorIfEEvEEmENKUlRNS2_7handlerEE_clESC_EUlvE_E9_M_invokeERKSt9_Any_data+0x28> 405ff3: 48 83 c4 08 add $0x8,%rsp 405ff7: 5b pop %rbx 405ff8: 41 5c pop %r12 405ffa: 41 5e pop %r14 405ffc: 41 5f pop %r15 405ffe: c3 retq 405fff: 90 nop
0000000000408850 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESA_EUlvE_.NativeCPUKernel.SYCLNCPU>: 408850: 48 8b 02 mov (%rdx),%rax 408853: 48 8d 04 86 lea (%rsi,%rax,4),%rax 408857: 31 c9 xor %ecx,%ecx 408859: eb 18 jmp 408873 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESAEUlvE.NativeCPUKernel.SYCLNCPU+0x23> 40885b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) 408860: 0f 57 c0 xorps %xmm0,%xmm0 408863: f3 48 0f 2a c1 cvtsi2ss %rcx,%xmm0 408868: f3 0f 11 00 movss %xmm0,(%rax) 40886c: 48 ff c1 inc %rcx 40886f: 48 83 c0 04 add $0x4,%rax 408873: 48 39 f9 cmp %rdi,%rcx 408876: 73 21 jae 408899 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESAEUlvE.NativeCPUKernel.SYCLNCPU+0x49> 408878: 48 85 c9 test %rcx,%rcx 40887b: 79 e3 jns 408860 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESAEUlvE.NativeCPUKernel.SYCLNCPU+0x10> 40887d: 48 89 ca mov %rcx,%rdx 408880: 48 d1 ea shr %rdx 408883: 89 ce mov %ecx,%esi 408885: 83 e6 01 and $0x1,%esi 408888: 48 09 d6 or %rdx,%rsi 40888b: 0f 57 c0 xorps %xmm0,%xmm0 40888e: f3 48 0f 2a c6 cvtsi2ss %rsi,%xmm0 408893: f3 0f 58 c0 addss %xmm0,%xmm0 408897: eb cf jmp 408868 <_ZTSZZ15foo_single_taskRN4sycl3_V15queueERNS0_6bufferIfLi1ENS0_6detail17aligned_allocatorIfEEvEEmENKUlRNS0_7handlerEE_clESAEUlvE.NativeCPUKernel.SYCLNCPU+0x18> 408899: c3 retq 40889a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)