Describe the bug
sycl::vec "as" operation usage leads to significant performance drop. Is this expected?
Have to use reinterpret_cast workaround to get expected performance back.
To Reproduce
Test program:
# cat as.cpp
#include <sycl.hpp>
#include <chrono>
const unsigned N = 1024 * 1024;
sycl::float4 Mem[N] = { sycl::float4(0) };
// Kernel as1 execution time: 17.643s
inline sycl::float4 as1(sycl::float4 a, const uint32_t b) {
return (a.as<sycl::uint4>() & b).as<sycl::float4>();
}
// Kernel as2 execution time: 0.052s
inline sycl::float4 as2(const sycl::float4 a, const uint32_t b) {
const sycl::uint4 i = reinterpret_cast<const sycl::uint4&>(a) & b;
return reinterpret_cast<const sycl::float4&>(i);
}
typedef sycl::float4 (*FOO)(sycl::float4, uint32_t);
template<FOO foo> void do_sycl(const char* const name) {
try {
auto exception_handler = [] (sycl::exception_list exceptions) {
for (std::exception_ptr const& e : exceptions) {
try {
std::rethrow_exception(e);
} catch(sycl::exception const& e) {
std::cerr << "Caught asynchronous SYCL exception:\n" << e.what() << std::endl;
}
}
};
auto q = sycl::queue{sycl::gpu_selector_v, exception_handler};
auto bMem = sycl::buffer(Mem, sycl::range(N));
// compile kernels
auto kb_begin = std::chrono::high_resolution_clock::now();
auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
{ auto kb_end = std::chrono::high_resolution_clock::now();
auto kb_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(kb_end - kb_begin);
printf("Kernel %s compile time: %.3fs\n", name, kb_elapsed.count() * 1e-9);
}
auto begin = std::chrono::high_resolution_clock::now();
Mem[0] = 1;
q.submit([&](sycl::handler& h) {
auto mem = bMem.get_access<sycl::access::mode::read_write>(h);
h.use_kernel_bundle(kb);
h.parallel_for(sycl::range(N), [=](sycl::id<1> n) {
for (int i = 0; i != 1000*1000; ++ i)
mem[n] = foo(mem[n], 0x3F400000);
});
});
q.wait_and_throw();
{ auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Kernel %s execution time: %.3fs\n", name, elapsed.count() * 1e-9);
}
} catch (sycl::exception const& e) {
std::cerr << "Caught synchronous SYCL exception:\n" << e.what() << std::endl;
}
}
int main() {
do_sycl<as1>("as1");
if (Mem[0][0] != 0.5f) {
printf("FAILED: %f (%X)\n", Mem[0][0], *(unsigned*)Mem);
return 1;
}
do_sycl<as2>("as2");
if (Mem[0][0] != 0.5f) {
printf("FAILED: %f (%X)\n", Mem[0][0], *(unsigned*)Mem);
return 1;
}
puts("PASSED");
return 0;
}
Describe the bug sycl::vec "as" operation usage leads to significant performance drop. Is this expected? Have to use reinterpret_cast workaround to get expected performance back.
To Reproduce
Test program:
Test program output:
Environment (please complete the following information):