sycl::vec "as" operation usage leads to significant performance drop

Describe the bug sycl::vec "as" operation usage leads to significant performance drop. Is this expected? Have to use reinterpret_cast workaround to get expected performance back.

To Reproduce

Test program:

# cat as.cpp
#include <sycl.hpp>
#include <chrono>

const unsigned N = 1024 * 1024;
sycl::float4 Mem[N] = { sycl::float4(0) };

// Kernel as1 execution time: 17.643s
inline sycl::float4 as1(sycl::float4 a, const uint32_t b) {
  return (a.as<sycl::uint4>() & b).as<sycl::float4>();
}

// Kernel as2 execution time: 0.052s
inline sycl::float4 as2(const sycl::float4 a, const uint32_t b) {
  const sycl::uint4 i = reinterpret_cast<const sycl::uint4&>(a) & b;
  return reinterpret_cast<const sycl::float4&>(i);
}

typedef sycl::float4 (*FOO)(sycl::float4, uint32_t);
template<FOO foo> void do_sycl(const char* const name) {
  try {
    auto exception_handler = [] (sycl::exception_list exceptions) {
      for (std::exception_ptr const& e : exceptions) {
        try {
          std::rethrow_exception(e);
        } catch(sycl::exception const& e) {
          std::cerr << "Caught asynchronous SYCL exception:\n" << e.what() << std::endl;
        }
      }
    };
    auto q = sycl::queue{sycl::gpu_selector_v, exception_handler};

    auto bMem = sycl::buffer(Mem, sycl::range(N));

    // compile kernels
    auto kb_begin = std::chrono::high_resolution_clock::now();
    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
    { auto kb_end = std::chrono::high_resolution_clock::now();
      auto kb_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(kb_end - kb_begin);
      printf("Kernel %s compile time: %.3fs\n", name, kb_elapsed.count() * 1e-9);
    }

    auto begin = std::chrono::high_resolution_clock::now();

    Mem[0] = 1;
    q.submit([&](sycl::handler& h) {
      auto mem = bMem.get_access<sycl::access::mode::read_write>(h);
      h.use_kernel_bundle(kb);
      h.parallel_for(sycl::range(N), [=](sycl::id<1> n) {
        for (int i = 0; i != 1000*1000; ++ i)
          mem[n] = foo(mem[n], 0x3F400000);
      });
    });

    q.wait_and_throw();

    { auto end = std::chrono::high_resolution_clock::now();
      auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
      printf("Kernel %s execution time: %.3fs\n", name, elapsed.count() * 1e-9);
    }
  } catch (sycl::exception const& e) {
    std::cerr << "Caught synchronous SYCL exception:\n"  << e.what() << std::endl;
  }
}

int main() {
  do_sycl<as1>("as1");
  if (Mem[0][0] != 0.5f) {
    printf("FAILED: %f (%X)\n", Mem[0][0], *(unsigned*)Mem);
    return 1;
  }
  do_sycl<as2>("as2");
  if (Mem[0][0] != 0.5f) {
    printf("FAILED: %f (%X)\n", Mem[0][0], *(unsigned*)Mem);
    return 1;
  }
  puts("PASSED");
  return 0;
}

Test program output:

# clang++ -fsycl as.cpp && time ./a.out
Kernel as1 compile time: 0.115s
Kernel as1 execution time: 17.648s
Kernel as2 compile time: 0.000s
Kernel as2 execution time: 0.052s
PASSED

real    0m17.882s
user    0m7.578s
sys     0m10.291s

Environment (please complete the following information):

OS: Ubuntu 22.04
Target device and vendor: Intel Arc A380 GPU
DPC++ version: clang version 16.0.0 (https://github.com/intel/llvm 1299e21c883616aedd48495f77429f3d235cfaa4
Dependencies version: ghcr.io/intel/llvm/sycl_ubuntu2004_nightly:latest docker container

intel / llvm

sycl::vec "as" operation usage leads to significant performance drop #7901