oneapi-src / oneDNN

oneAPI Deep Neural Network Library (oneDNN)
https://uxlfoundation.org
Apache License 2.0

Assertion `dynamic_cast<derived_type>(base) == base' failed #1901

Closed. wangzy0327 closed this issue 5 months ago.

wangzy0327 commented 6 months ago

Summary

I tried to run the matmul primitive, following the official example.

It failed with the following assertion:

```
derived_type dnnl::impl::utils::downcast(base_type *)
[derived_type = dnnl::impl::sycl::sycl_buffer_memory_storage_t *, base_type = dnnl::impl::memory_storage_t]:
Assertion `dynamic_cast<derived_type>(base) == base' failed.
Aborted (core dumped)
```

Version

oneDNN version: v3.2 (githash 04b180b)
SYCL compiler: 2022-06 release (githash 4043dda)
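
As a side note, the version and runtimes the binary actually links against can be confirmed at run time with `dnnl_version()` from the C API. The snippet below is illustrative and not part of the original report.

```
// Illustrative check: print the oneDNN version and runtime configuration
// of the library this binary is linked against.
#include <cstdio>
#include "oneapi/dnnl/dnnl.h"

int main() {
    const dnnl_version_t *v = dnnl_version();
    std::printf("oneDNN %d.%d.%d (hash %s), cpu_runtime=%u, gpu_runtime=%u\n",
            v->major, v->minor, v->patch, v->hash,
            (unsigned)v->cpu_runtime, (unsigned)v->gpu_runtime);
    return 0;
}
```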

Environment

Steps to reproduce

The reproducer program is as follows.

matmul.cpp
```
/*******************************************************************************
* Copyright 2020-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <algorithm>
#include <cmath>
#include <iostream>
#include <string>
#include <vector>

#include "example_utils.hpp"
#include "oneapi/dnnl/dnnl.hpp"

// compile
// clang++ -fsycl -fsycl-targets=mlisa-cambricon-bang matmul.cpp -o matmul.out -ldnnl
// execute
// ./matmul.out gpu

using namespace dnnl;

using tag = memory::format_tag;
using dt = memory::data_type;

void matmul_example(dnnl::engine::kind engine_kind) {
    // Create execution dnnl::engine.
    dnnl::engine engine(engine_kind, 0);

    // Create dnnl::stream.
    dnnl::stream engine_stream(engine);

    // Tensor dimensions.
    const memory::dim MB = 3, // batch size
            M = 128, K = 256, N = 512;

    // Source (src), weights, bias, and destination (dst) tensors dimensions.
    memory::dims src_dims = {MB, M, K};
    memory::dims weights_dims = {MB, K, N};
    memory::dims bias_dims = {1, 1, N};
    memory::dims dst_dims = {MB, M, N};

    // Allocate buffers.
    std::vector<float> src_data(product(src_dims));
    std::vector<float> weights_data(product(weights_dims));
    std::vector<float> bias_data(product(bias_dims));
    std::vector<float> dst_data(product(dst_dims));

    // Initialize src, weights, bias.
    std::generate(src_data.begin(), src_data.end(), []() {
        static int i = 0;
        return std::cos(i++ / 10.f);
    });
    std::generate(weights_data.begin(), weights_data.end(), []() {
        static int i = 0;
        return std::sin(i++ * 2.f);
    });
    std::generate(bias_data.begin(), bias_data.end(), []() {
        static int i = 0;
        return std::tanh(float(i++));
    });

    sycl::buffer<float, 3> buffer_src(
            static_cast<float *>(src_data.data()), sycl::range<3>(MB, M, K));
    sycl::buffer<float, 3> buffer_weights(
            static_cast<float *>(weights_data.data()), sycl::range<3>(MB, K, N));
    sycl::buffer<float, 3> buffer_bias(
            static_cast<float *>(bias_data.data()), sycl::range<3>(1, 1, N));
    sycl::buffer<float, 3> buffer_dst(
            static_cast<float *>(dst_data.data()), sycl::range<3>(MB, M, N));

    // Create memory descriptors and memory objects for src, weights, bias, and
    // dst.
    auto src_md = memory::desc(src_dims, dt::f32, tag::abc);
    auto weights_md = memory::desc(weights_dims, dt::f32, tag::abc);
    auto bias_md = memory::desc(bias_dims, dt::f32, tag::abc);
    auto dst_md = memory::desc(dst_dims, dt::f32, tag::abc);

    auto src_mem = memory(src_md, engine);
    auto weights_mem = memory(weights_md, engine);
    auto bias_mem = memory(bias_md, engine);
    auto dst_mem = memory(dst_md, engine);

    // auto src_mem = dnnl::sycl_interop::make_memory(src_md, engine, buffer_src);
    // auto weights_mem = dnnl::sycl_interop::make_memory(weights_md, engine, buffer_weights);
    // auto bias_mem = dnnl::sycl_interop::make_memory(bias_md, engine, buffer_bias);
    // auto dst_mem = dnnl::sycl_interop::make_memory(dst_md, engine, buffer_dst);

    // Write data to memory object's handles.
    write_to_dnnl_memory(src_data.data(), src_mem);
    write_to_dnnl_memory(weights_data.data(), weights_mem);
    write_to_dnnl_memory(bias_data.data(), bias_mem);

    // Create primitive post-ops (ReLU).
    // const float alpha = 0.f;
    // const float beta = 0.f;
    // post_ops matmul_ops;
    // matmul_ops.append_eltwise(algorithm::eltwise_relu, alpha, beta);
    // primitive_attr matmul_attr;
    // matmul_attr.set_post_ops(matmul_ops);

    auto matmul_d = matmul::desc(src_md, weights_md, bias_md, dst_md);

    // Create primitive descriptor.
    // auto matmul_pd = matmul::primitive_desc(
    //         engine, src_md, weights_md, bias_md, dst_md, matmul_attr);
    auto matmul_pd = matmul::primitive_desc(matmul_d, engine);

    // Create the primitive.
    auto matmul_prim = matmul(matmul_pd);

    // Primitive arguments.
    std::unordered_map<int, memory> matmul_args;
    matmul_args.insert({DNNL_ARG_SRC, src_mem});
    matmul_args.insert({DNNL_ARG_WEIGHTS, weights_mem});
    matmul_args.insert({DNNL_ARG_BIAS, bias_mem});
    matmul_args.insert({DNNL_ARG_DST, dst_mem});

    // Primitive execution: matrix multiplication with ReLU.
    matmul_prim.execute(engine_stream, matmul_args);

    // Wait for the computation to finalize.
    engine_stream.wait();

    // Read data from memory object's handle.
    read_from_dnnl_memory(dst_data.data(), dst_mem);
}

int main(int argc, char **argv) {
    return handle_example_errors(matmul_example, parse_engine_kind(argc, argv));
}
```
example_utils.hpp
```
/*******************************************************************************
* Copyright 2019-2021 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef EXAMPLE_UTILS_HPP
#define EXAMPLE_UTILS_HPP

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>

#include "dnnl.hpp"
#include "dnnl_debug.h"

#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "dnnl_ocl.hpp"
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
#include "dnnl_sycl.hpp"
#endif

#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP

#ifdef _MSC_VER
#define PRAGMA_MACRo(x) __pragma(x)
#define PRAGMA_MACRO(x) PRAGMA_MACRo(x)
#else
#define PRAGMA_MACRo(x) _Pragma(#x)
#define PRAGMA_MACRO(x) PRAGMA_MACRo(x)
#endif

// MSVC doesn't support collapse clause in omp parallel
#if defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#define collapse(x)
#endif

#define PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(n) PRAGMA_MACRO(omp parallel for collapse(n))
#else // DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
#define PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(n)
#endif

dnnl::engine::kind validate_engine_kind(dnnl::engine::kind akind) {
    // Checking if a GPU exists on the machine
    if (akind == dnnl::engine::kind::gpu) {
        if (dnnl::engine::get_count(dnnl::engine::kind::gpu) == 0) {
            std::cout << "Application couldn't find GPU, please run with CPU "
                         "instead.\n";
            exit(0);
        }
    }
    return akind;
}

// Exception class to indicate that the example uses a feature that is not
// available on the current systems. It is not treated as an error then, but
// just notifies a user.
struct example_allows_unimplemented : public std::exception {
    example_allows_unimplemented(const char *message) noexcept
        : message(message) {}
    const char *what() const noexcept override { return message; }
    const char *message;
};

inline const char *engine_kind2str_upper(dnnl::engine::kind kind);

// Runs example function with signature void() and catches errors.
// Returns `0` on success, `1` on oneDNN error, and `2` on example error.
inline int handle_example_errors(
        std::initializer_list<dnnl::engine::kind> engine_kinds,
        std::function<void()> example) {
    int exit_code = 0;

    try {
        example();
    } catch (example_allows_unimplemented &e) {
        std::cout << e.message << std::endl;
        exit_code = 0;
    } catch (dnnl::error &e) {
        std::cout << "oneDNN error caught: " << std::endl
                  << "\tStatus: " << dnnl_status2str(e.status) << std::endl
                  << "\tMessage: " << e.what() << std::endl;
        exit_code = 1;
    } catch (std::exception &e) {
        std::cout << "Error in the example: " << e.what() << "." << std::endl;
        exit_code = 2;
    }

    std::string engine_kind_str;
    for (auto it = engine_kinds.begin(); it != engine_kinds.end(); ++it) {
        if (it != engine_kinds.begin()) engine_kind_str += "/";
        engine_kind_str += engine_kind2str_upper(*it);
    }

    std::cout << "Example " << (exit_code ? "failed" : "passed") << " on "
              << engine_kind_str << "." << std::endl;
    return exit_code;
}

// Same as above, but for functions with signature
// void(dnnl::engine::kind engine_kind, int argc, char **argv).
inline int handle_example_errors(
        std::function<void(dnnl::engine::kind, int, char **)> example,
        dnnl::engine::kind engine_kind, int argc, char **argv) {
    return handle_example_errors(
            {engine_kind}, [&]() { example(engine_kind, argc, argv); });
}

// Same as above, but for functions with signature void(dnnl::engine::kind).
inline int handle_example_errors(
        std::function<void(dnnl::engine::kind)> example,
        dnnl::engine::kind engine_kind) {
    return handle_example_errors(
            {engine_kind}, [&]() { example(engine_kind); });
}

inline dnnl::engine::kind parse_engine_kind(
        int argc, char **argv, int extra_args = 0) {
    // Returns default engine kind, i.e. CPU, if none given
    if (argc == 1) {
        return validate_engine_kind(dnnl::engine::kind::cpu);
    } else if (argc <= extra_args + 2) {
        std::string engine_kind_str = argv[1];
        // Checking the engine type, i.e. CPU or GPU
        if (engine_kind_str == "cpu") {
            return validate_engine_kind(dnnl::engine::kind::cpu);
        } else if (engine_kind_str == "gpu") {
            return validate_engine_kind(dnnl::engine::kind::gpu);
        }
    }

    // If all above fails, the example should be run properly
    std::cout << "Inappropriate engine kind." << std::endl
              << "Please run the example like this: " << argv[0] << " [cpu|gpu]"
              << (extra_args ? " [extra arguments]" : "") << "." << std::endl;
    exit(1);
}

inline const char *engine_kind2str_upper(dnnl::engine::kind kind) {
    if (kind == dnnl::engine::kind::cpu) return "CPU";
    if (kind == dnnl::engine::kind::gpu) return "GPU";
    assert(!"not expected");
    return "<Unknown engine>";
}

inline dnnl::memory::dim product(const dnnl::memory::dims &dims) {
    return std::accumulate(dims.begin(), dims.end(), (dnnl::memory::dim)1,
            std::multiplies<dnnl::memory::dim>());
}

// Read from memory, write to handle
inline void read_from_dnnl_memory(void *handle, dnnl::memory &mem) {
    dnnl::engine eng = mem.get_engine();
    size_t size = mem.get_desc().get_size();

    if (!handle) throw std::runtime_error("handle is nullptr.");

#ifdef DNNL_WITH_SYCL
    bool is_cpu_sycl = (DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL
            && eng.get_kind() == dnnl::engine::kind::cpu);
    bool is_gpu_sycl = (DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
            && eng.get_kind() == dnnl::engine::kind::gpu);
    if (is_cpu_sycl || is_gpu_sycl) {
        auto mkind = dnnl::sycl_interop::get_memory_kind(mem);
        if (mkind == dnnl::sycl_interop::memory_kind::buffer) {
            auto buffer = dnnl::sycl_interop::get_buffer<uint8_t>(mem);
            auto src = buffer.get_access<::sycl::access::mode::read>();
            uint8_t *src_ptr = src.get_pointer();
            if (!src_ptr)
                throw std::runtime_error("get_pointer returned nullptr.");
            for (size_t i = 0; i < size; ++i)
                ((uint8_t *)handle)[i] = src_ptr[i];
        } else {
            assert(mkind == dnnl::sycl_interop::memory_kind::usm);
            uint8_t *src_ptr = (uint8_t *)mem.get_data_handle();
            if (!src_ptr)
                throw std::runtime_error("get_data_handle returned nullptr.");
            if (is_cpu_sycl) {
                for (size_t i = 0; i < size; ++i)
                    ((uint8_t *)handle)[i] = src_ptr[i];
            } else {
                auto sycl_queue
                        = dnnl::sycl_interop::get_queue(dnnl::stream(eng));
                sycl_queue.memcpy(handle, src_ptr, size).wait();
            }
        }
        return;
    }
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
    if (eng.get_kind() == dnnl::engine::kind::gpu) {
        void *mapped_ptr = mem.map_data();
        if (mapped_ptr) std::memcpy(handle, mapped_ptr, size);
        mem.unmap_data(mapped_ptr);
        return;
    }
#endif

    if (eng.get_kind() == dnnl::engine::kind::cpu) {
        uint8_t *src = static_cast<uint8_t *>(mem.get_data_handle());
        if (!src) throw std::runtime_error("get_data_handle returned nullptr.");
        for (size_t i = 0; i < size; ++i)
            ((uint8_t *)handle)[i] = src[i];
        return;
    }

    assert(!"not expected");
}

// Read from handle, write to memory
inline void write_to_dnnl_memory(void *handle, dnnl::memory &mem) {
    dnnl::engine eng = mem.get_engine();
    size_t size = mem.get_desc().get_size();

    if (!handle) throw std::runtime_error("handle is nullptr.");

#ifdef DNNL_WITH_SYCL
    bool is_cpu_sycl = (DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL
            && eng.get_kind() == dnnl::engine::kind::cpu);
    bool is_gpu_sycl = (DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
            && eng.get_kind() == dnnl::engine::kind::gpu);
    if (is_cpu_sycl || is_gpu_sycl) {
        auto mkind = dnnl::sycl_interop::get_memory_kind(mem);
        if (mkind == dnnl::sycl_interop::memory_kind::buffer) {
            auto buffer = dnnl::sycl_interop::get_buffer<uint8_t>(mem);
            auto dst = buffer.get_access<::sycl::access::mode::write>();
            uint8_t *dst_ptr = dst.get_pointer();
            if (!dst_ptr)
                throw std::runtime_error("get_pointer returned nullptr.");
            for (size_t i = 0; i < size; ++i)
                dst_ptr[i] = ((uint8_t *)handle)[i];
        } else {
            assert(mkind == dnnl::sycl_interop::memory_kind::usm);
            uint8_t *dst_ptr = (uint8_t *)mem.get_data_handle();
            if (!dst_ptr)
                throw std::runtime_error("get_data_handle returned nullptr.");
            if (is_cpu_sycl) {
                for (size_t i = 0; i < size; ++i)
                    dst_ptr[i] = ((uint8_t *)handle)[i];
            } else {
                auto sycl_queue
                        = dnnl::sycl_interop::get_queue(dnnl::stream(eng));
                sycl_queue.memcpy(dst_ptr, handle, size).wait();
            }
        }
        return;
    }
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
    if (eng.get_kind() == dnnl::engine::kind::gpu) {
        void *mapped_ptr = mem.map_data();
        if (mapped_ptr) std::memcpy(mapped_ptr, handle, size);
        mem.unmap_data(mapped_ptr);
        return;
    }
#endif

    if (eng.get_kind() == dnnl::engine::kind::cpu) {
        uint8_t *dst = static_cast<uint8_t *>(mem.get_data_handle());
        if (!dst) throw std::runtime_error("get_data_handle returned nullptr.");
        for (size_t i = 0; i < size; ++i)
            dst[i] = ((uint8_t *)handle)[i];
        return;
    }

    assert(!"not expected");
}

#endif
```
Compile command:

```
/home/wzy/sycl_workspace/build-cuda-2022-06-debug/bin/clang++ \
    -g -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --offload-arch=sm_70 \
    matmul.cpp -o matmul-v32-ano.out -ldnnl
```
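
Since the assertion refers to `sycl_buffer_memory_storage_t`, one variation worth trying is to request an explicit SYCL memory kind when creating the memory objects, instead of relying on the default chosen by `memory(md, engine)`. The sketch below would replace the four `memory(..., engine)` lines in `matmul_example()` above; it assumes the failure is a buffer/USM memory-kind mismatch and is an untested suggestion, not a confirmed fix.

```
// Sketch only: let oneDNN allocate buffer-backed storage explicitly via the
// SYCL interop API. Assumes the crash comes from a buffer/USM mismatch.
#include "oneapi/dnnl/dnnl_sycl.hpp"

auto src_mem = dnnl::sycl_interop::make_memory(
        src_md, engine, dnnl::sycl_interop::memory_kind::buffer);
auto weights_mem = dnnl::sycl_interop::make_memory(
        weights_md, engine, dnnl::sycl_interop::memory_kind::buffer);
auto bias_mem = dnnl::sycl_interop::make_memory(
        bias_md, engine, dnnl::sycl_interop::memory_kind::buffer);
auto dst_mem = dnnl::sycl_interop::make_memory(
        dst_md, engine, dnnl::sycl_interop::memory_kind::buffer);

// write_to_dnnl_memory()/read_from_dnnl_memory() in example_utils.hpp handle
// both buffer- and USM-backed memory objects, so the rest of the example can
// stay unchanged.
```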

Observed behavior

The output of running the program is as follows.

```
matmul-v32.out: /home/wzy/sycl_workspace/oneDNN/src/common/utils.hpp:474: derived_type dnnl::impl::utils::downcast(base_type *) [derived_type = dnnl::impl::sycl::sycl_buffer_memory_storage_t *, base_type = dnnl::impl::memory_storage_t]: Assertion `dynamic_cast<derived_type>(base) == base' failed.
Aborted (core dumped)
```
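
For context, `dnnl::impl::utils::downcast` in `src/common/utils.hpp` is essentially a checked `static_cast`: in debug builds it asserts, via `dynamic_cast`, that the object really is of the requested derived type. A rough paraphrase (not the exact oneDNN source) is shown below; the assertion firing here means the `memory_storage_t` handed to the SYCL GPU code path is not a `sycl_buffer_memory_storage_t`, i.e. the memory object ended up with a different (for example USM) storage than that path expects.

```
#include <cassert>

// Rough paraphrase of the helper named in the assertion, not the exact source.
template <typename derived_type, typename base_type>
inline derived_type downcast(base_type *base) {
    // Debug-only sanity check: the pointed-to object must really be of
    // derived_type, otherwise this assertion fires (as in this issue).
    assert(dynamic_cast<derived_type>(base) == base);
    return static_cast<derived_type>(base);
}
```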

Expected behavior

The example should run to completion and print "Example passed on GPU." instead of aborting with this assertion.

dzarukin commented 6 months ago

Hi @wangzy0327, have you tried the latest version of oneDNN?

wangzy0327 commented 6 months ago

@dzarukin I need to pin a stable version of oneDNN, and it has to be built in debug mode. I have tried to compile oneDNN v3.2 and v3.4 in debug mode on the NVIDIA SYCL platform, but it failed, similar to issue-5980.

dzarukin commented 6 months ago

> @dzarukin I need to pin a stable version of oneDNN, and it has to be built in debug mode. I have tried to compile oneDNN v3.2 and v3.4 in debug mode on the NVIDIA SYCL platform, but it failed, similar to issue-5980.

It doesn't have to be the latest and greatest version of the compiler. Could you try an older one, please? I'd expect one from half a year ago to still be able to compile the latest oneDNN.