Closed glebov-andrey closed 1 year ago
@glebov-andrey
Hi, Could you please provide your application test?
Thanks
Of course, here is a simplified example:
// This assumes some boilerplate setup has already been done.
// `device` has a compute queue (has COMPUTE_BIT, but not GRAPHICS_BIT; in my case family 1).
// `command_buffer` was allocated from a VkCommandPool, which was created for that queue.
// Error handling has been omitted.
void test(const VkDevice device,
const VkPhysicalDeviceMemoryProperties memory_properties,
const VkCommandBuffer command_buffer) {
const auto find_memory_type = [&](const std::uint32_t memory_type_mask,
const VkMemoryPropertyFlags required_properties) {
for (std::uint32_t idx = 0; idx != memory_properties.memoryTypeCount; ++idx) {
if (memory_type_mask & (std::uint32_t{1} << idx)) {
if ((memory_properties.memoryTypes[idx].propertyFlags & required_properties) == required_properties) {
return idx;
}
}
}
assert(!"memory type not found");
};
constexpr std::uint32_t image_extent = 16;
// For BC3_UNORM_BLOCK:
constexpr std::uint32_t block_extent = 4;
constexpr vk::DeviceSize block_size = 16;
constexpr auto block_count = (image_extent / block_extent) * (image_extent / block_extent);
constexpr auto buffer_size = block_count * block_size;
VkBuffer buffer = nullptr;
const VkBufferCreateInfo buffer_create_info = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
.flags = {},
.size = buffer_size,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr};
vkCreateBuffer(device, &buffer_create_info, nullptr, &buffer);
VkMemoryRequirements buffer_memory_requirements = {};
vkGetBufferMemoryRequirements(device, buffer, &buffer_memory_requirements);
assert(buffer_memory_requirements.alignment == 4);
const auto buffer_memory_type = find_memory_type(buffer_memory_requirements.memoryTypeBits,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
VkDeviceMemory buffer_memory = nullptr;
const VkMemoryAllocateInfo buffer_allocate_info = {
.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
.pNext = nullptr,
.allocationSize = buffer_memory_requirements.size + buffer_memory_requirements.alignment,
.memoryTypeIndex = buffer_memory_type};
vkAllocateMemory(device, &buffer_allocate_info, nullptr, &buffer_memory);
vkBindBufferMemory(device, buffer, buffer_memory, buffer_memory_requirements.alignment); // minimum valid alignment
VkImage image = nullptr;
const VkImageCreateInfo image_create_info = {.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
.pNext = nullptr,
.flags = {},
.imageType = VK_IMAGE_TYPE_2D,
.format = VK_FORMAT_BC3_UNORM_BLOCK,
.extent = {.width = image_extent, .height = image_extent, .depth = 1},
.mipLevels = 1,
.arrayLayers = 1,
.samples = VK_SAMPLE_COUNT_1_BIT,
.tiling = VK_IMAGE_TILING_OPTIMAL,
.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED};
vkCreateImage(device, &image_create_info, nullptr, &image);
VkMemoryRequirements image_memory_requirements = {};
vkGetImageMemoryRequirements(device, image, &image_memory_requirements);
const auto image_memory_type = find_memory_type(image_memory_requirements.memoryTypeBits,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
VkDeviceMemory image_memory = nullptr;
const VkMemoryAllocateInfo image_allocate_info = {.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
.pNext = nullptr,
.allocationSize = image_memory_requirements.size,
.memoryTypeIndex = image_memory_type};
vkAllocateMemory(device, &image_allocate_info, nullptr, &image_memory);
vkBindImageMemory(device, image, image_memory, 0);
const VkCommandBufferBeginInfo begin_info = {.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
.pNext = nullptr,
.flags = {},
.pInheritanceInfo = nullptr};
vkBeginCommandBuffer(command_buffer, &begin_info);
// The application actually dispatches a compute pipeline to fill the buffer, but it doesn't matter for this case.
const VkImageMemoryBarrier image_memory_barrier = {.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = {},
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = image,
.subresourceRange = {.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.baseMipLevel = 0,
.levelCount = 1,
.baseArrayLayer = 0,
.layerCount = 1}};
vkCmdPipelineBarrier(command_buffer,
VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, // edited to work without synchronization2
VK_PIPELINE_STAGE_TRANSFER_BIT,
{},
0,
nullptr,
0,
nullptr,
1,
&image_memory_barrier);
const VkBufferImageCopy copy_region = {.bufferOffset = 0,
.bufferRowLength = 0,
.bufferImageHeight = 0,
.imageSubresource = {.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.mipLevel = 0,
.baseArrayLayer = 0,
.layerCount = 1},
.imageOffset = {},
.imageExtent = image_create_info.extent};
vkCmdCopyBufferToImage(command_buffer, buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ©_region); // FAILS HERE
vkEndCommandBuffer(command_buffer);
vkDestroyImage(device, image, nullptr);
vkFreeMemory(device, image_memory, nullptr);
vkDestroyBuffer(device, buffer, nullptr);
vkFreeMemory(device, buffer_memory, nullptr);
}
@glebov-andrey Hello, I can't reproduce this issue. Could you please provide a complete test code? If possible, you can send the complete code or project to my email: robin-morris@foxmail.com
Thanks
@MrRobbin I will make a full example and post it as a Gist or something, but I'll only be able to after the 8th of January.
Until then my best guess is that it might be device-dependent and/or queue family dependent - it only seems to happen in the compute queue.
@MrRobbin The following VkInstance
and VkDevice
initialization code is enough to cause the assertion (when combined with the previously provided code):
// Minimal VkInstance/VkDevice initialization sufficient to reproduce the assertion
// when combined with the vkCmdCopyBufferToImage snippet from the previous comment.
// Error handling is omitted; asserts document the repro machine's configuration.
void test() {
// Create a bare Vulkan 1.3 instance with no layers or extensions enabled.
VkInstance instance = nullptr;
const VkApplicationInfo application_info = {.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
.pNext = nullptr,
.pApplicationName = "test-app",
.applicationVersion = VK_MAKE_API_VERSION(0, 0, 1, 0),
.pEngineName = "test-engine",
.engineVersion = VK_MAKE_API_VERSION(0, 0, 1, 0),
.apiVersion = VK_API_VERSION_1_3};
const VkInstanceCreateInfo instance_create_info = {.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
.pNext = nullptr,
.flags = {},
.pApplicationInfo = &application_info,
.enabledLayerCount = 0,
.ppEnabledLayerNames = nullptr,
.enabledExtensionCount = 0,
.ppEnabledExtensionNames = nullptr};
vkCreateInstance(&instance_create_info, nullptr, &instance);
// Pick the physical device; the repro machine exposes exactly one GPU.
std::uint32_t physical_device_count = 0;
vkEnumeratePhysicalDevices(instance, &physical_device_count, nullptr);
assert(physical_device_count == 1);
VkPhysicalDevice physical_device = nullptr;
vkEnumeratePhysicalDevices(instance, &physical_device_count, &physical_device);
// Query queue families; the repro GPU exposes exactly three.
std::uint32_t queue_family_count = 0;
vkGetPhysicalDeviceQueueFamilyProperties(physical_device, &queue_family_count, nullptr);
assert(queue_family_count == 3);
std::array<VkQueueFamilyProperties, 3> queue_family_properties = {};
vkGetPhysicalDeviceQueueFamilyProperties(physical_device, &queue_family_count, queue_family_properties.data());
// Family 0 is the GRAPHICS|COMPUTE|TRANSFER|SPARSE_BINDING family (checked below).
constexpr std::uint32_t queue_family_index = 0;
constexpr VkQueueFlags expected_queue_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT |
VK_QUEUE_SPARSE_BINDING_BIT;
assert(queue_family_properties[queue_family_index].queueFlags == expected_queue_flags);
// Memory properties are passed to the snippet from the previous comment.
VkPhysicalDeviceMemoryProperties memory_properties = {};
vkGetPhysicalDeviceMemoryProperties(physical_device, &memory_properties);
// Create a logical device with a single queue from that family and no optional features.
VkDevice device = nullptr;
constexpr auto queue_priority = 1.0f;
const VkDeviceQueueCreateInfo device_queue_create_info = {.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
.pNext = nullptr,
.flags = {},
.queueFamilyIndex = queue_family_index,
.queueCount = 1,
.pQueuePriorities = &queue_priority};
const VkPhysicalDeviceFeatures enabled_features_1_0 = {};
const VkDeviceCreateInfo device_create_info = {.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
.pNext = nullptr,
.flags = {},
.queueCreateInfoCount = 1,
.pQueueCreateInfos = &device_queue_create_info,
.enabledLayerCount = 0,
.ppEnabledLayerNames = nullptr,
.enabledExtensionCount = 0,
.ppEnabledExtensionNames = nullptr,
.pEnabledFeatures = &enabled_features_1_0};
vkCreateDevice(physical_device, &device_create_info, nullptr, &device);
// Transient command pool on the chosen family, plus one primary command buffer from it.
VkCommandPool command_pool = nullptr;
const VkCommandPoolCreateInfo command_pool_create_info = {.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
.pNext = nullptr,
.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT,
.queueFamilyIndex = queue_family_index};
vkCreateCommandPool(device, &command_pool_create_info, nullptr, &command_pool);
VkCommandBuffer command_buffer = nullptr;
const VkCommandBufferAllocateInfo command_buffer_allocate_info = {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
.pNext = nullptr,
.commandPool = command_pool,
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
.commandBufferCount = 1};
vkAllocateCommandBuffers(device, &command_buffer_allocate_info, &command_buffer);
// ... the code from my previous comment ...
// Cleanup: the command buffer is freed implicitly when its pool is destroyed.
vkDestroyCommandPool(device, command_pool, nullptr);
vkDestroyDevice(device, nullptr);
vkDestroyInstance(instance, nullptr);
}
I've confirmed that the issue is queue family dependent - the assertion only fails in a GRAPHICS
(family 0
) or COMPUTE
(family 1
) queue.
The GPU I'm using is an AMD Radeon RX 6900 XT (0x73bf)
.
I've also rebuilt AMDVLK from scratch using the current dev
branch, and the issue still exists.
Here's the full stack trace from an unoptimized debug build:
[amdvlk64.so] Pal::Gfx9::Device::Gfx10CreateTypedBufferViewSrds gfx9Device.cpp:3747
[amdvlk64.so] Pal::IDevice::CreateTypedBufferViewSrds palDevice.h:3926
[amdvlk64.so] Pal::RsrcProcMgr::CopyBetweenMemoryAndImageCs rsrcProcMgr.cpp:1235
[amdvlk64.so] Pal::RsrcProcMgr::CopyBetweenMemoryAndImage rsrcProcMgr.cpp:1304
[amdvlk64.so] Pal::RsrcProcMgr::CmdCopyMemoryToImage rsrcProcMgr.cpp:967
[amdvlk64.so] Pal::Gfx9::RsrcProcMgr::CmdCopyMemoryToImage gfx9RsrcProcMgr.cpp:3918
[amdvlk64.so] Pal::GfxCmdBuffer::CmdCopyMemoryToImage gfxCmdBuffer.cpp:358
[amdvlk64.so] vk::CmdBuffer::PalCmdCopyMemoryToImage vk_cmdbuffer.cpp:1317
[amdvlk64.so] vk::CmdBuffer::CopyBufferToImage<VkBufferImageCopy> vk_cmdbuffer.cpp:3548
[amdvlk64.so] vk::entry::vkCmdCopyBufferToImage entry.cpp:370
[libvulkan.so.1] vkCmdCopyBufferToImage 0x00007ffff7ceb41e
@glebov-andrey Thanks, I have reproduced this issue.
@glebov-andrey Hello, this assert is not necessary. Could you please try the release driver?
@MrRobbin Everything appears to work with a release build of the driver with assertions disabled.
My concern was that either this was a driver/hardware constraint which was not disclosed by the properties/limits API, or I misunderstood the specification regarding copyBufferToImage
alignment requirements.
Just to confirm, this means that the only alignment requirement for the buffer in this case is that returned by vkGetBufferMemoryRequirements
?
@glebov-andrey Sorry for the late response. Let me make a brief summary.
At first, I can't find the 'alignment requirement' in the Vulkan spec. The only thing I can find for 'vkCmdCopyBufferToImage(...)' is that 'Copy regions for the image must be aligned to a multiple of the texel block extent in each dimension, except at the edges of the image, where region extents must match the edge of the image', but this is not related to this issue in my view.
Back to the assert failed itself, here is the BufferViewInfo structure, the assert failed at '(view.gpuAddr % requiredAlignment) != 0' .
struct BufferViewInfo {
gpusize gpuAddr; // GPU memory virtual address where the buffer view starts, in bytes. Must be aligned to bytes-per-element for typed access.
…
}
In amdvlk, we use a simple solution for the alignment. We give the memory's virtual address a large alignment when it is allocated, and this seems to guarantee the assert succeeds for all formats. But your app's vkBindBufferMemory(…, buffer_memory_requirements.alignment) will destroy that alignment. The minimum memory alignment and the
view.gpuAddr's required alignment have different meanings. Sometimes, we can use vkBindBufferMemory(...)
to satisfy the assert. For example, the 'BC3_UNORM_BLOCK' block size equals 16, so the bind offset should be a multiple of 16. And we can combine vkGetBufferMemoryRequirements(...)
and the block size to determine a suitable bind offset. In this way, it can avoid assert failures.
To sum up, the Vulkan spec doesn't explicitly specify this alignment, so it doesn't matter.
Please check the fix with the 2023.Q2.1 release.
Thanks, it's working fine now.
Hi, I'm getting an assertion which appears to be incorrect while copying from a staging buffer to an image.
The application creates a buffer with a 4-byte aligned allocation. This is the alignment reported as required for
STORAGE | TRANSFER_SRC
. The buffer is filled with BC3 compressed data, and a vkCmdCopyBufferToImage
command is issued (in a compute queue). The driver then traps here because (view.gpuAddr % requiredAlignment) != 0
inside IsValidTypedBufferView
. Here's the full stack trace: requiredAlignment
here is bytesPerPixel
, which is actually more like "bytes per texel block" (for BC3 this value is 16
). 20.3. Copying Data Between Buffers and Images specifies how offsets are calculated, but unlike e.g. 22.4. Vertex Input Address Calculation, it does not seem to impose any alignment requirements on texel addresses. The only obviously applicable alignment requirement I could find in the spec. is that of a
TRANSFER_SRC
buffer. Perhaps there is also the natural alignment of image component data types, but I couldn't find any explicit mention of it (except for vertex input attributes), and I'm not sure what that would be for block compressed formats.Judging by the stack trace, buffer-to-image copies use the same machinery as texel buffers. But i couldn't find any connection between the two in the spec. Also, even for texel buffers, this alignment requirement is excessive because
minTexelBufferOffsetAlignment
reports 4
.