Write a new test which covers 'advanced data uploading' from docs

In my opinion, it would be better to have a test which covers the topic of advanced data uploading in detail. The current sample code from the docs still has the following issues:

It's commented code, so it's not really run (compiler doesn't even check if syntax is valid for example)
The example from the docs only covers the use case of a uniform buffer

I think programmers could learn a lot a test that covers most common cases (vertex, index, and uniform buffer maybe?), especially when it comes to correct barrier placement. The benefit of such a test would be that we can run it with synchronization validation layers to ensure the barriers are correct. This would give new programmers a good, safe code to use as reference.

I propose to add the following test in Tests.cpp:

static void TestAdvancedDataUploading() {
    wprintf(L"Testing advanced data uploading\n");

    auto create_buffer = [](VkDeviceSize bufferSize, VkBufferUsageFlags bufferUsage, VmaAllocationCreateFlags allocationFlags,
        VkBuffer& buffer /*out*/, VmaAllocation& alloc /*out*/, VmaAllocationInfo& allocInfo /*out*/, void* myData,
        std::size_t myDataSize) {

        TEST(myData != nullptr);
        TEST(myDataSize != 0);

        VkBufferCreateInfo bufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
        bufCreateInfo.size = bufferSize;
        bufCreateInfo.usage = bufferUsage;

        VmaAllocationCreateInfo allocCreateInfo = {};
        allocCreateInfo.usage = VMA_MEMORY_USAGE_AUTO;
        allocCreateInfo.flags = allocationFlags;

        VkResult result = vmaCreateBuffer(g_hAllocator, &bufCreateInfo, &allocCreateInfo, &buffer, &alloc, &allocInfo);
        TEST(result == VK_SUCCESS);

        VkMemoryPropertyFlags memPropFlags;
        vmaGetAllocationMemoryProperties(g_hAllocator, alloc, &memPropFlags);

        BeginSingleTimeCommands();

        if (memPropFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
            // Allocation ended up in a mappable memory and is already mapped - write to it directly.
            memcpy(allocInfo.pMappedData, myData, myDataSize);
            result = vmaFlushAllocation(g_hAllocator, alloc, 0, VK_WHOLE_SIZE);
            TEST(result == VK_SUCCESS);

            VkBufferMemoryBarrier bufMemBarrier = { VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER };
            bufMemBarrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
            bufMemBarrier.dstAccessMask = VK_ACCESS_UNIFORM_READ_BIT;
            bufMemBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            bufMemBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            bufMemBarrier.buffer = buffer;
            bufMemBarrier.offset = 0;
            bufMemBarrier.size = VK_WHOLE_SIZE;

            vkCmdPipelineBarrier(g_hTemporaryCommandBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT,
                0, 0, nullptr, 1, &bufMemBarrier, 0, nullptr);

            EndSingleTimeCommands();
        }
        else {
            // Allocation ended up in a non-mappable memory - a transfer using a staging buffer is required.
            VkBufferCreateInfo stagingBufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
            stagingBufCreateInfo.size = bufferSize;
            stagingBufCreateInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;

            VmaAllocationCreateInfo stagingAllocCreateInfo = {};
            stagingAllocCreateInfo.usage = VMA_MEMORY_USAGE_AUTO;
            stagingAllocCreateInfo.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
                VMA_ALLOCATION_CREATE_MAPPED_BIT;

            VkBuffer stagingBuf;
            VmaAllocation stagingAlloc;
            VmaAllocationInfo stagingAllocInfo;
            result = vmaCreateBuffer(g_hAllocator, &stagingBufCreateInfo, &stagingAllocCreateInfo,
                &stagingBuf, &stagingAlloc, &stagingAllocInfo);
            TEST(result == VK_SUCCESS);

            memcpy(stagingAllocInfo.pMappedData, myData, myDataSize);
            result = vmaFlushAllocation(g_hAllocator, stagingAlloc, 0, VK_WHOLE_SIZE);
            TEST(result == VK_SUCCESS);

            VkBufferMemoryBarrier bufMemBarrier = { VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER };
            bufMemBarrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
            bufMemBarrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
            bufMemBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            bufMemBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            bufMemBarrier.buffer = stagingBuf;
            bufMemBarrier.offset = 0;
            bufMemBarrier.size = VK_WHOLE_SIZE;

            vkCmdPipelineBarrier(g_hTemporaryCommandBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
                0, 0, nullptr, 1, &bufMemBarrier, 0, nullptr);

            VkBufferCopy bufCopy = {
                0, // srcOffset
                0, // dstOffset,
                myDataSize, // size
            };

            vkCmdCopyBuffer(g_hTemporaryCommandBuffer, stagingBuf, buffer, 1, &bufCopy);

            VkBufferMemoryBarrier bufMemBarrier2 = { VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER };
            bufMemBarrier2.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
            bufMemBarrier2.dstAccessMask = VK_ACCESS_UNIFORM_READ_BIT; // We created a uniform buffer
            bufMemBarrier2.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            bufMemBarrier2.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            bufMemBarrier2.buffer = buffer;
            bufMemBarrier2.offset = 0;
            bufMemBarrier2.size = VK_WHOLE_SIZE;

            vkCmdPipelineBarrier(g_hTemporaryCommandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT,
                0, 0, nullptr, 1, &bufMemBarrier2, 0, nullptr);

            EndSingleTimeCommands();

            vmaDestroyBuffer(g_hAllocator, stagingBuf, stagingAlloc);
        }
    };

    VkDeviceSize buffer_size = 65536;
    std::vector<std::uint8_t> myData(buffer_size);

    // Fill with random data
    for (std::size_t index = 0; index < buffer_size; index++) {
        myData[index] = static_cast<uint32_t>(rand());
    }

    // Create a uniform buffer
    VkBuffer uniformBuffer = VK_NULL_HANDLE;
    VmaAllocation uniformBufferAlloc{};
    VmaAllocationInfo uniformBufferAllocInfo{};
    create_buffer(buffer_size, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
        VMA_ALLOCATION_CREATE_MAPPED_BIT, uniformBuffer, uniformBufferAlloc, uniformBufferAllocInfo, myData.data(), buffer_size);

    // Create a vertex buffer
    VkBuffer vertexBuffer = VK_NULL_HANDLE;
    VmaAllocation vertexBufferAlloc{};
    VmaAllocationInfo vertexBufferAllocInfo{};
    create_buffer(buffer_size, VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, 0, vertexBuffer,
        vertexBufferAlloc, vertexBufferAllocInfo, myData.data(), buffer_size);

    // Index buffer
    VkBuffer indexBuffer = VK_NULL_HANDLE;
    VmaAllocation indexBufferAlloc{};
    VmaAllocationInfo indexBufferAllocInfo{};
    create_buffer(buffer_size, VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT, 0, indexBuffer,
        indexBufferAlloc, indexBufferAllocInfo, myData.data(), buffer_size);

    vmaDestroyBuffer(g_hAllocator, uniformBuffer, uniformBufferAlloc);
    vmaDestroyBuffer(g_hAllocator, vertexBuffer, vertexBufferAlloc);
    vmaDestroyBuffer(g_hAllocator, indexBuffer, indexBufferAlloc);
}

This code works on NVIDIA RTX 3090, AMD Ryzen™ 9 7950X, and Intel Arc A770 without validation warnings or errors for TestAdvancedDataUploading.

I like the idea of adding this test. I found few issues with the proposed code:

Major issues:

You test uniform, vertex, and index buffer, but inside the lambda function you always transition to VK_ACCESS_UNIFORM_READ_BIT.

You use flags VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT only on the first buffer and not on the other two. You forgot to use VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT. On the other hand, the first one has VK_BUFFER_USAGE_TRANSFER_DST_BIT missing, which you need to use in case "allow transfer instead" logic is hit.

Minor issues:

myData[index] = static_cast<uint32_t>(rand());

myData elements are of type uint8_t and you cast to uint32_t. Besides that, you use uint32_t without std:: prefix (which I prefer) but for the vector you use std::uint8_t.

    VkBuffer uniformBuffer = VK_NULL_HANDLE;
    VmaAllocation uniformBufferAlloc{};

You initialize one with VK_NULL_HANDLE but another one with default initialization. I recommend to be consistent and to use VK_NULL_HANDLE everywhere, like it is done in other places of the code.

buffer_size could be captured by lambda captures clause instead of parameter, and so does myData. buffer_size is passed twice unnecesarily.

Alright I will add your fixes and open a pull request.

I think it's somehow possible to label this test as a doxygen "\snippet" so it can be referenced in the docs instead of the commented code. This way, we display real code which is working in the docs and avoid unnecessary duplicates. I will look into this.

I still have some problems to understand which combination of VmaAllocationCreateFlags is correct my the use cases I want to test for. (Adding this test is also a learning process for me.)

I did not set any VmaAllocationCreateFlags for the vertex and index buffer, but I used VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT only for the uniform buffer. I though this way I get mappable memory for the uniform buffer, which could then be updated with std::memcpy if we assume its memory changes frequently.
If I specify VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT for the vertex and index buffer, I always get true in (memPropFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) (tested on RTX 3090, Intel Arc A770, and AMD Ryzen 9 7950X integrated graphics). The transfer case is never hit. But what if the data of the vertex and index buffer are constant and not updated frequently? I thought I should prefer a transfer then instead, hoping it ends up in DEVICE_LOCAL memory, avoiding mapping entirely. By specifying VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT instead of 0, I never end up in the transfer case though. This is not affected by switching from VMA_MEMORY_USAGE_AUTO to VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE in allocCreateInfo either.
Should we actually test all combinations? (the 3 buffers, all with VmaAllocationCreateFlags set to either 0 or VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, giving us 6 buffers to create in this test).
I think I should make it clear in the test for which use case I am creating the buffers, so the reader can also learn from it.
Maybe the question of which flags to use for what use case can also help to improve the docs?

It depends on what do you want to test or demonstrate with your code.

The sample code from "Advanced data uploading" documentation chapter demonstrate the usage of VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, so I assume we need to always use those. If you want to test an approach with always using a staging buffer, I recommend to write it as a separate code so that it is clearer for beginner users.

I think it doesn't make much sense to test uniform / vertex / index buffer, as they are all very similar, differ only by buffer usage and barrier dstAccessMask. Trying to test these 3 types of buffers while also trying the "always staging buffer" vs ALLOW_TRANSFER_INSTEAD at the same time introduces additional confusion, in my opinion.

I recommend to maybe go with a single usage (e.g. uniform buffer) but write 3 separate tests:

The approach that always uses staging buffer.
The approach that requires the memory to be CPU-writable VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT.
The approach with ALLOW_TRANSFER_INSTEAD and checking if(memPropFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT).

What do you think?

It depends on what do you want to test or demonstrate with your code.

The sample code from "Advanced data uploading" documentation chapter demonstrate the usage of VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, so I assume we need to always use those. If you want to test an approach with always using a staging buffer, I recommend to write it as a separate code so that it is clearer for beginner users.

I think it doesn't make much sense to test uniform / vertex / index buffer, as they are all very similar, differ only by buffer usage and barrier dstAccessMask. Trying to test these 3 types of buffers while also trying the "always staging buffer" vs ALLOW_TRANSFER_INSTEAD at the same time introduces additional confusion, in my opinion.

I recommend to maybe go with a single usage (e.g. uniform buffer) but write 3 separate tests:
1. The approach that always uses staging buffer.

2. The approach that requires the memory to be CPU-writable `VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT`.

3. The approach with `ALLOW_TRANSFER_INSTEAD` and checking `if(memPropFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)`.
What do you think?

I agree, I write the tests that way.

GPUOpen-LibrariesAndSDKs / VulkanMemoryAllocator

Write a new test which covers 'advanced data uploading' from docs #433