Vulkan: Store Vertex, Index and Storage Buffers on Device Memory

Previously, all buffer types were stored in host memory, which is visible to the GPU as well.
This is typically slow, as the data is transferred over the PCIe bus every time it is used.

Most of the time, index and vertex buffers are written once and read many times, so it makes
more sense to locate them on the GPU. Storage buffers typically require quick access as well,
since they are created for shading/compute purposes.

This PR stores vertex buffers, index buffers and storage buffers in device memory
to improve performance.
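
In sketch form, the allocation side of this boils down to steering VMA with a preferred memory
flag, and only keeping persistently mapped memory for host-visible buffers. A minimal sketch,
assuming a VMA allocator and a pre-filled VkBufferCreateInfo; variable names are illustrative,
not the exact commit code:

VmaAllocationCreateInfo vma_create_info = {};
vma_create_info.usage = VMA_MEMORY_USAGE_AUTO;
/* Prefer device-local memory for write-once/read-many buffers, host-visible memory for
 * buffers that are rewritten frequently. */
vma_create_info.preferredFlags = is_host_visible ? VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT :
                                                   VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
if (is_host_visible) {
  /* Host-visible buffers stay persistently mapped so they can be written directly. */
  vma_create_info.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT |
                          VMA_ALLOCATION_CREATE_MAPPED_BIT;
}
VkResult result = vmaCreateBuffer(
    allocator, &buffer_create_info, &vma_create_info, &vk_buffer, &allocation, nullptr);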

Uniform buffers are still located in host memory, as they can be uploaded during the binding
process. Uploading at that point can (and will) reset the graphics pipeline, triggering draw
calls that use unattached resources.
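
Device-local buffers can no longer be written through a mapped pointer. Instead, updates go
through the staging buffer added by this PR: the data is written into a host-visible buffer,
then copied to device memory with a transfer command. The pattern, roughly (mirroring the
update paths in the diff below; the free function is illustrative):

void buffer_update(VKContext &context, VKBuffer &buffer, const void *data)
{
  if (buffer.is_mapped()) {
    /* Host-visible: write straight into the persistently mapped memory. */
    buffer.update(data);
    return;
  }
  /* Device-local: fill a host-visible staging buffer, then record a buffer copy. */
  VKStagingBuffer staging_buffer(buffer, VKStagingBuffer::Direction::HostToDevice);
  staging_buffer.host_buffer_get().update(data);
  staging_buffer.copy_to_device(context);
}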

In the future this could be optimized further by:
* using different pools for allocating specific buffers, with a fallback for when buffers can
  no longer be stored on the GPU.
* storing uniform buffers in device memory.

Pull Request: https://projects.blender.org/blender/blender/pulls/115343
Jeroen Bakker 2023-11-24 13:52:48 +01:00
parent ab7505c7ce
commit d09d93febf
16 changed files with 289 additions and 42 deletions

intern/ghost/intern/GHOST_ContextVK.cc

@@ -999,7 +999,6 @@ GHOST_TSuccess GHOST_ContextVK::initializeDrawingContext()
}
extensions_device.push_back(VK_KHR_DEDICATED_ALLOCATION_EXTENSION_NAME);
extensions_device.push_back(VK_KHR_GET_MEMORY_REQUIREMENTS_2_EXTENSION_NAME);
requireExtension(extensions_available, extensions_enabled, VK_EXT_MEMORY_BUDGET_EXTENSION_NAME);
/* Enable MoltenVK required instance extensions. */
#ifdef VK_MVK_MOLTENVK_EXTENSION_NAME

source/blender/gpu/CMakeLists.txt

@@ -227,6 +227,7 @@ set(VULKAN_SRC
vulkan/vk_shader.cc
vulkan/vk_shader_interface.cc
vulkan/vk_shader_log.cc
vulkan/vk_staging_buffer.cc
vulkan/vk_state_manager.cc
vulkan/vk_storage_buffer.cc
vulkan/vk_texture.cc
@@ -266,6 +267,7 @@ set(VULKAN_SRC
vulkan/vk_shader.hh
vulkan/vk_shader_interface.hh
vulkan/vk_shader_log.hh
vulkan/vk_staging_buffer.hh
vulkan/vk_state_manager.hh
vulkan/vk_storage_buffer.hh
vulkan/vk_texture.hh

source/blender/gpu/vulkan/vk_batch.cc

@@ -26,20 +26,24 @@ void VKBatch::draw_setup()
/* Finalize graphics pipeline */
VKContext &context = *VKContext::get();
VKStateManager &state_manager = context.state_manager_get();
state_manager.apply_state();
state_manager.apply_bindings();
VKVertexAttributeObject vao;
vao.update_bindings(context, *this);
context.bind_graphics_pipeline(prim_type, vao);
/* Bind geometry resources. */
vao.bind(context);
VKIndexBuffer *index_buffer = index_buffer_get();
const bool draw_indexed = index_buffer != nullptr;
state_manager.apply_state();
state_manager.apply_bindings();
/*
* The next statements are order-dependent: VBOs and IBOs must be uploaded before resources can
* be bound. Uploading device-local buffers flushes the graphics pipeline, and already bound
* resources will be unbound.
*/
VKVertexAttributeObject vao;
vao.update_bindings(context, *this);
vao.ensure_vbos_uploaded();
if (draw_indexed) {
index_buffer->upload_data();
index_buffer->bind(context);
}
vao.bind(context);
context.bind_graphics_pipeline(prim_type, vao);
}
void VKBatch::draw(int vertex_first, int vertex_count, int instance_first, int instance_count)

source/blender/gpu/vulkan/vk_buffer.cc

@@ -28,12 +28,11 @@ static VmaAllocationCreateFlags vma_allocation_flags(GPUUsageType usage)
{
switch (usage) {
case GPU_USAGE_STATIC:
case GPU_USAGE_DEVICE_ONLY:
return 0;
case GPU_USAGE_DYNAMIC:
case GPU_USAGE_STREAM:
return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
case GPU_USAGE_DEVICE_ONLY:
return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT |
VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT;
case GPU_USAGE_FLAG_BUFFER_TEXTURE_ONLY:
break;
}
@@ -41,7 +40,21 @@ static VmaAllocationCreateFlags vma_allocation_flags(GPUUsageType usage)
return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
}
bool VKBuffer::create(int64_t size_in_bytes, GPUUsageType usage, VkBufferUsageFlags buffer_usage)
static VkMemoryPropertyFlags vma_preferred_flags(const bool is_host_visible)
{
return is_host_visible ? VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT :
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
}
/*
* TODO: Check which memory is selected and adjust the creation flags to add mapping. This way
* the staging buffer can be skipped, or, in the case of a vertex buffer, an intermediate
* buffer can be removed.
*/
bool VKBuffer::create(int64_t size_in_bytes,
GPUUsageType usage,
VkBufferUsageFlags buffer_usage,
const bool is_host_visible)
{
BLI_assert(!is_allocated());
BLI_assert(vk_buffer_ == VK_NULL_HANDLE);
@@ -70,6 +83,7 @@ bool VKBuffer::create(int64_t size_in_bytes, GPUUsageType usage, VkBufferUsageFl
VmaAllocationCreateInfo vma_create_info = {};
vma_create_info.flags = vma_allocation_flags(usage);
vma_create_info.priority = 1.0f;
vma_create_info.preferredFlags = vma_preferred_flags(is_host_visible);
vma_create_info.usage = VMA_MEMORY_USAGE_AUTO;
VkResult result = vmaCreateBuffer(
@@ -78,8 +92,10 @@ bool VKBuffer::create(int64_t size_in_bytes, GPUUsageType usage, VkBufferUsageFl
return false;
}
/* All buffers are mapped to virtual memory. */
return map();
if (is_host_visible) {
return map();
}
return true;
}
void VKBuffer::update(const void *data) const

source/blender/gpu/vulkan/vk_buffer.hh

@@ -31,8 +31,10 @@ class VKBuffer {
/** Has this buffer been allocated? */
bool is_allocated() const;
bool create(int64_t size, GPUUsageType usage, VkBufferUsageFlags buffer_usage);
bool create(int64_t size,
GPUUsageType usage,
VkBufferUsageFlags buffer_usage,
bool is_host_visible = true);
void clear(VKContext &context, uint32_t clear_value);
void update(const void *data) const;
void flush() const;
@@ -56,9 +58,13 @@ class VKBuffer {
*/
void *mapped_memory_get() const;
/**
* Is this buffer mapped (visible on host)?
*/
bool is_mapped() const;
private:
/** Check if this buffer is mapped. */
bool is_mapped() const;
bool map();
void unmap();
};

source/blender/gpu/vulkan/vk_command_buffers.cc

@@ -415,7 +415,9 @@ void VKCommandBuffers::copy(VKTexture &dst_texture,
command_buffer.command_recorded();
}
void VKCommandBuffers::copy(VKBuffer &dst_buffer, VkBuffer src_buffer, Span<VkBufferCopy> regions)
void VKCommandBuffers::copy(const VKBuffer &dst_buffer,
VkBuffer src_buffer,
Span<VkBufferCopy> regions)
{
VKCommandBuffer &command_buffer = command_buffer_get(Type::DataTransferCompute);
vkCmdCopyBuffer(command_buffer.vk_command_buffer(),

source/blender/gpu/vulkan/vk_command_buffers.hh

@@ -95,7 +95,7 @@ class VKCommandBuffers : public NonCopyable, NonMovable {
void copy(VKBuffer &dst_buffer, VKTexture &src_texture, Span<VkBufferImageCopy> regions);
void copy(VKTexture &dst_texture, VKBuffer &src_buffer, Span<VkBufferImageCopy> regions);
void copy(VKTexture &dst_texture, VKTexture &src_texture, Span<VkImageCopy> regions);
void copy(VKBuffer &dst_buffer, VkBuffer src_buffer, Span<VkBufferCopy> regions);
void copy(const VKBuffer &dst_buffer, VkBuffer src_buffer, Span<VkBufferCopy> regions);
void blit(VKTexture &dst_texture, VKTexture &src_texture, Span<VkImageBlit> regions);
void blit(VKTexture &dst_texture,
VkImageLayout dst_layout,

source/blender/gpu/vulkan/vk_index_buffer.cc

@@ -9,6 +9,7 @@
#include "vk_index_buffer.hh"
#include "vk_shader.hh"
#include "vk_shader_interface.hh"
#include "vk_staging_buffer.hh"
#include "vk_state_manager.hh"
namespace blender::gpu {
@@ -24,10 +25,15 @@ void VKIndexBuffer::ensure_updated()
allocate();
}
if (data_ != nullptr) {
buffer_.update(data_);
MEM_SAFE_FREE(data_);
if (data_ == nullptr) {
return;
}
VKContext &context = *VKContext::get();
VKStagingBuffer staging_buffer(buffer_, VKStagingBuffer::Direction::HostToDevice);
staging_buffer.host_buffer_get().update(data_);
staging_buffer.copy_to_device(context);
MEM_SAFE_FREE(data_);
}
void VKIndexBuffer::upload_data()
@@ -65,9 +71,9 @@ void VKIndexBuffer::bind(int binding,
void VKIndexBuffer::read(uint32_t *data) const
{
VKContext &context = *VKContext::get();
context.flush();
buffer_.read(data);
VKStagingBuffer staging_buffer(buffer_, VKStagingBuffer::Direction::DeviceToHost);
staging_buffer.copy_from_device(context);
staging_buffer.host_buffer_get().read(data);
}
void VKIndexBuffer::update_sub(uint /*start*/, uint /*len*/, const void * /*data*/)
@@ -83,8 +89,11 @@ void VKIndexBuffer::strip_restart_indices()
void VKIndexBuffer::allocate()
{
GPUUsageType usage = data_ == nullptr ? GPU_USAGE_DEVICE_ONLY : GPU_USAGE_STATIC;
buffer_.create(
size_get(), usage, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT);
buffer_.create(size_get(),
usage,
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
false);
debug::object_label(buffer_.vk_handle(), "IndexBuffer");
}

source/blender/gpu/vulkan/vk_staging_buffer.cc

@@ -0,0 +1,57 @@
/* SPDX-FileCopyrightText: 2023 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/** \file
* \ingroup gpu
*/
#include "vk_staging_buffer.hh"
#include "vk_command_buffers.hh"
#include "vk_context.hh"
namespace blender::gpu {
VKStagingBuffer::VKStagingBuffer(const VKBuffer &device_buffer, Direction direction)
: device_buffer_(device_buffer)
{
VkBufferUsageFlags usage;
switch (direction) {
case Direction::HostToDevice:
usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
break;
case Direction::DeviceToHost:
usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;
}
host_buffer_.create(device_buffer.size_in_bytes(), GPU_USAGE_STREAM, usage, true);
}
void VKStagingBuffer::copy_to_device(VKContext &context)
{
BLI_assert(host_buffer_.is_allocated() && host_buffer_.is_mapped());
VkBufferCopy buffer_copy = {};
buffer_copy.size = device_buffer_.size_in_bytes();
VKCommandBuffers &command_buffers = context.command_buffers_get();
command_buffers.copy(
device_buffer_, host_buffer_.vk_handle(), Span<VkBufferCopy>(&buffer_copy, 1));
command_buffers.submit();
}
void VKStagingBuffer::copy_from_device(VKContext &context)
{
BLI_assert(host_buffer_.is_allocated() && host_buffer_.is_mapped());
VkBufferCopy buffer_copy = {};
buffer_copy.size = device_buffer_.size_in_bytes();
VKCommandBuffers &command_buffers = context.command_buffers_get();
command_buffers.copy(
host_buffer_, device_buffer_.vk_handle(), Span<VkBufferCopy>(&buffer_copy, 1));
command_buffers.submit();
}
void VKStagingBuffer::free()
{
host_buffer_.free();
}
} // namespace blender::gpu

source/blender/gpu/vulkan/vk_staging_buffer.hh

@@ -0,0 +1,76 @@
/* SPDX-FileCopyrightText: 2023 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/** \file
* \ingroup gpu
*/
#pragma once
#include "vk_buffer.hh"
#include "vk_common.hh"
namespace blender::gpu {
/**
* Utility class to copy data from host to device and vice versa.
*
* This is common, as buffers in device memory are more performant than buffers located in host
* memory.
*/
class VKStagingBuffer {
public:
/**
* Direction of the transfer.
*/
enum class Direction {
/**
* Transferring data from host to device.
*/
HostToDevice,
/**
* Transferring data from device to host.
*/
DeviceToHost,
};
private:
/**
* Reference to the device buffer.
*/
const VKBuffer &device_buffer_;
/**
* The temporary buffer on host for the transfer. Also called the staging buffer.
*/
VKBuffer host_buffer_;
public:
VKStagingBuffer(const VKBuffer &device_buffer, Direction direction);
/**
* Copy the content of the host buffer to the device buffer.
*/
void copy_to_device(VKContext &context);
/**
* Copy the content of the device buffer to the host buffer.
*/
void copy_from_device(VKContext &context);
/**
* Get the reference to the host buffer to update/load the data.
*/
const VKBuffer &host_buffer_get() const
{
return host_buffer_;
}
/**
* Free the host memory.
*
* Use this when a reference to the staging buffer is kept, but the host resource isn't needed anymore.
*/
void free();
};
} // namespace blender::gpu
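
Readbacks follow the same shape in the opposite direction, as used by VKIndexBuffer::read and
VKStorageBuffer::read below. A sketch; device_buffer and data are assumed to be in scope:

/* Read a device-local buffer back into host memory via the staging buffer. */
VKContext &context = *VKContext::get();
VKStagingBuffer staging_buffer(device_buffer, VKStagingBuffer::Direction::DeviceToHost);
staging_buffer.copy_from_device(context);
staging_buffer.host_buffer_get().read(data);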

source/blender/gpu/vulkan/vk_storage_buffer.cc

@@ -7,6 +7,7 @@
*/
#include "vk_shader.hh"
#include "vk_shader_interface.hh"
#include "vk_staging_buffer.hh"
#include "vk_state_manager.hh"
#include "vk_vertex_buffer.hh"
@@ -21,8 +22,11 @@ VKStorageBuffer::VKStorageBuffer(int size, GPUUsageType usage, const char *name)
void VKStorageBuffer::update(const void *data)
{
VKContext &context = *VKContext::get();
ensure_allocated();
buffer_.update(data);
VKStagingBuffer staging_buffer(buffer_, VKStagingBuffer::Direction::HostToDevice);
staging_buffer.host_buffer_get().update(data);
staging_buffer.copy_to_device(context);
}
void VKStorageBuffer::ensure_allocated()
@@ -34,10 +38,12 @@ void VKStorageBuffer::ensure_allocated()
void VKStorageBuffer::allocate()
{
const bool is_host_visible = false;
buffer_.create(size_in_bytes_,
usage_,
VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT);
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
is_host_visible);
debug::object_label(buffer_.vk_handle(), name_);
}
@@ -104,7 +110,9 @@ void VKStorageBuffer::read(void *data)
VKContext &context = *VKContext::get();
context.flush();
buffer_.read(data);
VKStagingBuffer staging_buffer(buffer_, VKStagingBuffer::Direction::DeviceToHost);
staging_buffer.copy_from_device(context);
staging_buffer.host_buffer_get().read(data);
}
} // namespace blender::gpu

source/blender/gpu/vulkan/vk_uniform_buffer.cc

@@ -10,6 +10,7 @@
#include "vk_context.hh"
#include "vk_shader.hh"
#include "vk_shader_interface.hh"
#include "vk_staging_buffer.hh"
#include "vk_state_manager.hh"
namespace blender::gpu {
@@ -19,15 +20,30 @@ void VKUniformBuffer::update(const void *data)
if (!buffer_.is_allocated()) {
allocate();
}
buffer_.update(data);
VKContext &context = *VKContext::get();
if (buffer_.is_mapped()) {
buffer_.update(data);
}
else {
VKStagingBuffer staging_buffer(buffer_, VKStagingBuffer::Direction::HostToDevice);
staging_buffer.host_buffer_get().update(data);
staging_buffer.copy_to_device(context);
}
}
void VKUniformBuffer::allocate()
{
/*
* TODO: make uniform buffers device-local. In order to do that we should remove the upload
* during binding, as that will reset the graphics pipeline and already attached resources would
* not be bound anymore.
*/
const bool is_host_visible = true;
buffer_.create(size_in_bytes_,
GPU_USAGE_STATIC,
VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT);
VK_BUFFER_USAGE_TRANSFER_DST_BIT,
is_host_visible);
debug::object_label(buffer_.vk_handle(), name_);
}

source/blender/gpu/vulkan/vk_vertex_attribute_object.cc

@@ -114,6 +114,15 @@ void VKVertexAttributeObject::bind_buffers(VKContext &context)
}
}
void VKVertexAttributeObject::ensure_vbos_uploaded() const
{
for (VKVertexBuffer *vbo : vbos) {
if (vbo) {
vbo->upload();
}
}
}
/** \} */
/* -------------------------------------------------------------------- */

source/blender/gpu/vulkan/vk_vertex_attribute_object.hh

@@ -47,6 +47,14 @@ class VKVertexAttributeObject {
void update_bindings(const VKContext &context, VKBatch &batch);
void update_bindings(VKImmediate &immediate);
/**
* Ensure that all Vertex Buffers are uploaded to the GPU.
*
* This is a separate step, as uploading could flush the graphics pipeline, making the state
* inconsistent.
*/
void ensure_vbos_uploaded() const;
void debug_print() const;
private:

source/blender/gpu/vulkan/vk_vertex_buffer.cc

@@ -12,6 +12,7 @@
#include "vk_memory.hh"
#include "vk_shader.hh"
#include "vk_shader_interface.hh"
#include "vk_staging_buffer.hh"
#include "vk_state_manager.hh"
#include "vk_vertex_buffer.hh"
@@ -92,7 +93,14 @@ void VKVertexBuffer::read(void *data) const
{
VKContext &context = *VKContext::get();
context.flush();
buffer_.read(data);
if (buffer_.is_mapped()) {
buffer_.read(data);
return;
}
VKStagingBuffer staging_buffer(buffer_, VKStagingBuffer::Direction::DeviceToHost);
staging_buffer.copy_from_device(context);
staging_buffer.host_buffer_get().read(data);
}
void VKVertexBuffer::acquire_data()
@@ -128,6 +136,25 @@ void VKVertexBuffer::release_data()
MEM_SAFE_FREE(data);
}
void VKVertexBuffer::upload_data_direct(const VKBuffer &host_buffer)
{
device_format_ensure();
if (vertex_format_converter.needs_conversion()) {
vertex_format_converter.convert(host_buffer.mapped_memory_get(), data, vertex_len);
host_buffer.flush();
}
else {
host_buffer.update(data);
}
}
void VKVertexBuffer::upload_data_via_staging_buffer(VKContext &context)
{
VKStagingBuffer staging_buffer(buffer_, VKStagingBuffer::Direction::HostToDevice);
upload_data_direct(staging_buffer.host_buffer_get());
staging_buffer.copy_to_device(context);
}
void VKVertexBuffer::upload_data()
{
if (!buffer_.is_allocated()) {
@@ -139,12 +166,12 @@ void VKVertexBuffer::upload_data()
if (flag & GPU_VERTBUF_DATA_DIRTY) {
device_format_ensure();
if (vertex_format_converter.needs_conversion()) {
vertex_format_converter.convert(buffer_.mapped_memory_get(), data, vertex_len);
buffer_.flush();
if (buffer_.is_mapped()) {
upload_data_direct(buffer_);
}
else {
buffer_.update(data);
VKContext &context = *VKContext::get();
upload_data_via_staging_buffer(context);
}
if (usage_ == GPU_USAGE_STATIC) {
MEM_SAFE_FREE(data);
@@ -175,10 +202,15 @@ const GPUVertFormat &VKVertexBuffer::device_format_get() const
void VKVertexBuffer::allocate()
{
buffer_.create(size_alloc_get(),
usage_,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT);
const bool is_host_visible = ELEM(usage_, GPU_USAGE_DYNAMIC, GPU_USAGE_STREAM);
VkBufferUsageFlags vk_buffer_usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
if (!is_host_visible) {
vk_buffer_usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
}
buffer_.create(size_alloc_get(), usage_, vk_buffer_usage, is_host_visible);
debug::object_label(buffer_.vk_handle(), "VertexBuffer");
}

source/blender/gpu/vulkan/vk_vertex_buffer.hh

@@ -61,6 +61,9 @@ class VKVertexBuffer : public VertBuf, public VKBindableResource {
private:
void allocate();
void upload_data_direct(const VKBuffer &host_buffer);
void upload_data_via_staging_buffer(VKContext &context);
/* VKTexture requires access to `buffer_` to convert a vertex buffer to a texture. */
friend class VKTexture;
};